[cig-commits] r19213 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER: examples/homogeneous_halfspace/in_data_files examples/layered_halfspace/in_data_files examples/meshfem3D_examples/many_interfaces examples/meshfem3D_examples/simple_model examples/meshfem3D_examples/socal1D examples/meshfem3D_examples/socal1D/example_utm examples/noise_tomography/in_data_files examples/tomographic_model/in_data_files in_data_files src/cuda src/shared src/specfem3D
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Thu Nov 17 15:51:04 PST 2011
Author: danielpeter
Date: 2011-11-17 15:51:03 -0800 (Thu, 17 Nov 2011)
New Revision: 19213
Modified:
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/layered_halfspace/in_data_files/Par_file
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/many_interfaces/Par_file
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/simple_model/Par_file
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/Par_file
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/example_utm/Par_file_utm
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step1
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step2
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step3
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/tomographic_model/in_data_files/Par_file
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file.in
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/param_reader.c
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
Log:
updates device initialization
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file 2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
ATTENUATION = .false.
USE_OLSEN_ATTENUATION = .false.
ANISOTROPY = .false.
+GRAVITY = .false.
# absorbing boundary conditions for a regional simulation
ABSORBING_CONDITIONS = .false.
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/layered_halfspace/in_data_files/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/layered_halfspace/in_data_files/Par_file 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/layered_halfspace/in_data_files/Par_file 2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
ATTENUATION = .false.
USE_OLSEN_ATTENUATION = .false.
ANISOTROPY = .false.
+GRAVITY = .false.
# absorbing boundary conditions for a regional simulation
ABSORBING_CONDITIONS = .true.
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/many_interfaces/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/many_interfaces/Par_file 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/many_interfaces/Par_file 2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
ATTENUATION = .false.
USE_OLSEN_ATTENUATION = .false.
ANISOTROPY = .false.
+GRAVITY = .false.
# absorbing boundary conditions for a regional simulation
ABSORBING_CONDITIONS = .true.
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/simple_model/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/simple_model/Par_file 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/simple_model/Par_file 2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
ATTENUATION = .false.
USE_OLSEN_ATTENUATION = .false.
ANISOTROPY = .false.
+GRAVITY = .false.
# absorbing boundary conditions for a regional simulation
ABSORBING_CONDITIONS = .true.
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/Par_file 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/Par_file 2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
ATTENUATION = .false.
USE_OLSEN_ATTENUATION = .false.
ANISOTROPY = .false.
+GRAVITY = .false.
# absorbing boundary conditions for a regional simulation
ABSORBING_CONDITIONS = .true.
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/example_utm/Par_file_utm
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/example_utm/Par_file_utm 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/example_utm/Par_file_utm 2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
ATTENUATION = .false.
USE_OLSEN_ATTENUATION = .false.
ANISOTROPY = .false.
+GRAVITY = .false.
# absorbing boundary conditions for a regional simulation
ABSORBING_CONDITIONS = .true.
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step1
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step1 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step1 2011-11-17 23:51:03 UTC (rev 19213)
@@ -22,6 +22,7 @@
ATTENUATION = .false.
USE_OLSEN_ATTENUATION = .false.
ANISOTROPY = .false.
+GRAVITY = .false.
# absorbing boundary conditions for a regional simulation
ABSORBING_CONDITIONS = .true.
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step2
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step2 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step2 2011-11-17 23:51:03 UTC (rev 19213)
@@ -22,6 +22,7 @@
ATTENUATION = .false.
USE_OLSEN_ATTENUATION = .false.
ANISOTROPY = .false.
+GRAVITY = .false.
# absorbing boundary conditions for a regional simulation
ABSORBING_CONDITIONS = .true.
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step3
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step3 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step3 2011-11-17 23:51:03 UTC (rev 19213)
@@ -22,6 +22,7 @@
ATTENUATION = .false.
USE_OLSEN_ATTENUATION = .false.
ANISOTROPY = .false.
+GRAVITY = .false.
# absorbing boundary conditions for a regional simulation
ABSORBING_CONDITIONS = .true.
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/tomographic_model/in_data_files/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/tomographic_model/in_data_files/Par_file 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/tomographic_model/in_data_files/Par_file 2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
ATTENUATION = .false.
USE_OLSEN_ATTENUATION = .false.
ANISOTROPY = .false.
+GRAVITY = .false.
# absorbing boundary conditions for a regional simulation
ABSORBING_CONDITIONS = .true.
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file.in 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file.in 2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
ATTENUATION = .false.
USE_OLSEN_ATTENUATION = .false.
ANISOTROPY = .false.
+GRAVITY = .false.
# absorbing boundary conditions for a regional simulation
ABSORBING_CONDITIONS = .false.
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -368,7 +368,7 @@
realw* h_max;
int blocksize = 256;
- int num_blocks_x = ceil(mp->NGLOB_AB/blocksize);
+ int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
//printf("num_blocks_x %i \n",num_blocks_x);
h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
@@ -506,7 +506,7 @@
realw* h_max;
int blocksize = 256;
- int num_blocks_x = ceil(mp->NGLOB_AB/blocksize);
+ int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
//printf("num_blocks_x %i \n",num_blocks_x);
h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -117,7 +117,7 @@
int num_blocks_x = NSOURCES;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -171,7 +171,7 @@
int num_blocks_x = NSOURCES;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -293,7 +293,7 @@
int num_blocks_x = mp->nadj_rec_local;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -114,7 +114,7 @@
int num_blocks_x = NSOURCES;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -168,7 +168,7 @@
int num_blocks_x = NSOURCES;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -331,7 +331,7 @@
int num_blocks_x = mp->nadj_rec_local;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -133,7 +133,7 @@
int num_blocks_x = num_coupling_ac_el_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -287,7 +287,7 @@
int num_blocks_x = num_coupling_ac_el_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -88,7 +88,7 @@
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -196,7 +196,7 @@
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -546,7 +546,7 @@
int num_blocks_x = nb_blocks_to_compute;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -778,7 +778,7 @@
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
dim3 grid(num_blocks_x,num_blocks_y);
@@ -822,7 +822,7 @@
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
dim3 grid(num_blocks_x,num_blocks_y);
@@ -913,7 +913,7 @@
int num_blocks_x = mp->num_free_surface_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
dim3 grid(num_blocks_x,num_blocks_y,1);
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -102,7 +102,7 @@
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -212,7 +212,7 @@
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -1146,7 +1146,7 @@
int num_blocks_x = nb_blocks_to_compute;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -1580,7 +1580,7 @@
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -1633,7 +1633,7 @@
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -1738,7 +1738,7 @@
int num_blocks_x = mp->num_free_surface_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -119,7 +119,7 @@
int num_blocks_x = mp->NSPEC_AB;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -213,7 +213,7 @@
int num_blocks_x = mp->num_free_surface_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -429,7 +429,7 @@
int num_blocks_x = mp->NSPEC_AB;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -596,7 +596,7 @@
int num_blocks_x = mp->NSPEC_AB;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -146,7 +146,7 @@
int num_blocks_x = mp->d_num_abs_boundary_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -158,7 +158,7 @@
int num_blocks_x = mp->d_num_abs_boundary_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -98,7 +98,7 @@
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -200,7 +200,7 @@
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -138,7 +138,7 @@
int num_blocks_x = mp->num_free_surface_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
dim3 grid(num_blocks_x,num_blocks_y,1);
@@ -250,7 +250,7 @@
int num_blocks_x = mp->num_free_surface_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
dim3 grid(num_blocks_x,num_blocks_y,1);
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -156,7 +156,7 @@
get_free_memory(&free_db,&used_db,&total_db);
- sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_mem_usage_proc_%03d.txt",myrank);
+ sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_mem_usage_proc_%06d.txt",myrank);
fp = fopen(filename,"a+");
fprintf(fp,"%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
@@ -420,6 +420,123 @@
/* ----------------------------------------------------------------------------------------------- */
extern "C"
+void FC_FUNC_(prepare_cuda_device,
+ PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+ TRACE("prepare_cuda_device");
+
+ // Gets rank number of MPI process
+ int myrank = *myrank_f;
+
+ // cuda initialization (needs -lcuda library)
+ // daniel: note, cuInit initializes the driver API.
+ // it is needed for any following CUDA driver API function call (format cuFUNCTION(..) )
+ // however, for the CUDA runtime API functions (format cudaFUNCTION(..) )
+ // the initialization is implicit, thus cuInit() here would not be needed...
+ CUresult status = cuInit(0);
+ if ( CUDA_SUCCESS != status ) exit_on_error("CUDA driver API device initialization failed\n");
+
+ // returns a handle to the first cuda compute device
+ CUdevice dev;
+ status = cuDeviceGet(&dev, 0);
+ if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device not found\n");
+
+ // gets device properties
+ int major,minor;
+ status = cuDeviceComputeCapability(&major,&minor,dev);
+ if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device information not found\n");
+
+ // make sure that the device has compute capability >= 1.3
+ if (major < 1){
+ fprintf(stderr,"Compute capability major number should be at least 1, got: %d \nexiting...\n",major);
+ exit_on_error("CUDA Compute capability major number should be at least 1\n");
+ }
+ if (major == 1 && minor < 3){
+ fprintf(stderr,"Compute capability should be at least 1.3, got: %d.%d \nexiting...\n",major,minor);
+ exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
+ }
+
+ //daniel: from here on we use the runtime API ...
+ // Gets number of GPU devices
+ int device_count = 0;
+ cudaGetDeviceCount(&device_count);
+ exit_on_cuda_error("CUDA runtime cudaGetDeviceCount: check if driver and runtime libraries work together\nexiting...\n");
+
+ // returns device count to fortran
+ if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
+ *ncuda_devices = device_count;
+
+ // Sets the active device
+ if(device_count > 1) {
+ // generalized for more GPUs per node
+ cudaSetDevice( myrank % device_count );
+ exit_on_cuda_error("cudaSetDevice");
+ }
+
+ // returns a handle to the active device
+ int device;
+ cudaGetDevice(&device);
+
+ // get device properties
+ struct cudaDeviceProp deviceProp;
+ cudaGetDeviceProperties(&deviceProp,device);
+
+ // exit if the machine has no CUDA-enabled device
+ if (deviceProp.major == 9999 && deviceProp.minor == 9999){
+ fprintf(stderr,"No CUDA-enabled device found, exiting...\n\n");
+ exit_on_error("CUDA runtime error: there is no CUDA-enabled device found\n");
+ }
+
+ // outputs device infos to file
+ char filename[BUFSIZ];
+ FILE* fp;
+ sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_device_info_proc_%06d.txt",myrank);
+ fp = fopen(filename,"a+");
+
+ // display device properties
+ fprintf(fp,"Device Name = %s\n",deviceProp.name);
+ fprintf(fp,"multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
+ fprintf(fp,"totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
+ fprintf(fp,"totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
+ fprintf(fp,"sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
+ fprintf(fp,"Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
+ fprintf(fp,"Maximum size of each dimension of a block: %d x %d x %d\n",
+ deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
+ fprintf(fp,"Maximum sizes of each dimension of a grid: %d x %d x %d\n",
+ deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
+ fprintf(fp,"Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
+ if(deviceProp.canMapHostMemory){
+ fprintf(fp,"canMapHostMemory: TRUE\n");
+ }else{
+ fprintf(fp,"canMapHostMemory: FALSE\n");
+ }
+ if(deviceProp.deviceOverlap){
+ fprintf(fp,"deviceOverlap: TRUE\n");
+ }else{
+ fprintf(fp,"deviceOverlap: FALSE\n");
+ }
+
+ // make sure that the device has compute capability >= 1.3
+ //if (deviceProp.major < 1){
+ // fprintf(stderr,"Compute capability major number should be at least 1, exiting...\n\n");
+ // exit_on_error("CUDA Compute capability major number should be at least 1");
+ //}
+ //if (deviceProp.major == 1 && deviceProp.minor < 3){
+ // fprintf(stderr,"Compute capability should be at least 1.3, exiting...\n");
+ // exit_on_error("CUDA Compute capability major number should be at least 1.3");
+ //}
+
+ // outputs initial memory infos via cudaMemGetInfo()
+ double free_db,used_db,total_db;
+ get_free_memory(&free_db,&used_db,&total_db);
+ fprintf(fp,"%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank,
+ used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+ fclose(fp);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
void FC_FUNC_(prepare_constants_device,
PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
int* h_NGLLX,
@@ -453,34 +570,10 @@
int* nrec_local_f,
int* SIMULATION_TYPE,
int* USE_MESH_COLORING_GPU_f,
- int* nspec_acoustic,int* nspec_elastic,
- int* myrank_f,int* ncuda_devices) {
+ int* nspec_acoustic,int* nspec_elastic) {
TRACE("prepare_constants_device");
- int device_count = 0;
-
- // cuda initialization (needs -lcuda library)
- //cuInit(0);
- CUresult status = cuInit(0);
- if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device initialization failed");
-
- // Gets number of GPU devices
- cudaGetDeviceCount(&device_count);
- //printf("Cuda Devices: %d\n", device_count);
- if (device_count == 0) exit_on_error("There is no device supporting CUDA\n");
- *ncuda_devices = device_count;
-
- // Gets rank number of MPI process
- int myrank = *myrank_f;
-
- // Sets the active device
- if(device_count > 1) {
- // generalized for more GPUs per node
- cudaSetDevice( myrank%device_count );
- exit_on_cuda_error("cudaSetDevice");
- }
-
// allocates mesh parameter structure
Mesh* mp = (Mesh*) malloc( sizeof(Mesh) );
if (mp == NULL) exit_on_error("error allocating mesh pointer");
@@ -670,75 +763,9 @@
#endif
}
-/* ----------------------------------------------------------------------------------------------- */
-// purely adjoint & kernel simulations
-
/* ----------------------------------------------------------------------------------------------- */
-extern "C"
-void FC_FUNC_(prepare_sim2_or_3_const_device,
- PREPARE_SIM2_OR_3_CONST_DEVICE)(
- long* Mesh_pointer_f,
- int* islice_selected_rec,
- int* islice_selected_rec_size,
- int* nadj_rec_local,
- int* nrec,
- int* myrank) {
-
-TRACE("prepare_sim2_or_3_const_device");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-
- // allocates arrays for receivers
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_rec,
- *islice_selected_rec_size*sizeof(int)),7001);
- // copies arrays to GPU device
- print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_rec,islice_selected_rec,
- *islice_selected_rec_size*sizeof(int),cudaMemcpyHostToDevice),7002);
-
- // adjoint source arrays
- mp->nadj_rec_local = *nadj_rec_local;
- if( mp->nadj_rec_local > 0 ){
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
- (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),7003);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
- (mp->nadj_rec_local)*sizeof(int)),7004);
-
- // prepares local irec array:
- // the irec_local variable needs to be precomputed (as
- // h_pre_comp..), because normally it is in the loop updating accel,
- // and due to how it's incremented, it cannot be parallelized
- int* h_pre_computed_irec = (int*) malloc( (mp->nadj_rec_local)*sizeof(int) );
- if( h_pre_computed_irec == NULL ) exit_on_error("prepare_sim2_or_3_const_device: h_pre_computed_irec not allocated\n");
-
- int irec_local = 0;
- for(int irec = 0; irec < *nrec; irec++) {
- if(*myrank == islice_selected_rec[irec]) {
- irec_local++;
- h_pre_computed_irec[irec_local-1] = irec;
- }
- }
- if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n");
- // copies values onto GPU
- print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
- (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),7010);
- free(h_pre_computed_irec);
-
- // temporary array to prepare extracted source array values
- mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
- if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
-
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("prepare_sim2_or_3_const_device");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
// for ACOUSTIC simulations
/* ----------------------------------------------------------------------------------------------- */
@@ -778,42 +805,42 @@
int size_glob = mp->NGLOB_AB;
// allocates arrays on device (GPU)
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size_glob),9001);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size_glob),9002);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),9003);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size_glob),2001);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size_glob),2002);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),2003);
// mpi buffer
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
- (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),9004);
+ (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),2004);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),9005);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005);
print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
- sizeof(realw)*size_glob,cudaMemcpyHostToDevice),9100);
+ sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
// padded array
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),9006);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),2006);
// transfer constant element data with padding
for(int i=0; i < mp->NSPEC_AB; i++) {
print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*NGLL3_PADDED, &rhostore[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),9106);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),2106);
}
// non-padded array
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),9007);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),2007);
print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
- NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),9105);
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),2105);
// phase elements
mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic),
- mp->num_phase_ispec_acoustic*2*sizeof(int)),9008);
+ mp->num_phase_ispec_acoustic*2*sizeof(int)),2008);
print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic,
- mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),9101);
+ mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),2101);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic),
- mp->NSPEC_AB*sizeof(int)),9009);
+ mp->NSPEC_AB*sizeof(int)),2009);
print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic,
- mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),9102);
+ mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),2102);
// free surface
if( *NOISE_TOMOGRAPHY == 0 ){
@@ -821,30 +848,30 @@
mp->num_free_surface_faces = *num_free_surface_faces;
if( mp->num_free_surface_faces > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
- mp->num_free_surface_faces*sizeof(int)),9201);
+ mp->num_free_surface_faces*sizeof(int)),2201);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
- mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9203);
+ mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2203);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
- 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),9202);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),2202);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
- 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9204);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2204);
}
}
// absorbing boundaries
if( *ABSORBING_CONDITIONS ){
mp->d_b_reclen_potential = *b_reclen_potential;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),9301);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),2301);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,
- mp->d_b_reclen_potential,cudaMemcpyHostToDevice),9302);
+ mp->d_b_reclen_potential,cudaMemcpyHostToDevice),2302);
}
// for seismograms
if( mp->nrec_local > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
- mp->nrec_local*NGLL3*sizeof(realw)),9107);
+ mp->nrec_local*NGLL3*sizeof(realw)),2107);
mp->h_station_seismo_potential = (realw*) malloc( mp->nrec_local*NGLL3*sizeof(realw) );
if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
@@ -854,24 +881,24 @@
// coupling with elastic parts
if( *ELASTIC_SIMULATION && *num_coupling_ac_el_faces > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ispec),
- (*num_coupling_ac_el_faces)*sizeof(int)),9601);
+ (*num_coupling_ac_el_faces)*sizeof(int)),2601);
print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec,
- (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9602);
+ (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2602);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk),
- 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),9603);
+ 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),2603);
print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,
- 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9604);
+ 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2604);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal),
- 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),9605);
+ 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2605);
print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
- 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),9606);
+ 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2606);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw),
- NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),9607);
+ NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2607);
print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
- NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),9608);
+ NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2608);
}
@@ -906,26 +933,26 @@
if( *SIMULATION_TYPE != 3 ) return;
// allocates backward/reconstructed arrays on device (GPU)
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),9014);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size_glob),9015);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size_glob),9016);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),3014);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size_glob),3015);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size_glob),3016);
// allocates kernels
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9017);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9018);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3017);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3018);
// initializes kernel values to zero
print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),9019);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),3019);
print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),9020);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),3020);
// preconditioner
if( *APPROXIMATE_HESS_KL ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9030);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3030);
// initializes with zeros
print_CUDA_error_if_any(cudaMemset(mp->d_hess_ac_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),9031);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),3031);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1003,25 +1030,25 @@
int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),8001);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),8002);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),8003);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),4001);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),4003);
// mpi buffer
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
- 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),8004);
+ 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),4004);
// mass matrix
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),8005);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005);
// transfer element data
print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
- sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8010);
+ sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4010);
// element indices
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),8009);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),4009);
print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic,
- mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),8012);
+ mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),4012);
// daniel: check
//check_ispec_is(Mesh_pointer_f,1);
@@ -1029,9 +1056,9 @@
// phase elements
mp->num_phase_ispec_elastic = *num_phase_ispec_elastic;
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic),
- mp->num_phase_ispec_elastic*2*sizeof(int)),8008);
+ mp->num_phase_ispec_elastic*2*sizeof(int)),4008);
print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,
- mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),8011);
+ mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),4011);
//daniel: check
//check_phase_ispec(Mesh_pointer_f,1);
@@ -1039,7 +1066,7 @@
// for seismograms
if( mp->nrec_local > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
- 3*NGLL3*(mp->nrec_local)*sizeof(realw)),8015);
+ 3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
@@ -1048,22 +1075,22 @@
// absorbing conditions
if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){
// non-padded arrays
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),8006);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),8007);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),4006);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),4007);
// rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
- NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8013);
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4013);
print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
- NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8014);
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4014);
// absorb_field array used for file i/o
if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )){
mp->d_b_reclen_field = *h_b_reclen_field;
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field),
- mp->d_b_reclen_field),8016);
+ mp->d_b_reclen_field),4016);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
- mp->d_b_reclen_field,cudaMemcpyHostToDevice),8017);
+ mp->d_b_reclen_field,cudaMemcpyHostToDevice),4017);
}
}
@@ -1073,25 +1100,25 @@
int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
- epsilondev_size*sizeof(realw)),8301);
+ epsilondev_size*sizeof(realw)),4301);
print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw),
- cudaMemcpyHostToDevice),8302);
+ cudaMemcpyHostToDevice),4302);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
- epsilondev_size*sizeof(realw)),8302);
+ epsilondev_size*sizeof(realw)),4302);
print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw),
- cudaMemcpyHostToDevice),8303);
+ cudaMemcpyHostToDevice),4303);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
- epsilondev_size*sizeof(realw)),8304);
+ epsilondev_size*sizeof(realw)),4304);
print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw),
- cudaMemcpyHostToDevice),8305);
+ cudaMemcpyHostToDevice),4305);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
- epsilondev_size*sizeof(realw)),8306);
+ epsilondev_size*sizeof(realw)),4306);
print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw),
- cudaMemcpyHostToDevice),8307);
+ cudaMemcpyHostToDevice),4307);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
- epsilondev_size*sizeof(realw)),8308);
+ epsilondev_size*sizeof(realw)),4308);
print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw),
- cudaMemcpyHostToDevice),8309);
+ cudaMemcpyHostToDevice),4309);
}
@@ -1099,56 +1126,56 @@
if( *ATTENUATION ){
// memory arrays
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
- (*R_size)*sizeof(realw)),8401);
+ (*R_size)*sizeof(realw)),4401);
print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),8402);
+ cudaMemcpyHostToDevice),4402);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
- (*R_size)*sizeof(realw)),8403);
+ (*R_size)*sizeof(realw)),4403);
print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),8404);
+ cudaMemcpyHostToDevice),4404);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
- (*R_size)*sizeof(realw)),8405);
+ (*R_size)*sizeof(realw)),4405);
print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),8406);
+ cudaMemcpyHostToDevice),4406);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
- (*R_size)*sizeof(realw)),8407);
+ (*R_size)*sizeof(realw)),4407);
print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),8408);
+ cudaMemcpyHostToDevice),4408);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
- (*R_size)*sizeof(realw)),8409);
+ (*R_size)*sizeof(realw)),4409);
print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),8410);
+ cudaMemcpyHostToDevice),4410);
// attenuation factors
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
- NGLL3*mp->NSPEC_AB*sizeof(realw)),8430);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),4430);
print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
- NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8431);
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4431);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
- N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),8432);
+ N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),4432);
print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
- N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8433);
+ N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4433);
// alpha,beta,gamma factors
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
- N_SLS*sizeof(realw)),8434);
+ N_SLS*sizeof(realw)),4434);
print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
- N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8435);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
- N_SLS*sizeof(realw)),8436);
+ N_SLS*sizeof(realw)),4436);
print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
- N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8437);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
- N_SLS*sizeof(realw)),8438);
+ N_SLS*sizeof(realw)),4438);
print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
- N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8439);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439);
}
@@ -1156,92 +1183,92 @@
if( *ANISOTROPY ){
// allocates memory on GPU
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),
- size_padded*sizeof(realw)),8700);
+ size_padded*sizeof(realw)),4700);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),
- size_padded*sizeof(realw)),8701);
+ size_padded*sizeof(realw)),4701);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),
- size_padded*sizeof(realw)),8702);
+ size_padded*sizeof(realw)),4702);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),
- size_padded*sizeof(realw)),8703);
+ size_padded*sizeof(realw)),4703);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),
- size_padded*sizeof(realw)),8704);
+ size_padded*sizeof(realw)),4704);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),
- size_padded*sizeof(realw)),8705);
+ size_padded*sizeof(realw)),4705);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),
- size_padded*sizeof(realw)),8706);
+ size_padded*sizeof(realw)),4706);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),
- size_padded*sizeof(realw)),8707);
+ size_padded*sizeof(realw)),4707);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),
- size_padded*sizeof(realw)),8708);
+ size_padded*sizeof(realw)),4708);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),
- size_padded*sizeof(realw)),8709);
+ size_padded*sizeof(realw)),4709);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),
- size_padded*sizeof(realw)),8710);
+ size_padded*sizeof(realw)),4710);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),
- size_padded*sizeof(realw)),8711);
+ size_padded*sizeof(realw)),4711);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),
- size_padded*sizeof(realw)),8712);
+ size_padded*sizeof(realw)),4712);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),
- size_padded*sizeof(realw)),8713);
+ size_padded*sizeof(realw)),4713);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),
- size_padded*sizeof(realw)),8714);
+ size_padded*sizeof(realw)),4714);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),
- size_padded*sizeof(realw)),8715);
+ size_padded*sizeof(realw)),4715);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),
- size_padded*sizeof(realw)),8716);
+ size_padded*sizeof(realw)),4716);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),
- size_padded*sizeof(realw)),8717);
+ size_padded*sizeof(realw)),4717);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),
- size_padded*sizeof(realw)),8718);
+ size_padded*sizeof(realw)),4718);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),
- size_padded*sizeof(realw)),8719);
+ size_padded*sizeof(realw)),4719);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),
- size_padded*sizeof(realw)),8720);
+ size_padded*sizeof(realw)),4720);
// transfer constant element data with padding
for(int i=0;i < mp->NSPEC_AB;i++) {
print_CUDA_error_if_any(cudaMemcpy(mp->d_c11store + i*NGLL3_PADDED, &c11store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8800);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4800);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c12store + i*NGLL3_PADDED, &c12store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8801);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4801);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c13store + i*NGLL3_PADDED, &c13store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8802);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4802);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c14store + i*NGLL3_PADDED, &c14store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8803);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4803);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c15store + i*NGLL3_PADDED, &c15store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8804);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4804);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c16store + i*NGLL3_PADDED, &c16store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8805);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4805);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c22store + i*NGLL3_PADDED, &c22store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8806);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4806);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c23store + i*NGLL3_PADDED, &c23store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8807);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4807);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c24store + i*NGLL3_PADDED, &c24store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8808);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4808);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c25store + i*NGLL3_PADDED, &c25store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8809);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4809);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c26store + i*NGLL3_PADDED, &c26store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8810);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4810);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c33store + i*NGLL3_PADDED, &c33store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8811);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4811);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c34store + i*NGLL3_PADDED, &c34store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8812);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4812);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c35store + i*NGLL3_PADDED, &c35store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8813);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4813);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c36store + i*NGLL3_PADDED, &c36store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8814);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4814);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c44store + i*NGLL3_PADDED, &c44store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8815);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4815);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c45store + i*NGLL3_PADDED, &c45store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8816);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4816);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c46store + i*NGLL3_PADDED, &c46store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8817);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4817);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c55store + i*NGLL3_PADDED, &c55store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8818);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4818);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c56store + i*NGLL3_PADDED, &c56store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8819);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4819);
print_CUDA_error_if_any(cudaMemcpy(mp->d_c66store + i*NGLL3_PADDED, &c66store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8820);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4820);
}
}
@@ -1252,28 +1279,28 @@
if( mp->num_free_surface_faces > 0 ){
// mass matrix
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),
- sizeof(realw)*mp->NGLOB_AB),8501);
+ sizeof(realw)*mp->NGLOB_AB),4501);
print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
- sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8502);
+ sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4502);
// surface normal
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
- 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),8503);
+ 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),4503);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
- 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),8504);
+ 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),4504);
// temporary global array: used to synchronize updates on global accel array
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load),
- sizeof(int)*mp->NGLOB_AB),8505);
+ sizeof(int)*mp->NGLOB_AB),4505);
if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
- mp->num_free_surface_faces*sizeof(int)),8601);
+ mp->num_free_surface_faces*sizeof(int)),4601);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
- mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),8603);
+ mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4603);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
- 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),8602);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4602);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
- 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),8604);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4604);
}
}
}
@@ -1317,22 +1344,22 @@
// kernel simulations
// allocates backward/reconstructed arrays on device (GPU)
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*(*size)),8201);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*(*size)),8202);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*(*size)),8203);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*(*size)),5201);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*(*size)),5202);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*(*size)),5203);
// allocates kernels
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8204);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8205);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8206);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5204);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5205);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5206);
// initializes kernel values to zero
print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),8207);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),5207);
print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),8208);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),5208);
print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),8209);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),5209);
// strains used for attenuation and kernel simulations
if( *COMPUTE_AND_STORE_STRAIN ){
@@ -1341,87 +1368,87 @@
// solid pressure
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
- NGLL3*mp->NSPEC_AB*sizeof(realw)),8310);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),5310);
print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
- NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8311);
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5311);
// backward solid pressure
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
- NGLL3*mp->NSPEC_AB*sizeof(realw)),8312);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),5312);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
- NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8313);
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5313);
// prepares backward strains
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
- epsilondev_size*sizeof(realw)),8321);
+ epsilondev_size*sizeof(realw)),5321);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
- epsilondev_size*sizeof(realw)),8322);
+ epsilondev_size*sizeof(realw)),5322);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
- epsilondev_size*sizeof(realw)),8323);
+ epsilondev_size*sizeof(realw)),5323);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
- epsilondev_size*sizeof(realw)),8324);
+ epsilondev_size*sizeof(realw)),5324);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
- epsilondev_size*sizeof(realw)),8325);
+ epsilondev_size*sizeof(realw)),5325);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
- epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8326);
+ epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5326);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
- epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8327);
+ epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5327);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
- epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8328);
+ epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5328);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
- epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8329);
+ epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5329);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
- epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8330);
+ epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5330);
}
// attenuation memory variables
if( *ATTENUATION ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx),
- (*R_size)*sizeof(realw)),8421);
+ (*R_size)*sizeof(realw)),5421);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),8422);
+ cudaMemcpyHostToDevice),5422);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy),
- (*R_size)*sizeof(realw)),8423);
+ (*R_size)*sizeof(realw)),5423);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),8424);
+ cudaMemcpyHostToDevice),5424);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy),
- (*R_size)*sizeof(realw)),8425);
+ (*R_size)*sizeof(realw)),5425);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),8426);
+ cudaMemcpyHostToDevice),5426);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz),
- (*R_size)*sizeof(realw)),8427);
+ (*R_size)*sizeof(realw)),5427);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),8428);
+ cudaMemcpyHostToDevice),5428);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz),
- (*R_size)*sizeof(realw)),8429);
+ (*R_size)*sizeof(realw)),5429);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),8420);
+ cudaMemcpyHostToDevice),5420);
// alpha,beta,gamma factors for backward fields
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval),
- N_SLS*sizeof(realw)),8434);
+ N_SLS*sizeof(realw)),5434);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
- N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8435);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5435);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval),
- N_SLS*sizeof(realw)),8436);
+ N_SLS*sizeof(realw)),5436);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
- N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8437);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5437);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval),
- N_SLS*sizeof(realw)),8438);
+ N_SLS*sizeof(realw)),5438);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
- N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8439);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5439);
}
if( *APPROXIMATE_HESS_KL ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8450);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5450);
// initializes with zeros
print_CUDA_error_if_any(cudaMemset(mp->d_hess_el_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),8451);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),5451);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1429,10 +1456,76 @@
#endif
}
+/* ----------------------------------------------------------------------------------------------- */
+// purely adjoint & kernel simulations
/* ----------------------------------------------------------------------------------------------- */
+extern "C"
+void FC_FUNC_(prepare_sim2_or_3_const_device,
+ PREPARE_SIM2_OR_3_CONST_DEVICE)(
+ long* Mesh_pointer_f,
+ int* islice_selected_rec,
+ int* islice_selected_rec_size,
+ int* nadj_rec_local,
+ int* nrec,
+ int* myrank) {
+
+ TRACE("prepare_sim2_or_3_const_device");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+ // allocates arrays for receivers
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_rec,
+ *islice_selected_rec_size*sizeof(int)),6001);
+ // copies arrays to GPU device
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_rec,islice_selected_rec,
+ *islice_selected_rec_size*sizeof(int),cudaMemcpyHostToDevice),6002);
+
+ // adjoint source arrays
+ mp->nadj_rec_local = *nadj_rec_local;
+ if( mp->nadj_rec_local > 0 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
+ (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),6003);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
+ (mp->nadj_rec_local)*sizeof(int)),6004);
+
+ // prepares local irec array:
+ // the irec_local variable needs to be precomputed (as
+ // h_pre_comp..), because normally it is in the loop updating accel,
+ // and due to how it's incremented, it cannot be parallelized
+ int* h_pre_computed_irec = (int*) malloc( (mp->nadj_rec_local)*sizeof(int) );
+ if( h_pre_computed_irec == NULL ) exit_on_error("prepare_sim2_or_3_const_device: h_pre_computed_irec not allocated\n");
+
+ int irec_local = 0;
+ for(int irec = 0; irec < *nrec; irec++) {
+ if(*myrank == islice_selected_rec[irec]) {
+ irec_local++;
+ h_pre_computed_irec[irec_local-1] = irec;
+ }
+ }
+ if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n");
+ // copies values onto GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
+ (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),6010);
+ free(h_pre_computed_irec);
+
+ // temporary array to prepare extracted source array values
+ mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
+ if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
+
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("prepare_sim2_or_3_const_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
// for NOISE simulations
/* ----------------------------------------------------------------------------------------------- */
@@ -1462,25 +1555,25 @@
mp->num_free_surface_faces = *num_free_surface_faces;
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec,
- mp->num_free_surface_faces*sizeof(int)),4001);
+ mp->num_free_surface_faces*sizeof(int)),7001);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec,
- mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4002);
+ mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7002);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk,
- 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4003);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),7003);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
- 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4004);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7004);
// alloc storage for the surface buffer to be copied
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie,
- 3*NGLL2*mp->num_free_surface_faces*sizeof(realw)),4005);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(realw)),7005);
// prepares noise source array
if( *NOISE_TOMOGRAPHY == 1 ){
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
- 3*NGLL3*(*NSTEP)*sizeof(realw)),4101);
+ 3*NGLL3*(*NSTEP)*sizeof(realw)),7101);
print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
- 3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),4102);
+ 3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),7102);
}
// prepares noise directions
@@ -1488,35 +1581,35 @@
int nface_size = NGLL2*(*num_free_surface_faces);
// allocates memory on GPU
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
- nface_size*sizeof(realw)),4301);
+ nface_size*sizeof(realw)),7301);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
- nface_size*sizeof(realw)),4302);
+ nface_size*sizeof(realw)),7302);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
- nface_size*sizeof(realw)),4303);
+ nface_size*sizeof(realw)),7303);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise,
- nface_size*sizeof(realw)),4304);
+ nface_size*sizeof(realw)),7304);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
- nface_size*sizeof(realw)),4305);
+ nface_size*sizeof(realw)),7305);
// transfers data onto GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise,
- nface_size*sizeof(realw),cudaMemcpyHostToDevice),4306);
+ nface_size*sizeof(realw),cudaMemcpyHostToDevice),7306);
print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise,
- nface_size*sizeof(realw),cudaMemcpyHostToDevice),4307);
+ nface_size*sizeof(realw),cudaMemcpyHostToDevice),7307);
print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise,
- nface_size*sizeof(realw),cudaMemcpyHostToDevice),4308);
+ nface_size*sizeof(realw),cudaMemcpyHostToDevice),7308);
print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise,
- nface_size*sizeof(realw),cudaMemcpyHostToDevice),4309);
+ nface_size*sizeof(realw),cudaMemcpyHostToDevice),7309);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw,
- nface_size*sizeof(realw),cudaMemcpyHostToDevice),4310);
+ nface_size*sizeof(realw),cudaMemcpyHostToDevice),7310);
}
// prepares noise strength kernel
if( *NOISE_TOMOGRAPHY == 3 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
- NGLL3*(mp->NSPEC_AB)*sizeof(realw)),4401);
+ NGLL3*(mp->NSPEC_AB)*sizeof(realw)),7401);
// initializes kernel values to zero
print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),4403);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),7403);
}
@@ -1551,25 +1644,25 @@
mp->gravity = *GRAVITY;
if( mp->gravity ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_deriv_gravity),
- (mp->NGLOB_AB)*sizeof(realw)),7700);
+ (mp->NGLOB_AB)*sizeof(realw)),8000);
print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_deriv_gravity, minus_deriv_gravity,
- (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),7701);
+ (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8001);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_g),
- (mp->NGLOB_AB)*sizeof(realw)),7702);
+ (mp->NGLOB_AB)*sizeof(realw)),8002);
print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_g, minus_g,
- (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),7703);
+ (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8003);
if( *ACOUSTIC_SIMULATION == 0 ){
// rhostore not allocated yet
int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
// padded array
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),7706);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),8006);
// transfer constant element data with padding
for(int i=0; i < mp->NSPEC_AB; i++) {
print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*NGLL3_PADDED, &rhostore[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),7707);
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8007);
}
}
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c 2011-11-17 23:51:03 UTC (rev 19213)
@@ -72,7 +72,7 @@
fp = fopen(filename, "wb");
fwrite(vector, sizeof(int), *size, fp);
fclose(fp);
-
+
}
@@ -82,7 +82,7 @@
int procid;
MPI_Comm_rank(MPI_COMM_WORLD,&procid);
sprintf(filename,"/scratch/eiger/rietmann/SPECFEM3D_AIGLE/in_out_files/DATABASES_MPI/proc%06d_surface_movie",procid);
-
+
FILE* fp; int it;
printf("Opening %s for analysis\n",filename);
fp = fopen(filename,"rb");
@@ -92,8 +92,8 @@
printf("FILE ERROR:%s\n",strerror(errno));
perror("file error\n");
exit(1);
- }
-
+ }
+
float* vector = (float*)malloc(nodes_per_iteration*sizeof(float));
float max_val;
int i;
@@ -121,7 +121,7 @@
if(vector1[i] != vector2[i]) {
error_count++;
if(error_count < 10) {
- printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]);
+ printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]);
}
}
}
@@ -137,20 +137,20 @@
for(i=0;i<size;++i) {
if(vector1[i] != 0) {
if( fabsf(vector1[i]-vector2[i])/vector1[i] > 0.01) {
- if(fabsf(vector1[i]-vector2[i]) > 1e-20) {
- error_count++;
- if(error_count<10) {
- printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]);
- }
+ if(fabsf(vector1[i]-vector2[i]) > 1e-20) {
+ error_count++;
+ if(error_count<10) {
+ printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]);
}
}
+ }
}
/* if(vector1[i] != vector2[i]) { */
/* if(fabsf(vector1[i]-vector2[i]) > 1e-25) { */
- /* error_count++; */
- /* if(error_count<50) { */
- /* printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]); */
- /* } */
+ /* error_count++; */
+ /* if(error_count<50) { */
+ /* printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]); */
+ /* } */
/* } */
/* } */
}
@@ -159,7 +159,7 @@
}
void compare_surface_files_(int* bytes_per_iteration, int* number_of_iterations) {
-
+
char* cpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_proc000001_surface_movie";
char* gpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_v2_proc000001_surface_movie";
@@ -169,7 +169,7 @@
if(fp_cpu == 0) {
//errorstr = (char*) strerror(errno);
//printf("CPU FILE ERROR:%s\n",errorstr);
- printf("CPU FILE ERROR:%s\n",strerror(errno));
+ printf("CPU FILE ERROR:%s\n",strerror(errno));
perror("cpu file error\n");
}
FILE* fp_gpu;
@@ -178,14 +178,14 @@
if(fp_gpu == NULL) {
//errorstr = (char*) strerror(errno);
//printf("GPU FILE ERROR:%s\n",errorstr);
- printf("GPU FILE ERROR:%s\n",strerror(errno));
+ printf("GPU FILE ERROR:%s\n",strerror(errno));
perror("gpu file error\n");
}
-
+
/* pause_for_debug(); */
-
+
float* gpu_vector = (float*)malloc(*bytes_per_iteration);
- float* cpu_vector = (float*)malloc(*bytes_per_iteration);
+ float* cpu_vector = (float*)malloc(*bytes_per_iteration);
int i,it,error_count=0;
for(it=0;it<*number_of_iterations;it++) {
int pos = (*bytes_per_iteration)*(it);
@@ -203,14 +203,14 @@
float cpu_max_val=0;
if(it<100) {
for(i=0;i<size;i++) {
- if((fabs(cpu_vector[i] - gpu_vector[i])/(fabs(cpu_vector[i])+1e-31) > 0.01)) {
- if(error_count < 30) printf("ERROR[%d]: %g != %g\n",i,cpu_vector[i], gpu_vector[i]);
- if(cpu_vector[i] > 1e-30) error_count++;
- }
- if(gpu_vector[i]>gpu_max_val) gpu_max_val = gpu_vector[i];
- if(gpu_vector[i]<gpu_min_val) gpu_min_val = gpu_vector[i];
- if(cpu_vector[i]>cpu_max_val) cpu_max_val = cpu_vector[i];
- if(cpu_vector[i]<cpu_min_val) cpu_min_val = cpu_vector[i];
+ if((fabs(cpu_vector[i] - gpu_vector[i])/(fabs(cpu_vector[i])+1e-31) > 0.01)) {
+ if(error_count < 30) printf("ERROR[%d]: %g != %g\n",i,cpu_vector[i], gpu_vector[i]);
+ if(cpu_vector[i] > 1e-30) error_count++;
+ }
+ if(gpu_vector[i]>gpu_max_val) gpu_max_val = gpu_vector[i];
+ if(gpu_vector[i]<gpu_min_val) gpu_min_val = gpu_vector[i];
+ if(cpu_vector[i]>cpu_max_val) cpu_max_val = cpu_vector[i];
+ if(cpu_vector[i]<cpu_min_val) cpu_min_val = cpu_vector[i];
}
printf("%d Total Errors\n",error_count);
printf("size:%d\n",size);
@@ -245,7 +245,7 @@
else
printf("File read error.");
}
-
+
fclose(fp);
int i;
@@ -253,11 +253,11 @@
for(i=0;i<*size;i++) {
if((fabs(vector[i] - compare_vector[i])/vector[i] > 0.0001)) {
if(error_count < 30) {
- printf("ERROR[%d]: %g != %g\n",i,compare_vector[i], vector[i]);
+ printf("ERROR[%d]: %g != %g\n",i,compare_vector[i], vector[i]);
}
error_count++;
- /* if(compare_vector[i] > 1e-30) error_count++; */
- }
+ /* if(compare_vector[i] > 1e-30) error_count++; */
+ }
}
printf("%d Total Errors\n",error_count);
printf("size:%d\n",*size);
@@ -289,7 +289,7 @@
else
printf("File read error.");
}
-
+
fclose(fp);
int i;
@@ -298,7 +298,7 @@
if((abs(vector[i] - compare_vector[i])/vector[i] > 0.01) && error_count < 30) {
printf("ERROR[%d]: %g != %g\n",i,compare_vector[i], vector[i]);
error_count++;
- }
+ }
}
printf("%d Total Errors\n",error_count);
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2011-11-17 23:51:03 UTC (rev 19213)
@@ -6,9 +6,9 @@
/* from check_fields_cuda.cu */
void FC_FUNC_(check_max_norm_displ_gpu,
CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID){}
-
+
void FC_FUNC_(check_max_norm_vector,
- CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID){}
+ CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID){}
void FC_FUNC_(check_max_norm_displ,
CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID){}
@@ -34,23 +34,23 @@
GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer){}
void FC_FUNC_(get_norm_acoustic_from_device,
- GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
+ GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
int* SIMULATION_TYPE){}
void FC_FUNC_(get_norm_elastic_from_device,
- GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
+ GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
int* SIMULATION_TYPE){}
-
+
/* from file compute_add_sources_elastic_cuda.cu */
void FC_FUNC_(compute_add_sources_el_cuda,
COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
int* phase_is_innerf,
int* NSOURCESf,
- double* h_stf_pre_compute,
+ double* h_stf_pre_compute,
int* myrankf){}
void FC_FUNC_(compute_add_sources_el_s3_cuda,
@@ -61,21 +61,21 @@
int* myrank){}
void FC_FUNC_(add_source_master_rec_noise_cu,
- ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
- int* myrank_f,
- int* it_f,
- int* irec_master_noise_f,
+ ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
+ int* myrank_f,
+ int* it_f,
+ int* irec_master_noise_f,
int* islice_selected_rec){}
void FC_FUNC_(add_sources_el_sim_type_2_or_3,
- ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
+ ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
realw* h_adj_sourcearrays,
int* phase_is_inner,
int* h_ispec_is_inner,
- int* h_ispec_is_elastic,
+ int* h_ispec_is_elastic,
int* h_ispec_selected_rec,
- int* myrank,
- int* nrec,
+ int* myrank,
+ int* nrec,
int* time_index,
int* h_islice_selected_rec,
int* nadj_rec_local,
@@ -84,23 +84,23 @@
/* from file compute_add_sources_acoustic_cuda.cu */
void FC_FUNC_(compute_add_sources_ac_cuda,
- COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
int* phase_is_innerf,
- int* NSOURCESf,
+ int* NSOURCESf,
int* SIMULATION_TYPEf,
- double* h_stf_pre_compute,
+ double* h_stf_pre_compute,
int* myrankf){}
void FC_FUNC_(compute_add_sources_ac_s3_cuda,
- COMPUTE_ADD_SOURCES_AC_S3_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_ADD_SOURCES_AC_S3_CUDA)(long* Mesh_pointer_f,
int* phase_is_innerf,
- int* NSOURCESf,
+ int* NSOURCESf,
int* SIMULATION_TYPEf,
- double* h_stf_pre_compute,
+ double* h_stf_pre_compute,
int* myrankf){}
void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
- ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
+ ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
realw* h_adj_sourcearrays,
int* phase_is_inner,
int* h_ispec_is_inner,
@@ -114,43 +114,43 @@
int* NTSTEP_BETWEEN_ADJSRC){}
/* from compute_coupling_cuda.cu */
-
+
void FC_FUNC_(compute_coupling_ac_el_cuda,
COMPUTE_COUPLING_AC_EL_CUDA)(
- long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* num_coupling_ac_el_facesf,
+ long* Mesh_pointer_f,
+ int* phase_is_innerf,
+ int* num_coupling_ac_el_facesf,
int* SIMULATION_TYPEf){}
void FC_FUNC_(compute_coupling_el_ac_cuda,
COMPUTE_COUPLING_EL_AC_CUDA)(
- long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* num_coupling_ac_el_facesf,
+ long* Mesh_pointer_f,
+ int* phase_is_innerf,
+ int* num_coupling_ac_el_facesf,
int* SIMULATION_TYPEf){}
/* from compute_forces_acoustic_cuda.cu */
void FC_FUNC_(transfer_boun_pot_from_device,
TRANSFER_BOUN_POT_FROM_DEVICE)(
- int* size,
- long* Mesh_pointer_f,
- realw* potential_dot_dot_acoustic,
+ int* size,
+ long* Mesh_pointer_f,
+ realw* potential_dot_dot_acoustic,
realw* send_potential_dot_dot_buffer,
- int* num_interfaces_ext_mesh,
+ int* num_interfaces_ext_mesh,
int* max_nibool_interfaces_ext_mesh,
- int* nibool_interfaces_ext_mesh,
+ int* nibool_interfaces_ext_mesh,
int* ibool_interfaces_ext_mesh,
int* FORWARD_OR_ADJOINT){}
void FC_FUNC_(transfer_asmbl_pot_to_device,
TRANSFER_ASMBL_POT_TO_DEVICE)(
- long* Mesh_pointer,
- realw* potential_dot_dot_acoustic,
+ long* Mesh_pointer,
+ realw* potential_dot_dot_acoustic,
realw* buffer_recv_scalar_ext_mesh,
- int* num_interfaces_ext_mesh,
+ int* num_interfaces_ext_mesh,
int* max_nibool_interfaces_ext_mesh,
- int* nibool_interfaces_ext_mesh,
+ int* nibool_interfaces_ext_mesh,
int* ibool_interfaces_ext_mesh,
int* FORWARD_OR_ADJOINT){}
@@ -163,18 +163,18 @@
void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
long* Mesh_pointer,
- int* size_F,
+ int* size_F,
int* SIMULATION_TYPE){}
void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
long* Mesh_pointer,
- int* size_F,
- realw* deltatover2_F,
- int* SIMULATION_TYPE,
+ int* size_F,
+ realw* deltatover2_F,
+ int* SIMULATION_TYPE,
realw* b_deltatover2_F){}
void FC_FUNC_(acoustic_enforce_free_surf_cuda,
- ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
+ ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
int* SIMULATION_TYPE,
int* ABSORB_FREE_SURFACE){}
@@ -182,22 +182,22 @@
/* from compute_forces_elastic_cuda.cu */
void FC_FUNC_(transfer_boun_accel_from_device,
TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
- realw* send_accel_buffer,
- int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh,
- int* nibool_interfaces_ext_mesh,
- int* ibool_interfaces_ext_mesh,
- int* FORWARD_OR_ADJOINT){}
+ realw* send_accel_buffer,
+ int* num_interfaces_ext_mesh,
+ int* max_nibool_interfaces_ext_mesh,
+ int* nibool_interfaces_ext_mesh,
+ int* ibool_interfaces_ext_mesh,
+ int* FORWARD_OR_ADJOINT){}
-
+
void FC_FUNC_(transfer_asmbl_accel_to_device,
- TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
+ TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
realw* accel,
realw* buffer_recv_vector_ext_mesh,
int* num_interfaces_ext_mesh,
int* max_nibool_interfaces_ext_mesh,
int* nibool_interfaces_ext_mesh,
- int* ibool_interfaces_ext_mesh,int* FORWARD_OR_ADJOINT){}
+ int* ibool_interfaces_ext_mesh,int* FORWARD_OR_ADJOINT){}
void FC_FUNC_(compute_forces_elastic_cuda,
COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f,
@@ -211,91 +211,91 @@
void FC_FUNC_(kernel_3_a_cuda,
KERNEL_3_A_CUDA)(long* Mesh_pointer,
- int* size_F,
- realw* deltatover2_F,
- int* SIMULATION_TYPE_f,
+ int* size_F,
+ realw* deltatover2_F,
+ int* SIMULATION_TYPE_f,
realw* b_deltatover2_F,
int* OCEANS){}
void FC_FUNC_(kernel_3_b_cuda,
KERNEL_3_B_CUDA)(long* Mesh_pointer,
- int* size_F,
- realw* deltatover2_F,
- int* SIMULATION_TYPE_f,
- realw* b_deltatover2_F){}
+ int* size_F,
+ realw* deltatover2_F,
+ int* SIMULATION_TYPE_f,
+ realw* b_deltatover2_F){}
void FC_FUNC_(elastic_ocean_load_cuda,
- ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
+ ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
int* SIMULATION_TYPE){}
/* from file compute_kernels_cuda.cu */
-
+
void FC_FUNC_(compute_kernels_elastic_cuda,
COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
realw* deltat_f){}
void FC_FUNC_(compute_kernels_strgth_noise_cu,
- COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
+ COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
realw* h_noise_surface_movie,
realw* deltat){}
void FC_FUNC_(compute_kernels_acoustic_cuda,
COMPUTE_KERNELS_ACOUSTIC_CUDA)(
- long* Mesh_pointer,
+ long* Mesh_pointer,
realw* deltat_f){}
void FC_FUNC_(compute_kernels_hess_cuda,
COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
realw* deltat_f) {}
-
+
/* from file compute_stacey_acoustic_cuda.cu */
void FC_FUNC_(compute_stacey_acoustic_cuda,
COMPUTE_STACEY_ACOUSTIC_CUDA)(
- long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* SIMULATION_TYPEf,
+ long* Mesh_pointer_f,
+ int* phase_is_innerf,
+ int* SIMULATION_TYPEf,
int* SAVE_FORWARDf,
realw* h_b_absorb_potential){}
/* from file compute_stacey_elastic_cuda.cu */
-
+
void FC_FUNC_(compute_stacey_elastic_cuda,
- COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* SIMULATION_TYPEf,
+ COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
+ int* phase_is_innerf,
+ int* SIMULATION_TYPEf,
int* SAVE_FORWARDf,
realw* h_b_absorb_field){}
/* from file it_update_displacement_cuda.cu */
-
+
void FC_FUNC_(it_update_displacement_cuda,
it_update_displacement_cuda)(long* Mesh_pointer_f,
- int* size_F,
- realw* deltat_F,
- realw* deltatsqover2_F,
+ int* size_F,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
realw* deltatover2_F,
- int* SIMULATION_TYPE,
- realw* b_deltat_F,
- realw* b_deltatsqover2_F,
+ int* SIMULATION_TYPE,
+ realw* b_deltat_F,
+ realw* b_deltatsqover2_F,
realw* b_deltatover2_F){}
void FC_FUNC_(it_update_displacement_ac_cuda,
- IT_UPDATE_DISPLACEMENT_AC_CUDA)(long* Mesh_pointer_f,
+ IT_UPDATE_DISPLACEMENT_AC_CUDA)(long* Mesh_pointer_f,
int* size_F,
- realw* deltat_F,
- realw* deltatsqover2_F,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
realw* deltatover2_F,
- int* SIMULATION_TYPE,
- realw* b_deltat_F,
- realw* b_deltatsqover2_F,
+ int* SIMULATION_TYPE,
+ realw* b_deltat_F,
+ realw* b_deltatsqover2_F,
realw* b_deltatover2_F){}
/* from file noise_tomography_cuda.cu */
-
-void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
-void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id){}
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id){}
+
void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val){}
void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val){}
@@ -307,13 +307,13 @@
realw* h_noise_surface_movie){}
void FC_FUNC_(noise_read_add_surface_movie_cu,
- NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
- realw* h_noise_surface_movie,
+ NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
+ realw* h_noise_surface_movie,
int* NOISE_TOMOGRAPHYf){}
-
-/* from file prepare_mesh_constants_cuda.cu */
-
+
+/* from file prepare_mesh_constants_cuda.cu */
+
void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)(){}
void FC_FUNC_(output_free_device_memory,
OUTPUT_FREE_DEVICE_MEMORY)(int* id){}
@@ -324,26 +324,30 @@
void FC_FUNC_(get_free_device_memory,
get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ){}
+extern "C"
+void FC_FUNC_(prepare_cuda_device,
+ PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices){}
+
void FC_FUNC_(prepare_constants_device,
PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
- int* h_NGLLX,
+ int* h_NGLLX,
int* NSPEC_AB, int* NGLOB_AB,
realw* h_xix, realw* h_xiy, realw* h_xiz,
realw* h_etax, realw* h_etay, realw* h_etaz,
realw* h_gammax, realw* h_gammay, realw* h_gammaz,
realw* h_kappav, realw* h_muv,
- int* h_ibool,
+ int* h_ibool,
int* num_interfaces_ext_mesh, int* max_nibool_interfaces_ext_mesh,
int* h_nibool_interfaces_ext_mesh, int* h_ibool_interfaces_ext_mesh,
- realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
+ realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
- realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
- int* ABSORBING_CONDITIONS,
+ realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
+ int* ABSORBING_CONDITIONS,
int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
realw* h_abs_boundary_normal,
realw* h_abs_boundary_jacobian2Dw,
int* h_num_abs_boundary_faces,
- int* h_ispec_is_inner,
+ int* h_ispec_is_inner,
int* NSOURCES,
int* nsources_local,
realw* h_sourcearrays,
@@ -355,8 +359,7 @@
int* nrec_local_f,
int* SIMULATION_TYPE,
int* USE_MESH_COLORING_GPU,
- int* nspec_acoustic,int* nspec_elastic,
- int* myrank_f, int* ncuda_devices)
+ int* nspec_acoustic,int* nspec_elastic)
{
fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
exit(1);
@@ -372,11 +375,11 @@
int* myrank){}
void FC_FUNC_(prepare_fields_acoustic_device,
- PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
- realw* rmass_acoustic,
+ PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
+ realw* rmass_acoustic,
realw* rhostore,
realw* kappastore,
- int* num_phase_ispec_acoustic,
+ int* num_phase_ispec_acoustic,
int* phase_ispec_inner_acoustic,
int* ispec_is_acoustic,
int* NOISE_TOMOGRAPHY,
@@ -394,12 +397,12 @@
realw* coupling_ac_el_jacobian2Dw,
int* num_colors_outer_acoustic,
int* num_colors_inner_acoustic,
- int* num_elem_colors_acoustic){}
+ int* num_elem_colors_acoustic){}
void FC_FUNC_(prepare_fields_acoustic_adj_dev,
PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
int* SIMULATION_TYPE,
- int* APPROXIMATE_HESS_KL) {}
+ int* APPROXIMATE_HESS_KL) {}
void FC_FUNC_(prepare_fields_elastic_device,
PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
int* size,
@@ -416,7 +419,7 @@
int* COMPUTE_AND_STORE_STRAIN,
realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
realw* epsilondev_xz,realw* epsilondev_yz,
- int* ATTENUATION,
+ int* ATTENUATION,
int* R_size,
realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
realw* one_minus_sum_beta,realw* factor_common,
@@ -426,7 +429,7 @@
int* NOISE_TOMOGRAPHY,
realw* free_surface_normal,
int* free_surface_ispec,
- int* free_surface_ijk,
+ int* free_surface_ijk,
int* num_free_surface_faces,
int* ACOUSTIC_SIMULATION,
int* num_colors_outer_elastic,
@@ -441,7 +444,7 @@
realw *c16store,
realw *c22store,
realw *c23store,
- realw *c24store,
+ realw *c24store,
realw *c25store,
realw *c26store,
realw *c33store,
@@ -454,25 +457,25 @@
realw *c55store,
realw *c56store,
realw *c66store){}
-
+
void FC_FUNC_(prepare_fields_elastic_adj_dev,
PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
int* size,
int* SIMULATION_TYPE,
int* COMPUTE_AND_STORE_STRAIN,
- realw* epsilon_trace_over_3,
+ realw* epsilon_trace_over_3,
realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
realw* b_epsilondev_xz,realw* b_epsilondev_yz,
realw* b_epsilon_trace_over_3,
- int* ATTENUATION,
+ int* ATTENUATION,
int* R_size,
realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
int* APPROXIMATE_HESS_KL){}
-
+
void FC_FUNC_(prepare_fields_noise_device,
- PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
+ PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
int* NSPEC_AB, int* NGLOB_AB,
int* free_surface_ispec,
int* free_surface_ijk,
@@ -504,12 +507,12 @@
void FC_FUNC_(prepare_fields_gravity_device,
PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
int* GRAVITY,
- realw* minus_deriv_gravity,
+ realw* minus_deriv_gravity,
realw* minus_g,
realw* h_wgll_cube,
int* ACOUSTIC_SIMULATION,
realw* rhostore){}
-/* from file transfer_fields_cuda.cu */
+/* from file transfer_fields_cuda.cu */
void FC_FUNC_(transfer_fields_el_to_device,
TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f){}
@@ -573,7 +576,7 @@
realw* b_epsilon_trace_over_3,
int* size_epsilon_trace_over_3) {}
*/
-
+
void FC_FUNC_(transfer_b_fields_att_to_device,
TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
@@ -597,46 +600,46 @@
int* size_epsilondev){}
void FC_FUNC_(transfer_kernels_el_to_host,
- TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
+ TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
realw* h_rho_kl,
- realw* h_mu_kl,
+ realw* h_mu_kl,
realw* h_kappa_kl,
int* NSPEC_AB){}
void FC_FUNC_(transfer_kernels_noise_to_host,
- TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
+ TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
realw* h_Sigma_kl,
int* NSPEC_AB){}
-
+
void FC_FUNC_(transfer_fields_ac_to_device,
TRANSFER_FIELDS_AC_TO_DEVICE)(
- int* size,
- realw* potential_acoustic,
- realw* potential_dot_acoustic,
+ int* size,
+ realw* potential_acoustic,
+ realw* potential_dot_acoustic,
realw* potential_dot_dot_acoustic,
long* Mesh_pointer_f){}
void FC_FUNC_(transfer_b_fields_ac_to_device,
TRANSFER_B_FIELDS_AC_TO_DEVICE)(
- int* size,
- realw* b_potential_acoustic,
- realw* b_potential_dot_acoustic,
+ int* size,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
realw* b_potential_dot_dot_acoustic,
long* Mesh_pointer_f){}
void FC_FUNC_(transfer_fields_ac_from_device,TRANSFER_FIELDS_AC_FROM_DEVICE)(
- int* size,
- realw* potential_acoustic,
- realw* potential_dot_acoustic,
+ int* size,
+ realw* potential_acoustic,
+ realw* potential_dot_acoustic,
realw* potential_dot_dot_acoustic,
long* Mesh_pointer_f){}
void FC_FUNC_(transfer_b_fields_ac_from_device,
TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
- int* size,
- realw* b_potential_acoustic,
- realw* b_potential_dot_acoustic,
+ int* size,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
realw* b_potential_dot_dot_acoustic,
long* Mesh_pointer_f){}
@@ -647,7 +650,7 @@
TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f){}
void FC_FUNC_(transfer_kernels_ac_to_host,
- TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
+ TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
realw* h_rho_ac_kl,
realw* h_kappa_ac_kl,
int* NSPEC_AB){}
@@ -675,8 +678,8 @@
realw* potential_acoustic,
realw* potential_dot_acoustic,
realw* potential_dot_dot_acoustic,
- realw* b_potential_acoustic,
- realw* b_potential_dot_acoustic,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
realw* b_potential_dot_dot_acoustic,
long* Mesh_pointer_f,
int* number_receiver_global,
@@ -685,5 +688,5 @@
int* ibool,
int* SIMULATION_TYPEf){}
-
-
+
+
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2011-11-17 23:51:03 UTC (rev 19213)
@@ -79,7 +79,7 @@
int num_blocks_x = mp->nrec_local;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
@@ -205,7 +205,7 @@
int num_blocks_x = mp->nrec_local;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
num_blocks_y = num_blocks_y*2;
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/param_reader.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/param_reader.c 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/param_reader.c 2011-11-17 23:51:03 UTC (rev 19213)
@@ -142,7 +142,7 @@
/* Regular expression for parsing lines from param file.
** Good luck reading this regular expression. Basically, the lines of
** the parameter file should be of the form 'parameter = value',
- ** optionally followed by a #-delimited comment.
+ ** optionally followed by a #-delimited comment.
** 'value' can be any number of space- or tab-separated words. Blank
** lines, lines containing only white space and lines whose first non-
** whitespace character is '#' are ignored. White space is generally
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c 2011-11-17 23:51:03 UTC (rev 19213)
@@ -1,683 +1,683 @@
-/*
-!=====================================================================
-!
-! S p e c f e m 3 D V e r s i o n 2 . 0
-! ---------------------------------------
-!
-! Main authors: Dimitri Komatitsch and Jeroen Tromp
-! Princeton University, USA and University of Pau / CNRS / INRIA
-! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
-! April 2011
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-*/
-
-// for large files
-#define _FILE_OFFSET_BITS 64
-
-// after Brian's function
-
-#include "config.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-static int fd;
-
-void
-FC_FUNC_(open_file,OPEN_FILE)(char *file) {
- /* fprintf(stderr, "Opening file: %s\n", file); */
- fd = open(file, O_WRONLY | O_CREAT, 0644);
- if(fd == -1) {
- fprintf(stderr, "Error opening file: %s exiting\n", file);
- exit(-1);
- }
-}
-
-void
-FC_FUNC_(close_file,CLOSE_FILE)() {
- /* fprintf(stderr, "Closing file\n"); */
- close(fd);
-}
-
-void
-FC_FUNC_(write_integer,WRITE_INTEGER)(int *z) {
- int dummy_unused_variable = write(fd, z, sizeof(int));
-}
-
-void
-FC_FUNC_(write_real,WRITE_REAL)(float *z) {
- int dummy_unused_variable = write(fd, z, sizeof(float));
-}
-
-
-/* ---------------------------------------
-
- IO performance test
-
- Software Optimization for High Performance Computing: Creating Faster Applications
-
- By Isom L. Crawford and Kevin R. Wadleigh
- Jul 18, 2003
-
- - uses functions fopen/fread/fwrite for binary file I/O
-
- --------------------------------------- */
-
-#define __USE_GNU
-#include <string.h>
-#include <regex.h>
-
-#define MIN(x,y) ((x) < (y) ? (x) : (y))
-
-/* fastest performance on nehalem nodes:
-
- Linux 2.6.18-164.11.1.el5 #1 SMP Wed Jan 20 10:04:55 EST 2010 x86_64 x86_64 x86_64 GNU/Linux
-
- achieved with 16 KB buffers: */
-
-//#define MAX_B 65536 // 64 KB
-//#define MAX_B 32768 // 32 KB
-#define MAX_B 16384 // 16 KB
-//#define MAX_B 8192 // 8 KB
-
-// absorbing files: instead of passing file descriptor, we use the array index
-// index 0 for elastic domain file
-// index 1 for acoustic domain file
-// (reserved, but unused yet) index 2 - for NOISE_TOMOGRAPHY (SURFACE_MOVIE)
-#define ABS_FILEID 3 // number of absorbing files, note: in C we start indexing arrays from 0
-
-// file points
-static FILE * fp_abs[ABS_FILEID];
-// file work buffers
-static char * work_buffer[ABS_FILEID];
-
-
-//void
-//FC_FUNC_(open_file_abs_r_fbin,OPEN_FILE_ABS_R_FBIN)(int *fid, char *filename,int *length, int *filesize){
-void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *filesize){
-
- // opens file for read access
-
- //This sequence assigns the MAX_B array work_buffer to the file pointer
- // to be used for its buffering. performance should benefit.
- char * fncopy;
- char * blank;
- FILE *ft;
- int ret;
-
- // checks filesize
- if( *filesize == 0 ){
- perror("Error file size for reading");
- exit(EXIT_FAILURE);
- }
-
- // Trim the file name.
- fncopy = strndup(filename, *length);
- blank = strchr(fncopy, ' ');
- if (blank != NULL) {
- fncopy[blank - fncopy] = '\0';
- }
-
-/*
-//debug checks file size
-// see:
-//https://www.securecoding.cert.org/confluence/display/seccode/FIO19-C.+Do+not+use+fseek()+and+ftell()+to+compute+the+size+of+a+file
- printf("file size: %lld \n",*filesize);
- int fd;
- struct stat stbuf;
- long long size;
- fd = open(fncopy, O_RDONLY);
- if(fd == -1) {
- fprintf(stderr, "Error opening file: %s exiting\n", fncopy);
- exit(-1);
- }
- if( fstat(fd, &stbuf) == 0 ){
- size = stbuf.st_size;
- printf("file size found is: %lld (Bytes) \n",size);
- }
- close(fd);
-*/
-
- // opens file
- //ft = fopen( fncopy, "r+" );
- ft = fopen( fncopy, "rb+" ); // read binary file
- if( ft == NULL ) { perror("fopen"); exit(-1); }
-
-
- // sets mode for full buffering
- work_buffer[*fid] = (char *)malloc(MAX_B);
- ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B );
- if( ret != 0 ){
- perror("Error setting working buffer");
- exit(EXIT_FAILURE);
- }
-
- // stores file index id fid: from 0 to 8
- fp_abs[*fid] = ft;
-
- free(fncopy);
-}
-
-//void
-//FC_FUNC_(open_file_abs_w_fbin,OPEN_FILE_ABS_W_FBIN)(int *fid, char *filename, int *length, int *filesize){
-void open_file_abs_w_fbin(int *fid, char *filename, int *length, long long *filesize){
-
- // opens file for write access
-
- //This sequence assigns the MAX_B array work_buffer to the file pointer
- // to be used for its buffering. performance should benefit.
- char * fncopy;
- char * blank;
- FILE *ft;
- int ret;
-
- // checks filesize
- if( *filesize == 0 ){
- perror("Error file size for writing");
- exit(EXIT_FAILURE);
- }
-
- // Trim the file name.
- fncopy = strndup(filename, *length);
- blank = strchr(fncopy, ' ');
- if (blank != NULL) {
- fncopy[blank - fncopy] = '\0';
- }
-
- // opens file
- //ft = fopen( fncopy, "w+" );
- ft = fopen( fncopy, "wb+" ); // write binary file
- if( ft == NULL ) { perror("fopen"); exit(-1); }
-
- // sets mode for full buffering
- work_buffer[*fid] = (char *)malloc(MAX_B);
- ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B );
- if( ret != 0 ){
- perror("Error setting working buffer");
- exit(EXIT_FAILURE);
- }
-
- // stores file index id fid: from 0 to 8
- fp_abs[*fid] = ft;
-
- free(fncopy);
-
-}
-
-//void
-//FC_FUNC_(close_file_abs_fbin,CLOSE_FILE_ABS_FBIN)(int * fid){
-void close_file_abs_fbin(int * fid){
-
- // closes file
-
- fclose(fp_abs[*fid]);
-
- free(work_buffer[*fid]);
-
-}
-
-//void
-//FC_FUNC_(write_abs_fbin,WRITE_ABS_FBIN)(int *fid, void *buffer, int *length, int *index){
-void write_abs_fbin(int *fid, char *buffer, int *length, int *index){
-
- // writes binary file data in chunks of MAX_B
-
- FILE *ft;
- int itemlen,remlen,donelen,ret;
- void *buf;
-
- // file pointer
- ft = fp_abs[*fid];
-
- donelen = 0;
- remlen = *length;
- buf = buffer;
- ret = 0;
-
-/*
-//debug
- float dat[*length/4];
- memcpy(dat,buffer,*length);
- printf("buffer length: %d %d\n",*length,*index);
- printf("buffer size: %d %d \n",sizeof(dat),sizeof(buffer));
- int i;
- for(i=0;i< 50;i++){
- printf("buffer: %d %e \n",i,dat[i]);
- }
-
- // positions file pointer (for reverse time access)
- // make sure to use 64-bit arithmetic to avoid overflow for very large files
- long long pos,cur;
-
- pos = ((long long)*length) * (*index -1 );
- cur = ftell(ft);
-
- printf("current position: %d %lld %lld \n",*fid,cur,pos);
- ret = fseek(ft, pos , SEEK_SET);
- if ( ret != 0 ) {
- perror("Error fseek");
- exit(EXIT_FAILURE);
- }
- */
-
-
- // writes items of maximum MAX_B to the file
- while (remlen > 0){
-
- itemlen = MIN(remlen,MAX_B);
- // note: we want to write out exactly *itemlen* bytes
- ret = fwrite(buf,1,itemlen,ft);
- if (ret > 0){
- donelen = donelen + ret;
- remlen = remlen - MAX_B;
- buf += MAX_B;
- }
- else{
- remlen = 0;
- }
- }
-
- //debug
- // printf("buffer done length: %d %d\n",donelen,*length);
-}
-
-//void
-//FC_FUNC_(read_abs_fbin,READ_ABS_FBIN)(int *fid, void *buffer, int *length, int *index){
-void read_abs_fbin(int *fid, char *buffer, int *length, int *index){
-
- // reads binary file data in chunks of MAX_B
-
- FILE *ft;
- int ret,itemlen,remlen,donelen;
- long long pos;
- void *buf;
-
- // file pointer
- ft = fp_abs[*fid];
-
- // positions file pointer (for reverse time access)
- // make sure to use 64-bit arithmetic to avoid overflow for very large files
- pos = ((long long)*length) * (*index -1 );
-
- ret = fseek(ft, pos , SEEK_SET);
- if ( ret != 0 ) {
- perror("Error fseek");
- exit(EXIT_FAILURE);
- }
-
- donelen = 0;
- remlen = *length;
- buf = buffer;
- ret = 0;
-
- // cleans buffer
- //memset( buf,0,remlen);
-
- // reads items of maximum MAX_B to the file
- while (remlen > 0){
-
- // checks end of file
- if (ferror(ft) || feof(ft)) return;
-
- itemlen = MIN(remlen,MAX_B);
- ret = fread(buf,1,itemlen,ft);
-
- if (ferror(ft) || feof(ft)) return;
-
- if (ret > 0){
- donelen = donelen + ret;
- remlen = remlen - MAX_B;
- buf += MAX_B;
- }
- else{
- remlen = 0;
- }
- }
-
-/*
-// debug
- printf("position: %lld %d %d \n",pos,*length,*index);
- printf("buffer done length: %d %d\n",donelen,*length);
- float dat[*length/4];
- memcpy(dat,buffer,*length);
- printf("return buffer length: %d %d\n",*length,*index);
- printf("return buffer size: %d %d \n",sizeof(dat),sizeof(buffer));
- int i;
- for(i=0;i< 50;i++){
- printf("return buffer: %d %e \n",i,dat[i]);
- }
-*/
-}
-
-
-
-
-/* ---------------------------------------
-
- IO performance test
-
-
- A Performance Comparison of "read" and "mmap" in the Solaris 8 OS
-
- By Oyetunde Fadele, September 2002
-
- http://developers.sun.com/solaris/articles/read_mmap.html
-
- or
-
- High-performance network programming, Part 2: Speed up processing at both the client and server
-
- by Girish Venkatachalam
-
- http://www.ibm.com/developerworks/aix/library/au-highperform2/
-
-
- - uses functions mmap/memcpy for mapping file I/O
-
- ------------------------------------- */
-
-
-#include <errno.h>
-#include <limits.h>
-#include <sys/mman.h>
-
-// file maps
-static char * map_abs[ABS_FILEID];
-// file descriptors
-static int map_fd_abs[ABS_FILEID];
-// file sizes
-static long long filesize_abs[ABS_FILEID];
-
-//void
-//FC_FUNC_(open_file_abs_w_map,OPEN_FILE_ABS_W_MAP)(int *fid, char *filename, int *length, int *filesize){
-void open_file_abs_w_map(int *fid, char *filename, int *length, long long *filesize){
-
- // opens file for write access
-
- int ft;
- int result;
- char *map;
- char *fncopy;
- char *blank;
-
- // checks filesize
- if( *filesize == 0 ){
- perror("Error file size for writing");
- exit(EXIT_FAILURE);
- }
-
- /*
- // debug check filesize below or above 2 GB
- // filesize gives bytes needed; 4-byte integer limited to +- 2,147,483,648 bytes ~ 2 GB
- float s = *filesize / 1024. / 1024. / 1024.;
- if( s > 2.0 ){
- printf("file size bigger than 2 GB: %lld B or %f GB \n",*filesize,s);
- }
- */
-
- // Trim the file name.
- fncopy = strndup(filename, *length);
- blank = strchr(fncopy, ' ');
- if (blank != NULL) {
- fncopy[blank - fncopy] = '\0';
- }
-
- /* Open a file for writing.
- * - Creating the file if it doesn't exist.
- * - Truncating it to 0 size if it already exists. (not really needed)
- *
- * Note: "O_WRONLY" mode is not sufficient when mmaping.
- */
- ft = open(fncopy, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600);
- if (ft == -1) {
- perror("Error opening file for writing");
- exit(EXIT_FAILURE);
- }
-
- // file index id fid: from 0 to 8
- map_fd_abs[*fid] = ft;
-
- free(fncopy);
-
- /* Stretch the file size to the size of the (mmapped) array of ints
- */
- filesize_abs[*fid] = *filesize;
- result = lseek(ft, filesize_abs[*fid] - 1, SEEK_SET);
- if (result == -1) {
- close(ft);
- perror("Error calling lseek() to 'stretch' the file");
- exit(EXIT_FAILURE);
- }
-
- //printf("file length: %d \n",filesize_abs[*fid]);
-
-
- /* Something needs to be written at the end of the file to
- * have the file actually have the new size.
- * Just writing an empty string at the current file position will do.
- *
- * Note:
- * - The current position in the file is at the end of the stretched
- * file due to the call to lseek().
- * - An empty string is actually a single '\0' character, so a zero-byte
- * will be written at the last byte of the file.
- */
- result = write(ft, "", 1);
- if (result != 1) {
- close(ft);
- perror("Error writing last byte of the file");
- exit(EXIT_FAILURE);
- }
-
- /* Now the file is ready to be mmapped.
- */
- map = mmap(0, filesize_abs[*fid], PROT_READ | PROT_WRITE, MAP_SHARED, ft, 0);
- if (map == MAP_FAILED) {
- close(ft);
- perror("Error mmapping the file");
- exit(EXIT_FAILURE);
- }
-
- map_abs[*fid] = map;
-
- //printf("file map: %d\n",*fid);
-
-}
-
-//void
-//FC_FUNC_(open_file_abs_r_map,OPEN_FILE_ABS_R_MAP)(int *fid, char *filename,int *length, int *filesize){
-void open_file_abs_r_map(int *fid, char *filename,int *length, long long *filesize){
-
- // opens file for read access
- char * fncopy;
- char * blank;
- int ft;
- char *map;
-
- // checks filesize
- if( *filesize == 0 ){
- perror("Error file size for reading");
- exit(EXIT_FAILURE);
- }
-
- // Trim the file name.
- fncopy = strndup(filename, *length);
- blank = strchr(fncopy, ' ');
- if (blank != NULL) {
- fncopy[blank - fncopy] = '\0';
- }
-
-
- ft = open(fncopy, O_RDONLY);
- if (ft == -1) {
- perror("Error opening file for reading");
- exit(EXIT_FAILURE);
- }
-
- // file index id fid: from 0 to 8
- map_fd_abs[*fid] = ft;
-
- free(fncopy);
-
- filesize_abs[*fid] = *filesize;
-
- map = mmap(0, filesize_abs[*fid], PROT_READ, MAP_SHARED, ft, 0);
- if (map == MAP_FAILED) {
- close(ft);
- perror("Error mmapping the file");
- exit(EXIT_FAILURE);
- }
-
- map_abs[*fid] = map;
-
- //printf("file length r: %d \n",filesize_abs[*fid]);
- //printf("file map r: %d\n",*fid);
-
-}
-
-
-//void
-//FC_FUNC_(close_file_abs_map,CLOSE_FILE_ABS_MAP)(int * fid){
-void close_file_abs_map(int * fid){
-
- /* Don't forget to free the mmapped memory
- */
- if (munmap(map_abs[*fid], filesize_abs[*fid]) == -1) {
- perror("Error un-mmapping the file");
- /* Decide here whether to close(fd) and exit() or not. Depends... */
- }
-
- /* Un-mmaping doesn't close the file, so we still need to do that.
- */
- close(map_fd_abs[*fid]);
-}
-
-
-//void
-//FC_FUNC_(write_abs_map,WRITE_ABS_MAP)(int *fid, char *buffer, int *length , int *index){
-void write_abs_map(int *fid, char *buffer, int *length , int *index){
-
- char *map;
- long long offset;
-
- map = map_abs[*fid];
-
- // offset in bytes
- // make sure to use 64-bit arithmetic to avoid overflow for very large files
- offset = ((long long)*index -1 ) * (*length) ;
-
- // copies buffer to map
- memcpy( &map[offset], buffer ,*length );
-
-}
-
-//void
-//FC_FUNC_(read_abs_map,READ_ABS_MAP)(int *fid, char *buffer, int *length , int *index){
-void read_abs_map(int *fid, char *buffer, int *length , int *index){
-
- char *map;
- long long offset;
-
- map = map_abs[*fid];
-
- // offset in bytes
- // make sure to use 64-bit arithmetic to avoid overflow for very large files
- offset = ((long long)*index -1 ) * (*length) ;
-
- // copies map to buffer
- memcpy( buffer, &map[offset], *length );
-
-}
-
-
-/*
-
- wrapper functions
-
- - for your preferred, optimized file i/o ;
- e.g. uncomment // #define USE_MAP... in config.h to use mmap routines
- or comment out (default) to use fopen/fwrite/fread functions
-
- note: mmap functions should work fine for local harddisk directories, but can lead to
- problems with global (e.g. NFS) directories
-
- (on nehalem, Linux 2.6.18-164.11.1.el5 #1 SMP Wed Jan 20 10:04:55 EST 2010 x86_64 x86_64 x86_64 GNU/Linux
- - mmap functions are about 20 % faster than conventional fortran, unformatted file i/o
- - fwrite/fread function are about 12 % faster than conventional fortran, unformatted file i/o )
-
- */
-
-void
-FC_FUNC_(open_file_abs_w,OPEN_FILE_ABS_W)(int *fid, char *filename,int *length, long long *filesize) {
-
-#ifdef USE_MAP_FUNCTION
- open_file_abs_w_map(fid,filename,length,filesize);
-#else
- open_file_abs_w_fbin(fid,filename,length,filesize);
-#endif
-
-}
-
-void
-FC_FUNC_(open_file_abs_r,OPEN_FILE_ABS_R)(int *fid, char *filename,int *length, long long *filesize) {
-
-#ifdef USE_MAP_FUNCTION
- open_file_abs_r_map(fid,filename,length,filesize);
-#else
- open_file_abs_r_fbin(fid,filename,length,filesize);
-#endif
-
-}
-
-void
-FC_FUNC_(close_file_abs,CLOSE_FILES_ABS)(int *fid) {
-
-#ifdef USE_MAP_FUNCTION
- close_file_abs_map(fid);
-#else
- close_file_abs_fbin(fid);
-#endif
-
-}
-
-void
-FC_FUNC_(write_abs,WRITE_ABS)(int *fid, char *buffer, int *length , int *index) {
-
-#ifdef USE_MAP_FUNCTION
- write_abs_map(fid,buffer,length,index);
-#else
- write_abs_fbin(fid,buffer,length,index);
-#endif
-
-}
-
-void
-FC_FUNC_(read_abs,READ_ABS)(int *fid, char *buffer, int *length , int *index) {
-
-#ifdef USE_MAP_FUNCTION
- read_abs_map(fid,buffer,length,index);
-#else
- read_abs_fbin(fid,buffer,length,index);
-#endif
-
-}
-
-
+/*
+!=====================================================================
+!
+! S p e c f e m 3 D V e r s i o n 2 . 0
+! ---------------------------------------
+!
+! Main authors: Dimitri Komatitsch and Jeroen Tromp
+! Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+! April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+*/
+
+// for large files
+#define _FILE_OFFSET_BITS 64
+
+// after Brian's function
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+static int fd;
+
+void
+FC_FUNC_(open_file,OPEN_FILE)(char *file) {
+ /* fprintf(stderr, "Opening file: %s\n", file); */
+ fd = open(file, O_WRONLY | O_CREAT, 0644);
+ if(fd == -1) {
+ fprintf(stderr, "Error opening file: %s exiting\n", file);
+ exit(-1);
+ }
+}
+
+void
+FC_FUNC_(close_file,CLOSE_FILE)() {
+ /* fprintf(stderr, "Closing file\n"); */
+ close(fd);
+}
+
+void
+FC_FUNC_(write_integer,WRITE_INTEGER)(int *z) {
+ int dummy_unused_variable = write(fd, z, sizeof(int));
+}
+
+void
+FC_FUNC_(write_real,WRITE_REAL)(float *z) {
+ int dummy_unused_variable = write(fd, z, sizeof(float));
+}
+
+
+/* ---------------------------------------
+
+ IO performance test
+
+ Software Optimization for High Performance Computing: Creating Faster Applications
+
+ By Isom L. Crawford and Kevin R. Wadleigh
+ Jul 18, 2003
+
+ - uses functions fopen/fread/fwrite for binary file I/O
+
+ --------------------------------------- */
+
+#define __USE_GNU
+#include <string.h>
+#include <regex.h>
+
+#define MIN(x,y) ((x) < (y) ? (x) : (y))
+
+/* fastest performance on nehalem nodes:
+
+ Linux 2.6.18-164.11.1.el5 #1 SMP Wed Jan 20 10:04:55 EST 2010 x86_64 x86_64 x86_64 GNU/Linux
+
+ achieved with 16 KB buffers: */
+
+//#define MAX_B 65536 // 64 KB
+//#define MAX_B 32768 // 32 KB
+#define MAX_B 16384 // 16 KB
+//#define MAX_B 8192 // 8 KB
+
+// absorbing files: instead of passing file descriptor, we use the array index
+// index 0 for elastic domain file
+// index 1 for acoustic domain file
+// (reserved, but unused yet) index 2 - for NOISE_TOMOGRAPHY (SURFACE_MOVIE)
+#define ABS_FILEID 3 // number of absorbing files, note: in C we start indexing arrays from 0
+
+// file points
+static FILE * fp_abs[ABS_FILEID];
+// file work buffers
+static char * work_buffer[ABS_FILEID];
+
+
+//void
+//FC_FUNC_(open_file_abs_r_fbin,OPEN_FILE_ABS_R_FBIN)(int *fid, char *filename,int *length, int *filesize){
+void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *filesize){
+
+ // opens file for read access
+
+ //This sequence assigns the MAX_B array work_buffer to the file pointer
+ // to be used for its buffering. performance should benefit.
+ char * fncopy;
+ char * blank;
+ FILE *ft;
+ int ret;
+
+ // checks filesize
+ if( *filesize == 0 ){
+ perror("Error file size for reading");
+ exit(EXIT_FAILURE);
+ }
+
+ // Trim the file name.
+ fncopy = strndup(filename, *length);
+ blank = strchr(fncopy, ' ');
+ if (blank != NULL) {
+ fncopy[blank - fncopy] = '\0';
+ }
+
+/*
+//debug checks file size
+// see:
+//https://www.securecoding.cert.org/confluence/display/seccode/FIO19-C.+Do+not+use+fseek()+and+ftell()+to+compute+the+size+of+a+file
+ printf("file size: %lld \n",*filesize);
+ int fd;
+ struct stat stbuf;
+ long long size;
+ fd = open(fncopy, O_RDONLY);
+ if(fd == -1) {
+ fprintf(stderr, "Error opening file: %s exiting\n", fncopy);
+ exit(-1);
+ }
+ if( fstat(fd, &stbuf) == 0 ){
+ size = stbuf.st_size;
+ printf("file size found is: %lld (Bytes) \n",size);
+ }
+ close(fd);
+*/
+
+ // opens file
+ //ft = fopen( fncopy, "r+" );
+ ft = fopen( fncopy, "rb+" ); // read binary file
+ if( ft == NULL ) { perror("fopen"); exit(-1); }
+
+
+ // sets mode for full buffering
+ work_buffer[*fid] = (char *)malloc(MAX_B);
+ ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B );
+ if( ret != 0 ){
+ perror("Error setting working buffer");
+ exit(EXIT_FAILURE);
+ }
+
+ // stores file index id fid: from 0 to 8
+ fp_abs[*fid] = ft;
+
+ free(fncopy);
+}
+
+//void
+//FC_FUNC_(open_file_abs_w_fbin,OPEN_FILE_ABS_W_FBIN)(int *fid, char *filename, int *length, int *filesize){
+void open_file_abs_w_fbin(int *fid, char *filename, int *length, long long *filesize){
+
+ // opens file for write access
+
+ //This sequence assigns the MAX_B array work_buffer to the file pointer
+ // to be used for its buffering. performance should benefit.
+ char * fncopy;
+ char * blank;
+ FILE *ft;
+ int ret;
+
+ // checks filesize
+ if( *filesize == 0 ){
+ perror("Error file size for writing");
+ exit(EXIT_FAILURE);
+ }
+
+ // Trim the file name.
+ fncopy = strndup(filename, *length);
+ blank = strchr(fncopy, ' ');
+ if (blank != NULL) {
+ fncopy[blank - fncopy] = '\0';
+ }
+
+ // opens file
+ //ft = fopen( fncopy, "w+" );
+ ft = fopen( fncopy, "wb+" ); // write binary file
+ if( ft == NULL ) { perror("fopen"); exit(-1); }
+
+ // sets mode for full buffering
+ work_buffer[*fid] = (char *)malloc(MAX_B);
+ ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B );
+ if( ret != 0 ){
+ perror("Error setting working buffer");
+ exit(EXIT_FAILURE);
+ }
+
+ // stores file index id fid: from 0 to 8
+ fp_abs[*fid] = ft;
+
+ free(fncopy);
+
+}
+
+//void
+//FC_FUNC_(close_file_abs_fbin,CLOSE_FILE_ABS_FBIN)(int * fid){
+void close_file_abs_fbin(int * fid){
+
+ // closes file
+
+ fclose(fp_abs[*fid]);
+
+ free(work_buffer[*fid]);
+
+}
+
+//void
+//FC_FUNC_(write_abs_fbin,WRITE_ABS_FBIN)(int *fid, void *buffer, int *length, int *index){
+void write_abs_fbin(int *fid, char *buffer, int *length, int *index){
+
+ // writes binary file data in chunks of MAX_B
+
+ FILE *ft;
+ int itemlen,remlen,donelen,ret;
+ void *buf;
+
+ // file pointer
+ ft = fp_abs[*fid];
+
+ donelen = 0;
+ remlen = *length;
+ buf = buffer;
+ ret = 0;
+
+/*
+//debug
+ float dat[*length/4];
+ memcpy(dat,buffer,*length);
+ printf("buffer length: %d %d\n",*length,*index);
+ printf("buffer size: %d %d \n",sizeof(dat),sizeof(buffer));
+ int i;
+ for(i=0;i< 50;i++){
+ printf("buffer: %d %e \n",i,dat[i]);
+ }
+
+ // positions file pointer (for reverse time access)
+ // make sure to use 64-bit arithmetic to avoid overflow for very large files
+ long long pos,cur;
+
+ pos = ((long long)*length) * (*index -1 );
+ cur = ftell(ft);
+
+ printf("current position: %d %lld %lld \n",*fid,cur,pos);
+ ret = fseek(ft, pos , SEEK_SET);
+ if ( ret != 0 ) {
+ perror("Error fseek");
+ exit(EXIT_FAILURE);
+ }
+ */
+
+
+ // writes items of maximum MAX_B to the file
+ while (remlen > 0){
+
+ itemlen = MIN(remlen,MAX_B);
+ // note: we want to write out exactly *itemlen* bytes
+ ret = fwrite(buf,1,itemlen,ft);
+ if (ret > 0){
+ donelen = donelen + ret;
+ remlen = remlen - MAX_B;
+ buf += MAX_B;
+ }
+ else{
+ remlen = 0;
+ }
+ }
+
+ //debug
+ // printf("buffer done length: %d %d\n",donelen,*length);
+}
+
+//void
+//FC_FUNC_(read_abs_fbin,READ_ABS_FBIN)(int *fid, void *buffer, int *length, int *index){
+void read_abs_fbin(int *fid, char *buffer, int *length, int *index){
+
+ // reads binary file data in chunks of MAX_B
+
+ FILE *ft;
+ int ret,itemlen,remlen,donelen;
+ long long pos;
+ void *buf;
+
+ // file pointer
+ ft = fp_abs[*fid];
+
+ // positions file pointer (for reverse time access)
+ // make sure to use 64-bit arithmetic to avoid overflow for very large files
+ pos = ((long long)*length) * (*index -1 );
+
+ ret = fseek(ft, pos , SEEK_SET);
+ if ( ret != 0 ) {
+ perror("Error fseek");
+ exit(EXIT_FAILURE);
+ }
+
+ donelen = 0;
+ remlen = *length;
+ buf = buffer;
+ ret = 0;
+
+ // cleans buffer
+ //memset( buf,0,remlen);
+
+ // reads items of maximum MAX_B to the file
+ while (remlen > 0){
+
+ // checks end of file
+ if (ferror(ft) || feof(ft)) return;
+
+ itemlen = MIN(remlen,MAX_B);
+ ret = fread(buf,1,itemlen,ft);
+
+ if (ferror(ft) || feof(ft)) return;
+
+ if (ret > 0){
+ donelen = donelen + ret;
+ remlen = remlen - MAX_B;
+ buf += MAX_B;
+ }
+ else{
+ remlen = 0;
+ }
+ }
+
+/*
+// debug
+ printf("position: %lld %d %d \n",pos,*length,*index);
+ printf("buffer done length: %d %d\n",donelen,*length);
+ float dat[*length/4];
+ memcpy(dat,buffer,*length);
+ printf("return buffer length: %d %d\n",*length,*index);
+ printf("return buffer size: %d %d \n",sizeof(dat),sizeof(buffer));
+ int i;
+ for(i=0;i< 50;i++){
+ printf("return buffer: %d %e \n",i,dat[i]);
+ }
+*/
+}
+
+
+
+
+/* ---------------------------------------
+
+ IO performance test
+
+
+ A Performance Comparison of "read" and "mmap" in the Solaris 8 OS
+
+ By Oyetunde Fadele, September 2002
+
+ http://developers.sun.com/solaris/articles/read_mmap.html
+
+ or
+
+ High-performance network programming, Part 2: Speed up processing at both the client and server
+
+ by Girish Venkatachalam
+
+ http://www.ibm.com/developerworks/aix/library/au-highperform2/
+
+
+ - uses functions mmap/memcpy for mapping file I/O
+
+ ------------------------------------- */
+
+
+#include <errno.h>
+#include <limits.h>
+#include <sys/mman.h>
+
+// file maps
+static char * map_abs[ABS_FILEID];
+// file descriptors
+static int map_fd_abs[ABS_FILEID];
+// file sizes
+static long long filesize_abs[ABS_FILEID];
+
+//void
+//FC_FUNC_(open_file_abs_w_map,OPEN_FILE_ABS_W_MAP)(int *fid, char *filename, int *length, int *filesize){
+void open_file_abs_w_map(int *fid, char *filename, int *length, long long *filesize){
+
+ // opens file for write access
+
+ int ft;
+ int result;
+ char *map;
+ char *fncopy;
+ char *blank;
+
+ // checks filesize
+ if( *filesize == 0 ){
+ perror("Error file size for writing");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ // debug check filesize below or above 2 GB
+ // filesize gives bytes needed; 4-byte integer limited to +- 2,147,483,648 bytes ~ 2 GB
+ float s = *filesize / 1024. / 1024. / 1024.;
+ if( s > 2.0 ){
+ printf("file size bigger than 2 GB: %lld B or %f GB \n",*filesize,s);
+ }
+ */
+
+ // Trim the file name.
+ fncopy = strndup(filename, *length);
+ blank = strchr(fncopy, ' ');
+ if (blank != NULL) {
+ fncopy[blank - fncopy] = '\0';
+ }
+
+ /* Open a file for writing.
+ * - Creating the file if it doesn't exist.
+ * - Truncating it to 0 size if it already exists. (not really needed)
+ *
+ * Note: "O_WRONLY" mode is not sufficient when mmaping.
+ */
+ ft = open(fncopy, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600);
+ if (ft == -1) {
+ perror("Error opening file for writing");
+ exit(EXIT_FAILURE);
+ }
+
+ // file index id fid: from 0 to 8
+ map_fd_abs[*fid] = ft;
+
+ free(fncopy);
+
+ /* Stretch the file size to the size of the (mmapped) array of ints
+ */
+ filesize_abs[*fid] = *filesize;
+ result = lseek(ft, filesize_abs[*fid] - 1, SEEK_SET);
+ if (result == -1) {
+ close(ft);
+ perror("Error calling lseek() to 'stretch' the file");
+ exit(EXIT_FAILURE);
+ }
+
+ //printf("file length: %d \n",filesize_abs[*fid]);
+
+
+ /* Something needs to be written at the end of the file to
+ * have the file actually have the new size.
+ * Just writing an empty string at the current file position will do.
+ *
+ * Note:
+ * - The current position in the file is at the end of the stretched
+ * file due to the call to lseek().
+ * - An empty string is actually a single '\0' character, so a zero-byte
+ * will be written at the last byte of the file.
+ */
+ result = write(ft, "", 1);
+ if (result != 1) {
+ close(ft);
+ perror("Error writing last byte of the file");
+ exit(EXIT_FAILURE);
+ }
+
+ /* Now the file is ready to be mmapped.
+ */
+ map = mmap(0, filesize_abs[*fid], PROT_READ | PROT_WRITE, MAP_SHARED, ft, 0);
+ if (map == MAP_FAILED) {
+ close(ft);
+ perror("Error mmapping the file");
+ exit(EXIT_FAILURE);
+ }
+
+ map_abs[*fid] = map;
+
+ //printf("file map: %d\n",*fid);
+
+}
+
+//void
+//FC_FUNC_(open_file_abs_r_map,OPEN_FILE_ABS_R_MAP)(int *fid, char *filename,int *length, int *filesize){
+void open_file_abs_r_map(int *fid, char *filename,int *length, long long *filesize){
+
+ // opens file for read access
+ char * fncopy;
+ char * blank;
+ int ft;
+ char *map;
+
+ // checks filesize
+ if( *filesize == 0 ){
+ perror("Error file size for reading");
+ exit(EXIT_FAILURE);
+ }
+
+ // Trim the file name.
+ fncopy = strndup(filename, *length);
+ blank = strchr(fncopy, ' ');
+ if (blank != NULL) {
+ fncopy[blank - fncopy] = '\0';
+ }
+
+
+ ft = open(fncopy, O_RDONLY);
+ if (ft == -1) {
+ perror("Error opening file for reading");
+ exit(EXIT_FAILURE);
+ }
+
+ // file index id fid: from 0 to 8
+ map_fd_abs[*fid] = ft;
+
+ free(fncopy);
+
+ filesize_abs[*fid] = *filesize;
+
+ map = mmap(0, filesize_abs[*fid], PROT_READ, MAP_SHARED, ft, 0);
+ if (map == MAP_FAILED) {
+ close(ft);
+ perror("Error mmapping the file");
+ exit(EXIT_FAILURE);
+ }
+
+ map_abs[*fid] = map;
+
+ //printf("file length r: %d \n",filesize_abs[*fid]);
+ //printf("file map r: %d\n",*fid);
+
+}
+
+
+//void
+//FC_FUNC_(close_file_abs_map,CLOSE_FILE_ABS_MAP)(int * fid){
+void close_file_abs_map(int * fid){
+
+ /* Don't forget to free the mmapped memory
+ */
+ if (munmap(map_abs[*fid], filesize_abs[*fid]) == -1) {
+ perror("Error un-mmapping the file");
+ /* Decide here whether to close(fd) and exit() or not. Depends... */
+ }
+
+ /* Un-mmaping doesn't close the file, so we still need to do that.
+ */
+ close(map_fd_abs[*fid]);
+}
+
+
+//void
+//FC_FUNC_(write_abs_map,WRITE_ABS_MAP)(int *fid, char *buffer, int *length , int *index){
+void write_abs_map(int *fid, char *buffer, int *length , int *index){
+
+ char *map;
+ long long offset;
+
+ map = map_abs[*fid];
+
+ // offset in bytes
+ // make sure to use 64-bit arithmetic to avoid overflow for very large files
+ offset = ((long long)*index -1 ) * (*length) ;
+
+ // copies buffer to map
+ memcpy( &map[offset], buffer ,*length );
+
+}
+
+//void
+//FC_FUNC_(read_abs_map,READ_ABS_MAP)(int *fid, char *buffer, int *length , int *index){
+void read_abs_map(int *fid, char *buffer, int *length , int *index){
+
+ char *map;
+ long long offset;
+
+ map = map_abs[*fid];
+
+ // offset in bytes
+ // make sure to use 64-bit arithmetic to avoid overflow for very large files
+ offset = ((long long)*index -1 ) * (*length) ;
+
+ // copies map to buffer
+ memcpy( buffer, &map[offset], *length );
+
+}
+
+
+/*
+
+ wrapper functions
+
+ - for your preferred, optimized file i/o ;
+ e.g. uncomment // #define USE_MAP... in config.h to use mmap routines
+ or comment out (default) to use fopen/fwrite/fread functions
+
+ note: mmap functions should work fine for local harddisk directories, but can lead to
+ problems with global (e.g. NFS) directories
+
+ (on nehalem, Linux 2.6.18-164.11.1.el5 #1 SMP Wed Jan 20 10:04:55 EST 2010 x86_64 x86_64 x86_64 GNU/Linux
+ - mmap functions are about 20 % faster than conventional fortran, unformatted file i/o
+ - fwrite/fread function are about 12 % faster than conventional fortran, unformatted file i/o )
+
+ */
+
+void
+FC_FUNC_(open_file_abs_w,OPEN_FILE_ABS_W)(int *fid, char *filename,int *length, long long *filesize) {
+
+#ifdef USE_MAP_FUNCTION
+ open_file_abs_w_map(fid,filename,length,filesize);
+#else
+ open_file_abs_w_fbin(fid,filename,length,filesize);
+#endif
+
+}
+
+void
+FC_FUNC_(open_file_abs_r,OPEN_FILE_ABS_R)(int *fid, char *filename,int *length, long long *filesize) {
+
+#ifdef USE_MAP_FUNCTION
+ open_file_abs_r_map(fid,filename,length,filesize);
+#else
+ open_file_abs_r_fbin(fid,filename,length,filesize);
+#endif
+
+}
+
+void
+FC_FUNC_(close_file_abs,CLOSE_FILES_ABS)(int *fid) {
+
+#ifdef USE_MAP_FUNCTION
+ close_file_abs_map(fid);
+#else
+ close_file_abs_fbin(fid);
+#endif
+
+}
+
+void
+FC_FUNC_(write_abs,WRITE_ABS)(int *fid, char *buffer, int *length , int *index) {
+
+#ifdef USE_MAP_FUNCTION
+ write_abs_map(fid,buffer,length,index);
+#else
+ write_abs_fbin(fid,buffer,length,index);
+#endif
+
+}
+
+void
+FC_FUNC_(read_abs,READ_ABS)(int *fid, char *buffer, int *length , int *index) {
+
+#ifdef USE_MAP_FUNCTION
+ read_abs_map(fid,buffer,length,index);
+#else
+ read_abs_fbin(fid,buffer,length,index);
+#endif
+
+}
+
+
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-11-17 23:51:03 UTC (rev 19213)
@@ -860,6 +860,14 @@
write(IMAIN,*)
endif
+ ! initializes GPU and outputs info to files for all processes
+ call prepare_cuda_device(myrank,ncuda_devices)
+
+ ! collects min/max of local devices found for statistics
+ call sync_all()
+ call min_all_i(ncuda_devices,ncuda_devices_min)
+ call max_all_i(ncuda_devices,ncuda_devices_max)
+
! prepares general fields on GPU
call prepare_constants_device(Mesh_pointer, &
NGLLX, NSPEC_AB, NGLOB_AB, &
@@ -881,11 +889,9 @@
number_receiver_global, ispec_selected_rec, &
nrec, nrec_local, &
SIMULATION_TYPE, &
- USE_MESH_COLORING_GPU,nspec_acoustic,nspec_elastic, &
- myrank,ncuda_devices)
+ USE_MESH_COLORING_GPU, &
+ nspec_acoustic,nspec_elastic)
- call min_all_i(ncuda_devices,ncuda_devices_min)
- call max_all_i(ncuda_devices,ncuda_devices_max)
! prepares fields on GPU for acoustic simulations
if( ACOUSTIC_SIMULATION ) then
@@ -1014,5 +1020,4 @@
write(IMAIN,*)
endif
-
end subroutine prepare_timerun_GPU
More information about the CIG-COMMITS
mailing list