[cig-commits] r22780 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER: . EXAMPLES/regional_Greece_small src/cuda src/specfem3D utils
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Mon Sep 9 08:42:04 PDT 2013
Author: danielpeter
Date: 2013-09-09 08:42:03 -0700 (Mon, 09 Sep 2013)
New Revision: 22780
Added:
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/utils/check_cuda_device.cu
Removed:
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
Modified:
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/xcombine_vol_data.sh
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/Makefile.in
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/rules.mk
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_output_SAC.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90
Log:
updates cuda routines; renames update_displacement_cuda.cu
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/xcombine_vol_data.sh
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/xcombine_vol_data.sh 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/xcombine_vol_data.sh 2013-09-09 15:42:03 UTC (rev 22780)
@@ -14,7 +14,7 @@
res="1"
# for visualization
-cp ../../UTILS/Visualization/Paraview/AVS_continent_boundaries.inp ./
+cp ../../utils/Visualization/VTK_ParaView/AVS_continent_boundaries.inp ./
echo
echo "alpha_kernel"
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/Makefile.in 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/Makefile.in 2013-09-09 15:42:03 UTC (rev 22780)
@@ -99,10 +99,10 @@
# CUDA flags and linking
@COND_CUDA_TRUE at NVCC_FLAGS_BASE = $(CUDA_INC) $(MPI_INC)
- at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -dc -DCUDA $(GENCODE)
- at COND_CUDA_TRUE@@COND_CUDA5_FALSE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -DCUDA -DUSE_OLDER_CUDA4_GPU $(GENCODE)
+ at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -dc $(GENCODE)
+ at COND_CUDA_TRUE@@COND_CUDA5_FALSE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -DUSE_OLDER_CUDA4_GPU $(GENCODE)
- at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCCLINK_BASE = $(NVCC) $(CUDA_INC) $(MPI_INC) -DCUDA
+ at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCCLINK_BASE = $(NVCC) $(CUDA_INC) $(MPI_INC)
@COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCCLINK = $(NVCCLINK_BASE) -dlink $(GENCODE)
@COND_CUDA_TRUE@@COND_CUDA5_FALSE at NVCCLINK = $(NVCCLINK_BASE) -DUSE_OLDER_CUDA4_GPU $(GENCODE)
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu 2013-09-09 15:42:03 UTC (rev 22780)
@@ -715,7 +715,7 @@
realw* epsilon_trace_over_3,
int SIMULATION_TYPE,
int ATTENUATION,
- int USE_ATTENUATION_MIMIC,
+ int PARTIAL_PHYS_DISPERSION_ONLY,
int USE_3D_ATTENUATION_ARRAYS,
realw* one_minus_sum_beta,realw* factor_common,
realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
@@ -1056,7 +1056,7 @@
}
} // ! end of test whether isotropic or anisotropic element
- if(ATTENUATION && (! USE_ATTENUATION_MIMIC ) ){
+ if(ATTENUATION && (! PARTIAL_PHYS_DISPERSION_ONLY ) ){
// subtracts memory variables if attenuation
compute_element_cm_att_stress(tx,working_element,
R_xx,R_yy,R_xy,R_xz,R_yz,
@@ -1255,7 +1255,7 @@
#endif // MESH_COLORING
// update memory variables based upon the Runge-Kutta scheme
- if( ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
+ if( ATTENUATION && ( ! PARTIAL_PHYS_DISPERSION_ONLY ) ){
compute_element_cm_att_memory(tx,working_element,
d_muvstore,
@@ -1377,7 +1377,7 @@
d_epsilon_trace_over_3,
mp->simulation_type,
mp->attenuation,
- mp->use_attenuation_mimic,
+ mp->partial_phys_dispersion_only,
mp->use_3d_attenuation_arrays,
d_one_minus_sum_beta,d_factor_common,
d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
@@ -1427,7 +1427,7 @@
d_b_epsilon_trace_over_3,
mp->simulation_type,
mp->attenuation,
- mp->use_attenuation_mimic,
+ mp->partial_phys_dispersion_only,
mp->use_3d_attenuation_arrays,
d_one_minus_sum_beta,d_factor_common,
d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu 2013-09-09 15:42:03 UTC (rev 22780)
@@ -328,7 +328,7 @@
realw* epsilon_trace_over_3,
int SIMULATION_TYPE,
int ATTENUATION,
- int USE_ATTENUATION_MIMIC,
+ int PARTIAL_PHYS_DISPERSION_ONLY,
int USE_3D_ATTENUATION_ARRAYS,
realw* one_minus_sum_beta,realw* factor_common,
realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
@@ -678,7 +678,7 @@
sigma_yz = mul*duzdyl_plus_duydzl;
}
- if(ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
+ if(ATTENUATION && ( ! PARTIAL_PHYS_DISPERSION_ONLY ) ){
// subtracts memory variables if attenuation
compute_element_ic_att_stress(tx,working_element,
R_xx,R_yy,R_xy,R_xz,R_yz,
@@ -891,7 +891,7 @@
#endif // MESH_COLORING
// update memory variables based upon the Runge-Kutta scheme
- if( ATTENUATION && ! USE_ATTENUATION_MIMIC ){
+ if( ATTENUATION && ! PARTIAL_PHYS_DISPERSION_ONLY ){
compute_element_ic_att_memory(tx,working_element,
d_muv,
factor_common,alphaval,betaval,gammaval,
@@ -1005,7 +1005,7 @@
d_epsilon_trace_over_3,
mp->simulation_type,
mp->attenuation,
- mp->use_attenuation_mimic,
+ mp->partial_phys_dispersion_only,
mp->use_3d_attenuation_arrays,
d_one_minus_sum_beta,
d_factor_common,
@@ -1053,7 +1053,7 @@
d_b_epsilon_trace_over_3,
mp->simulation_type,
mp->attenuation,
- mp->use_attenuation_mimic,
+ mp->partial_phys_dispersion_only,
mp->use_3d_attenuation_arrays,
d_one_minus_sum_beta,
d_factor_common,
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu 2013-09-09 15:42:03 UTC (rev 22780)
@@ -30,10 +30,6 @@
#include <cuda.h>
#include <cublas.h>
-#ifdef WITH_MPI
-#include <mpi.h>
-#endif
-
#include <sys/time.h>
#include <sys/resource.h>
@@ -115,7 +111,26 @@
//MPI_Barrier(MPI_COMM_WORLD);
// sets active device
+#ifdef CUDA_DEVICE_ID
+ // uses fixed device id when compile with e.g.: -DCUDA_DEVICE_ID=1
+ device = CUDA_DEVICE_ID;
+ if(myrank == 0 ) printf("setting cuda devices with id = %d for all processes by -DCUDA_DEVICE_ID\n\n",device);
+
+ cudaSetDevice( device );
+ exit_on_cuda_error("cudaSetDevice has invalid device");
+
+ // double check that device was properly selected
+ cudaGetDevice(&device);
+ if( device != CUDA_DEVICE_ID ){
+ printf("error rank: %d devices: %d \n",myrank,device_count);
+ printf(" cudaSetDevice()=%d\n cudaGetDevice()=%d\n",CUDA_DEVICE_ID,device);
+ exit_on_error("CUDA set/get device error: device id conflict \n");
+ }
+#else
+ // device changes for different mpi processes according to number of device per node
+ // (assumes that number of devices per node is the same for different compute nodes)
device = myrank % device_count;
+
cudaSetDevice( device );
exit_on_cuda_error("cudaSetDevice has invalid device");
@@ -126,6 +141,7 @@
printf(" cudaSetDevice()=%d\n cudaGetDevice()=%d\n",myrank%device_count,device);
exit_on_error("CUDA set/get device error: device id conflict \n");
}
+#endif
}
// returns a handle to the active device
Deleted: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu 2013-09-09 15:42:03 UTC (rev 22780)
@@ -1,611 +0,0 @@
-/*
- !=====================================================================
- !
- ! S p e c f e m 3 D V e r s i o n 2 . 0
- ! ---------------------------------------
- !
- ! Main authors: Dimitri Komatitsch and Jeroen Tromp
- ! Princeton University, USA and University of Pau / CNRS / INRIA
- ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
- ! August 2013
- !
- ! This program is free software; you can redistribute it and/or modify
- ! it under the terms of the GNU General Public License as published by
- ! the Free Software Foundation; either version 2 of the License, or
- ! (at your option) any later version.
- !
- ! This program is distributed in the hope that it will be useful,
- ! but WITHOUT ANY WARRANTY; without even the implied warranty of
- ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ! GNU General Public License for more details.
- !
- ! You should have received a copy of the GNU General Public License along
- ! with this program; if not, write to the Free Software Foundation, Inc.,
- ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- !
- !=====================================================================
- */
-
-#include <stdio.h>
-#include <cuda.h>
-
-#include "config.h"
-#include "mesh_constants_cuda.h"
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// elastic wavefield
-
-// KERNEL 1
-/* ----------------------------------------------------------------------------------------------- */
-
-
-__global__ void UpdateDispVeloc_kernel(realw* displ,
- realw* veloc,
- realw* accel,
- int size,
- realw deltat,
- realw deltatsqover2,
- realw deltatover2) {
- int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
- if(id < size) {
- displ[id] = displ[id] + deltat*veloc[id] + deltatsqover2*accel[id];
- veloc[id] = veloc[id] + deltatover2*accel[id];
- accel[id] = 0.0f; // can do this using memset...not sure if faster
- }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// KERNEL 1
-// inner core
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(it_update_displacement_ic_cuda,
- IT_UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {
-
-TRACE("it_update_displacement_ic_cuda");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-
- int size = NDIM * mp->NGLOB_INNER_CORE;
-
- realw deltat = *deltat_F;
- realw deltatsqover2 = *deltatsqover2_F;
- realw deltatover2 = *deltatover2_F;
-
- int blocksize = BLOCKSIZE_KERNEL1;
- int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
-
- if( *FORWARD_OR_ADJOINT == 1 ){
- //launch kernel
- UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
- mp->d_veloc_inner_core,
- mp->d_accel_inner_core,
- size,deltat,deltatsqover2,deltatover2);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- // kernel for backward fields
- UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
- mp->d_b_veloc_inner_core,
- mp->d_b_accel_inner_core,
- size,deltat,deltatsqover2,deltatover2);
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("it_update_displacement_ic_cuda");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// KERNEL 1
-// crust/mantle
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(it_update_displacement_cm_cuda,
- IT_UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {
-
- TRACE("it_update_displacement_cm_cuda");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-
- int size = NDIM * mp->NGLOB_CRUST_MANTLE;
-
- realw deltat = *deltat_F;
- realw deltatsqover2 = *deltatsqover2_F;
- realw deltatover2 = *deltatover2_F;
-
- int blocksize = BLOCKSIZE_KERNEL1;
- int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
-
- if( *FORWARD_OR_ADJOINT == 1 ){
- //launch kernel
- UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
- mp->d_veloc_crust_mantle,
- mp->d_accel_crust_mantle,
- size,deltat,deltatsqover2,deltatover2);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- // kernel for backward fields
- UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
- mp->d_b_veloc_crust_mantle,
- mp->d_b_accel_crust_mantle,
- size,deltat,deltatsqover2,deltatover2);
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("it_update_displacement_cm_cuda");
-#endif
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// acoustic wavefield
-
-// KERNEL 1
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void UpdatePotential_kernel(realw* potential_acoustic,
- realw* potential_dot_acoustic,
- realw* potential_dot_dot_acoustic,
- int size,
- realw deltat,
- realw deltatsqover2,
- realw deltatover2) {
- int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
- if(id < size) {
- potential_acoustic[id] = potential_acoustic[id]
- + deltat*potential_dot_acoustic[id]
- + deltatsqover2*potential_dot_dot_acoustic[id];
-
- potential_dot_acoustic[id] = potential_dot_acoustic[id]
- + deltatover2*potential_dot_dot_acoustic[id];
-
- potential_dot_dot_acoustic[id] = 0.0f;
- }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// KERNEL 1
-// outer core
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(it_update_displacement_oc_cuda,
- IT_UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {
-
- TRACE("it_update_displacement_oc_cuda");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-
- int size = mp->NGLOB_OUTER_CORE;
-
- realw deltat = *deltat_F;
- realw deltatsqover2 = *deltatsqover2_F;
- realw deltatover2 = *deltatover2_F;
-
- int blocksize = BLOCKSIZE_KERNEL1;
- int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
-
- if( *FORWARD_OR_ADJOINT == 1 ){
- //launch kernel
- UpdatePotential_kernel<<<grid,threads>>>(mp->d_displ_outer_core,
- mp->d_veloc_outer_core,
- mp->d_accel_outer_core,
- size,deltat,deltatsqover2,deltatover2);
- }else if( *FORWARD_OR_ADJOINT == 1 ){
- UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,
- mp->d_b_veloc_outer_core,
- mp->d_b_accel_outer_core,
- size,deltat,deltatsqover2,deltatover2);
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("it_update_displacement_oc_cuda");
-#endif
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// KERNEL 3
-//
-// crust/mantle and inner core regions
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void kernel_3_cuda_device(realw* veloc,
- realw* accel,
- int size,
- realw deltatover2,
- realw two_omega_earth,
- realw* rmassx,
- realw* rmassy,
- realw* rmassz) {
- int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
- if(id < size) {
- // note: update adds rotational acceleration in case two_omega_earth is non-zero
- accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
- accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
- accel[3*id+2] = accel[3*id+2]*rmassz[id];
-
- veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
- veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
- veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
- }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void kernel_3_accel_cuda_device(realw* accel,
- realw* veloc,
- int size,
- realw two_omega_earth,
- realw* rmassx,
- realw* rmassy,
- realw* rmassz) {
- int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
- if(id < size) {
- // note: update adds rotational acceleration in case two_omega_earth is non-zero
- accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
- accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
- accel[3*id+2] = accel[3*id+2]*rmassz[id];
- }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void kernel_3_veloc_cuda_device(realw* veloc,
- realw* accel,
- int size,
- realw deltatover2) {
- int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
- if(id < size) {
- veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
- veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
- veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
- }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(update_accel_3_a_cuda,
- UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* NCHUNKS_VAL,
- int* FORWARD_OR_ADJOINT) {
- TRACE("update_accel_3_a_cuda");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-
- realw deltatover2 = *deltatover2_F;
-
- int blocksize = BLOCKSIZE_KERNEL3;
- int size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
-
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
-
- // crust/mantle region only
- // check whether we can update accel and veloc, or only accel at this point
- if( mp->oceans == 0 ){
- // updates both, accel and veloc
-
- if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
- // uses corrected mass matrices rmassx,rmassy,rmassz
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
- mp->d_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2,
- mp->two_omega_earth,
- mp->d_rmassx_crust_mantle,
- mp->d_rmassy_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
- mp->d_b_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2,
- mp->b_two_omega_earth,
- mp->d_rmassx_crust_mantle,
- mp->d_rmassy_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }
- }else{
- // uses only rmassz
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
- mp->d_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2,
- mp->two_omega_earth,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
- mp->d_b_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2,
- mp->b_two_omega_earth,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }
- }
-
- }else{
- // updates only accel
-
- if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
- // uses corrected mass matrices
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
- mp->d_veloc_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- mp->two_omega_earth,
- mp->d_rmassx_crust_mantle,
- mp->d_rmassy_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
- mp->d_b_veloc_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- mp->b_two_omega_earth,
- mp->d_rmassx_crust_mantle,
- mp->d_rmassy_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }
- }else{
- // uses only rmassz
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
- mp->d_veloc_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- mp->two_omega_earth,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
- mp->d_b_veloc_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- mp->b_two_omega_earth,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }
- }
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
- exit_on_cuda_error("after update_accel_3_a");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(update_veloc_3_b_cuda,
- UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {
- TRACE("update_veloc_3_b_cuda");
- int size_padded,num_blocks_x,num_blocks_y;
-
- Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-
- realw deltatover2 = *deltatover2_F;
-
- int blocksize = BLOCKSIZE_KERNEL3;
-
- // crust/mantle region
- // in case of ocean loads, we still have to update the velocity for crust/mantle region
- if( mp->oceans ){
- size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid1(num_blocks_x,num_blocks_y);
- dim3 threads1(blocksize,1,1);
-
- // updates only veloc at this point
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_veloc_crust_mantle,
- mp->d_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_b_veloc_crust_mantle,
- mp->d_b_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2);
- }
- }
-
- // inner core region
- size_padded = ((int)ceil(((double)mp->NGLOB_INNER_CORE)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
-
- // updates both, accel and veloc
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
- mp->d_accel_inner_core,
- mp->NGLOB_INNER_CORE,
- deltatover2,
- mp->two_omega_earth,
- mp->d_rmassx_inner_core,
- mp->d_rmassy_inner_core,
- mp->d_rmassz_inner_core);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
- mp->d_b_accel_inner_core,
- mp->NGLOB_INNER_CORE,
- deltatover2,
- mp->b_two_omega_earth,
- mp->d_rmassx_inner_core,
- mp->d_rmassy_inner_core,
- mp->d_rmassz_inner_core);
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
- exit_on_cuda_error("after update_veloc_3_b");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// KERNEL 3
-//
-// for outer_core
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-__global__ void kernel_3_outer_core_cuda_device(realw* veloc,
- realw* accel,int size,
- realw deltatover2,
- realw* rmass) {
- int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
- if(id < size) {
- // multiplies pressure with the inverse of the mass matrix
- accel[id] = accel[id]*rmass[id];
-
- // Newmark time scheme: corrector term
- veloc[id] = veloc[id] + deltatover2*accel[id];
- }
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(kernel_3_outer_core_cuda,
- KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {
-
- TRACE("kernel_3_outer_core_cuda");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-
- realw deltatover2 = *deltatover2_F;
-
- int blocksize = BLOCKSIZE_KERNEL3;
-
- int size_padded = ((int)ceil(((double)mp->NGLOB_OUTER_CORE)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
-
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
- mp->d_accel_outer_core,
- mp->NGLOB_OUTER_CORE,
- deltatover2,mp->d_rmass_outer_core);
- }else if( *FORWARD_OR_ADJOINT == 3){
- kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
- mp->d_b_accel_outer_core,
- mp->NGLOB_OUTER_CORE,
- deltatover2,mp->d_rmass_outer_core);
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
- exit_on_cuda_error("after kernel_3_outer_core");
-#endif
-}
-
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2013-09-09 15:42:03 UTC (rev 22780)
@@ -558,7 +558,7 @@
int attenuation;
int undo_attenuation;
- int use_attenuation_mimic;
+ int partial_phys_dispersion_only;
int use_3d_attenuation_arrays;
int compute_and_store_strain;
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2013-09-09 15:42:03 UTC (rev 22780)
@@ -30,9 +30,9 @@
#include <cuda.h>
#include <cublas.h>
-#ifdef WITH_MPI
-#include <mpi.h>
-#endif
+//#ifdef WITH_MPI
+//#include <mpi.h>
+//#endif
#include <sys/types.h>
#include <unistd.h>
@@ -42,6 +42,7 @@
/* ----------------------------------------------------------------------------------------------- */
+//unused routines...
/*
extern "C"
void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2013-09-09 15:42:03 UTC (rev 22780)
@@ -30,10 +30,6 @@
#include <cuda.h>
#include <cublas.h>
-#ifdef WITH_MPI
-#include <mpi.h>
-#endif
-
#include <sys/time.h>
#include <sys/resource.h>
@@ -114,39 +110,31 @@
PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
int* myrank_f,
int* h_NGLLX,
- realw* h_hprime_xx,
- realw* h_hprimewgll_xx,
+ realw* h_hprime_xx,realw* h_hprimewgll_xx,
realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
int* NSOURCES,int* nsources_local,
realw* h_sourcearrays,
int* h_islice_selected_source,int* h_ispec_selected_source,
+ int* nrec,int* nrec_local, int* nadj_rec_local,
int* h_number_receiver_global,
int* h_islice_selected_rec,int* h_ispec_selected_rec,
- int* nrec,int* nrec_local, int* nadj_rec_local,
int* NSPEC_CRUST_MANTLE, int* NGLOB_CRUST_MANTLE,
int* NSPEC_CRUST_MANTLE_STRAIN_ONLY,
int* NSPEC_OUTER_CORE, int* NGLOB_OUTER_CORE,
int* NSPEC_INNER_CORE, int* NGLOB_INNER_CORE,
int* NSPEC_INNER_CORE_STRAIN_ONLY,
- int* SIMULATION_TYPE,
- int* NOISE_TOMOGRAPHY,
- int* SAVE_FORWARD_f,
- int* ABSORBING_CONDITIONS_f,
- int* OCEANS_f,
- int* GRAVITY_f,
+ int* SIMULATION_TYPE,int* NOISE_TOMOGRAPHY,
+ int* SAVE_FORWARD_f,int* ABSORBING_CONDITIONS_f,
+ int* OCEANS_f,int* GRAVITY_f,
int* ROTATION_f,int* EXACT_MASS_MATRIX_FOR_ROTATION_f,
int* ATTENUATION_f,int* UNDO_ATTENUATION_f,
- int* USE_ATTENUATION_MIMIC_f,
- int* USE_3D_ATTENUATION_ARRAYS_f,
+ int* PARTIAL_PHYS_DISPERSION_ONLY_f,int* USE_3D_ATTENUATION_ARRAYS_f,
int* COMPUTE_AND_STORE_STRAIN_f,
- int* ANISOTROPIC_3D_MANTLE_f,
- int* ANISOTROPIC_INNER_CORE_f,
+ int* ANISOTROPIC_3D_MANTLE_f,int* ANISOTROPIC_INNER_CORE_f,
int* SAVE_BOUNDARY_MESH_f,
int* USE_MESH_COLORING_GPU_f,
- int* ANISOTROPIC_KL_f,
- int* APPROXIMATE_HESS_KL_f,
- realw* deltat_f,
- realw* b_deltat_f) {
+ int* ANISOTROPIC_KL_f,int* APPROXIMATE_HESS_KL_f,
+ realw* deltat_f,realw* b_deltat_f) {
TRACE("prepare_constants_device");
@@ -240,7 +228,7 @@
mp->attenuation = *ATTENUATION_f;
mp->undo_attenuation = *UNDO_ATTENUATION_f;
- mp->use_attenuation_mimic = *USE_ATTENUATION_MIMIC_f;
+ mp->partial_phys_dispersion_only = *PARTIAL_PHYS_DISPERSION_ONLY_f;
mp->use_3d_attenuation_arrays = *USE_3D_ATTENUATION_ARRAYS_f;
mp->compute_and_store_strain = *COMPUTE_AND_STORE_STRAIN_f;
@@ -503,7 +491,7 @@
copy_todevice_realw((void**)&mp->d_one_minus_sum_beta_crust_mantle,one_minus_sum_beta_crust_mantle,R_size2);
- if( ! mp->use_attenuation_mimic ){
+ if( ! mp->partial_phys_dispersion_only ){
// common factor
copy_todevice_realw((void**)&mp->d_factor_common_crust_mantle,factor_common_crust_mantle,R_size3);
// memory variables
@@ -515,7 +503,7 @@
}
if(mp->simulation_type == 3 ){
- if( ! mp->use_attenuation_mimic ){
+ if( ! mp->partial_phys_dispersion_only ){
// memory variables
copy_todevice_realw((void**)&mp->d_b_R_xx_crust_mantle,b_R_xx_crust_mantle,R_size1);
copy_todevice_realw((void**)&mp->d_b_R_yy_crust_mantle,b_R_yy_crust_mantle,R_size1);
@@ -537,7 +525,7 @@
copy_todevice_realw((void**)&mp->d_one_minus_sum_beta_inner_core,one_minus_sum_beta_inner_core,R_size2);
- if( ! mp->use_attenuation_mimic ){
+ if( ! mp->partial_phys_dispersion_only ){
// common factor
copy_todevice_realw((void**)&mp->d_factor_common_inner_core,factor_common_inner_core,R_size3);
// memory variables
@@ -549,7 +537,7 @@
}
if(mp->simulation_type == 3 ){
- if( ! mp->use_attenuation_mimic ){
+ if( ! mp->partial_phys_dispersion_only ){
// memory variables
copy_todevice_realw((void**)&mp->d_b_R_xx_inner_core,b_R_xx_inner_core,R_size1);
copy_todevice_realw((void**)&mp->d_b_R_yy_inner_core,b_R_yy_inner_core,R_size1);
@@ -1931,7 +1919,7 @@
if( mp->attenuation ){
cudaFree(mp->d_one_minus_sum_beta_crust_mantle);
cudaFree(mp->d_one_minus_sum_beta_inner_core);
- if( ! mp->use_attenuation_mimic ){
+ if( ! mp->partial_phys_dispersion_only ){
cudaFree(mp->d_factor_common_crust_mantle);
cudaFree(mp->d_R_xx_crust_mantle);
cudaFree(mp->d_R_yy_crust_mantle);
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/rules.mk
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/rules.mk 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/rules.mk 2013-09-09 15:42:03 UTC (rev 22780)
@@ -44,10 +44,10 @@
$O/compute_stacey_acoustic_cuda.cuda.o \
$O/compute_stacey_elastic_cuda.cuda.o \
$O/initialize_cuda.cuda.o \
- $O/it_update_displacement_cuda.cuda.o \
$O/noise_tomography_cuda.cuda.o \
$O/prepare_mesh_constants_cuda.cuda.o \
$O/transfer_fields_cuda.cuda.o \
+ $O/update_displacement_cuda.cuda.o \
$O/write_seismograms_cuda.cuda.o \
$O/save_and_compare_cpu_vs_gpu.cudacc.o \
$(EMPTY_MACRO)
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c 2013-09-09 15:42:03 UTC (rev 22780)
@@ -49,6 +49,11 @@
#endif
sprintf(filename,"maxval_surface_proc_%03d.dat",rank);
fp = fopen(filename,"a+");
+ if( fp == NULL ){
+ printf("FILE ERROR:%s\n",(char*) strerror(errno));
+ perror("file error\n");
+ exit(1);
+ }
fprintf(fp,"%e\n",*maxval);
fclose(fp);
}
@@ -63,6 +68,11 @@
sprintf(filename, "debug_output_gpu_%d.dat",*id);
}
fp = fopen(filename, "wb");
+ if( fp == NULL ){
+ printf("FILE ERROR:%s\n",(char*) strerror(errno));
+ perror("file error\n");
+ exit(1);
+ }
printf("writing vector, vector[0]=%e\n",vector[0]);
fwrite(vector, sizeof(float), *size, fp);
fclose(fp);
@@ -79,6 +89,11 @@
sprintf(filename, "debug_output_gpu_%d.dat",*id);
}
fp = fopen(filename, "wb");
+ if( fp == NULL ){
+ printf("FILE ERROR:%s\n",(char*) strerror(errno));
+ perror("file error\n");
+ exit(1);
+ }
fwrite(vector, sizeof(int), *size, fp);
fclose(fp);
@@ -89,6 +104,8 @@
int nodes_per_iteration = *nodes_per_iterationf;
char filename[BUFSIZ];
int procid;
+ size_t res;
+
#ifdef WITH_MPI
MPI_Comm_rank(MPI_COMM_WORLD,&procid);
#else
@@ -99,9 +116,7 @@
FILE* fp; int it;
printf("Opening %s for analysis\n",filename);
fp = fopen(filename,"rb");
- //char* errorstr;
- if(fp == 0) {
- //errorstr = (char*) strerror(errno);
+ if(fp == NULL) {
printf("FILE ERROR:%s\n",(char*) strerror(errno));
perror("file error\n");
exit(1);
@@ -114,7 +129,8 @@
for(it=0;it<*NSTEP;it++) {
int pos = (sizeof(float)*nodes_per_iteration)*(it);
fseek(fp,pos,SEEK_SET);
- fread(vector,sizeof(float),nodes_per_iteration,fp);
+ res = fread(vector,sizeof(float),nodes_per_iteration,fp);
+ if (res != nodes_per_iteration) {printf ("Reading error:%s\n",(char*) strerror(errno)); exit (1);}
for(i=0;i<nodes_per_iteration;i++) {
max_val = MAX(max_val,vector[i]);
}
@@ -176,19 +192,20 @@
char* cpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_proc000001_surface_movie";
char* gpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_v2_proc000001_surface_movie";
+ size_t res;
FILE* fp_cpu;
fp_cpu = fopen(cpu_file,"rb");
//char* errorstr;
- if(fp_cpu == 0) {
+ if(fp_cpu == NULL) {
//errorstr = (char*) strerror(errno);
//printf("CPU FILE ERROR:%s\n",errorstr);
printf("CPU FILE ERROR:%s\n",(char*) strerror(errno));
perror("cpu file error\n");
}
+
FILE* fp_gpu;
fp_gpu = fopen(gpu_file,"rb");
-
if(fp_gpu == NULL) {
//errorstr = (char*) strerror(errno);
//printf("GPU FILE ERROR:%s\n",errorstr);
@@ -208,8 +225,12 @@
fseek(fp_gpu,pos,SEEK_SET);
int number_of_nodes = *bytes_per_iteration/sizeof(float);
- fread(cpu_vector,sizeof(float),number_of_nodes,fp_cpu);
- fread(gpu_vector,sizeof(float),number_of_nodes,fp_gpu);
+ res = fread(cpu_vector,sizeof(float),number_of_nodes,fp_cpu);
+ if (res != number_of_nodes) {printf ("Reading error:%s\n",(char*) strerror(errno)); exit (1);}
+
+ res = fread(gpu_vector,sizeof(float),number_of_nodes,fp_gpu);
+ if (res != number_of_nodes) {printf ("Reading error:%s\n",(char*) strerror(errno)); exit (1);}
+
int size = number_of_nodes;
float gpu_min_val=10;
float gpu_max_val=0;
@@ -240,6 +261,8 @@
void compare_fvector_(float* vector, int* size, int* id, int* cpu_or_gpu) {
FILE* fp;
char cmp_filename[BUFSIZ];
+ size_t res;
+
float* compare_vector = (float*)malloc(*size*sizeof(float));
if(*cpu_or_gpu == 0) { //swap gpu/cpu for compare
sprintf(cmp_filename, "debug_output_gpu_%d.dat",*id);
@@ -247,13 +270,15 @@
else {
sprintf(cmp_filename, "debug_output_cpu_%d.dat",*id);
}
- fopen(cmp_filename, "rb");
+
+ fp = fopen(cmp_filename, "rb");
/* read the values */
- if((fp=fopen(cmp_filename, "rb"))==NULL) {
+ if( fp == NULL ) {
printf("Cannot open comparison file %s.\n",cmp_filename);
exit(1);
}
- if(fread(compare_vector, sizeof(float), *size, fp) != *size) {
+ res = fread(compare_vector, sizeof(float), *size, fp);
+ if(res != *size) {
if(feof(fp))
printf("Premature end of file.");
else
@@ -284,6 +309,8 @@
void compare_ivector_(int* vector, int* size, int* id, int* cpu_or_gpu) {
FILE* fp;
char cmp_filename[BUFSIZ];
+ size_t res;
+
int* compare_vector = (int*)malloc(*size*sizeof(int));
if(*cpu_or_gpu == 0) { //swap gpu/cpu for compare
sprintf(cmp_filename, "debug_output_gpu_%d.dat",*id);
@@ -291,13 +318,14 @@
else {
sprintf(cmp_filename, "debug_output_cpu_%d.dat",*id);
}
- fopen(cmp_filename, "rb");
+ fp = fopen(cmp_filename, "rb");
/* read the values */
- if((fp=fopen(cmp_filename, "rb"))==NULL) {
+ if( fp == NULL ) {
printf("Cannot open comparison file %s.\n",cmp_filename);
exit(1);
}
- if(fread(compare_vector, sizeof(int), *size, fp) != *size) {
+ res = fread(compare_vector, sizeof(int), *size, fp);
+ if(res != *size) {
if(feof(fp))
printf("Premature end of file.");
else
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2013-09-09 15:42:03 UTC (rev 22780)
@@ -271,48 +271,6 @@
//
-// src/cuda/it_update_displacement_cuda.cu
-//
-
-void FC_FUNC_(it_update_displacement_ic_cuda,
- IT_UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {}
-
-void FC_FUNC_(it_update_displacement_cm_cuda,
- IT_UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {}
-
-void FC_FUNC_(it_update_displacement_oc_cuda,
- IT_UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {}
-
-void FC_FUNC_(update_accel_3_a_cuda,
- UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* NCHUNKS_VAL,
- int* FORWARD_OR_ADJOINT) {}
-
-void FC_FUNC_(update_veloc_3_b_cuda,
- UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {}
-
-void FC_FUNC_(kernel_3_outer_core_cuda,
- KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {}
-
-
-//
// src/cuda/noise_tomography_cuda.cu
//
@@ -349,39 +307,31 @@
PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
int* myrank_f,
int* h_NGLLX,
- realw* h_hprime_xx,
- realw* h_hprimewgll_xx,
+ realw* h_hprime_xx,realw* h_hprimewgll_xx,
realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
int* NSOURCES,int* nsources_local,
realw* h_sourcearrays,
int* h_islice_selected_source,int* h_ispec_selected_source,
+ int* nrec,int* nrec_local, int* nadj_rec_local,
int* h_number_receiver_global,
int* h_islice_selected_rec,int* h_ispec_selected_rec,
- int* nrec,int* nrec_local, int* nadj_rec_local,
int* NSPEC_CRUST_MANTLE, int* NGLOB_CRUST_MANTLE,
int* NSPEC_CRUST_MANTLE_STRAIN_ONLY,
int* NSPEC_OUTER_CORE, int* NGLOB_OUTER_CORE,
int* NSPEC_INNER_CORE, int* NGLOB_INNER_CORE,
int* NSPEC_INNER_CORE_STRAIN_ONLY,
- int* SIMULATION_TYPE,
- int* NOISE_TOMOGRAPHY,
- int* SAVE_FORWARD_f,
- int* ABSORBING_CONDITIONS_f,
- int* OCEANS_f,
- int* GRAVITY_f,
+ int* SIMULATION_TYPE,int* NOISE_TOMOGRAPHY,
+ int* SAVE_FORWARD_f,int* ABSORBING_CONDITIONS_f,
+ int* OCEANS_f,int* GRAVITY_f,
int* ROTATION_f,int* EXACT_MASS_MATRIX_FOR_ROTATION_f,
int* ATTENUATION_f,int* UNDO_ATTENUATION_f,
- int* USE_ATTENUATION_MIMIC_f,
- int* USE_3D_ATTENUATION_ARRAYS_f,
+ int* PARTIAL_PHYS_DISPERSION_ONLY_f,int* USE_3D_ATTENUATION_ARRAYS_f,
int* COMPUTE_AND_STORE_STRAIN_f,
- int* ANISOTROPIC_3D_MANTLE_f,
- int* ANISOTROPIC_INNER_CORE_f,
+ int* ANISOTROPIC_3D_MANTLE_f,int* ANISOTROPIC_INNER_CORE_f,
int* SAVE_BOUNDARY_MESH_f,
int* USE_MESH_COLORING_GPU_f,
- int* ANISOTROPIC_KL_f,
- int* APPROXIMATE_HESS_KL_f,
- realw* deltat_f,
- realw* b_deltat_f) {}
+ int* ANISOTROPIC_KL_f,int* APPROXIMATE_HESS_KL_f,
+ realw* deltat_f,realw* b_deltat_f) {}
void FC_FUNC_(prepare_fields_rotation_device,
PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
@@ -792,6 +742,48 @@
//
+// src/cuda/update_displacement_cuda.cu
+//
+
+void FC_FUNC_(update_displacement_ic_cuda,
+ UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
+ realw* deltatover2_F,
+ int* FORWARD_OR_ADJOINT) {}
+
+void FC_FUNC_(update_displacement_cm_cuda,
+ UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
+ realw* deltatover2_F,
+ int* FORWARD_OR_ADJOINT) {}
+
+void FC_FUNC_(update_displacement_oc_cuda,
+ UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
+ realw* deltatover2_F,
+ int* FORWARD_OR_ADJOINT) {}
+
+void FC_FUNC_(update_accel_3_a_cuda,
+ UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_F,
+ int* NCHUNKS_VAL,
+ int* FORWARD_OR_ADJOINT) {}
+
+void FC_FUNC_(update_veloc_3_b_cuda,
+ UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_F,
+ int* FORWARD_OR_ADJOINT) {}
+
+void FC_FUNC_(kernel_3_outer_core_cuda,
+ KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_F,
+ int* FORWARD_OR_ADJOINT) {}
+
+
+//
// src/cuda/write_seismograms_cuda.cu
//
Copied: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu (from rev 22778, seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu)
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu 2013-09-09 15:42:03 UTC (rev 22780)
@@ -0,0 +1,611 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! August 2013
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// elastic wavefield
+
+// KERNEL 1
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void UpdateDispVeloc_kernel(realw* displ,
+ realw* veloc,
+ realw* accel,
+ int size,
+ realw deltat,
+ realw deltatsqover2,
+ realw deltatover2) {
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+ /* because of block and grid sizing problems, there is a small */
+ /* amount of buffer at the end of the calculation */
+ if(id < size) {
+ displ[id] = displ[id] + deltat*veloc[id] + deltatsqover2*accel[id];
+ veloc[id] = veloc[id] + deltatover2*accel[id];
+ accel[id] = 0.0f; // can do this using memset...not sure if faster
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 1
+// inner core
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(update_displacement_ic_cuda,
+ UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
+ realw* deltatover2_F,
+ int* FORWARD_OR_ADJOINT) {
+
+TRACE("update_displacement_ic_cuda");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+ int size = NDIM * mp->NGLOB_INNER_CORE;
+
+ realw deltat = *deltat_F;
+ realw deltatsqover2 = *deltatsqover2_F;
+ realw deltatover2 = *deltatover2_F;
+
+ int blocksize = BLOCKSIZE_KERNEL1;
+ int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > MAXIMUM_GRID_DIM) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ //launch kernel
+ UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+ mp->d_veloc_inner_core,
+ mp->d_accel_inner_core,
+ size,deltat,deltatsqover2,deltatover2);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ // kernel for backward fields
+ UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
+ mp->d_b_veloc_inner_core,
+ mp->d_b_accel_inner_core,
+ size,deltat,deltatsqover2,deltatover2);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("update_displacement_ic_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 1
+// crust/mantle
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(update_displacement_cm_cuda,
+ UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
+ realw* deltatover2_F,
+ int* FORWARD_OR_ADJOINT) {
+
+ TRACE("update_displacement_cm_cuda");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+ int size = NDIM * mp->NGLOB_CRUST_MANTLE;
+
+ realw deltat = *deltat_F;
+ realw deltatsqover2 = *deltatsqover2_F;
+ realw deltatover2 = *deltatover2_F;
+
+ int blocksize = BLOCKSIZE_KERNEL1;
+ int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > MAXIMUM_GRID_DIM) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ //launch kernel
+ UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+ mp->d_veloc_crust_mantle,
+ mp->d_accel_crust_mantle,
+ size,deltat,deltatsqover2,deltatover2);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ // kernel for backward fields
+ UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+ mp->d_b_veloc_crust_mantle,
+ mp->d_b_accel_crust_mantle,
+ size,deltat,deltatsqover2,deltatover2);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("update_displacement_cm_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic wavefield
+
+// KERNEL 1
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void UpdatePotential_kernel(realw* potential_acoustic,
+ realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
+ int size,
+ realw deltat,
+ realw deltatsqover2,
+ realw deltatover2) {
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+ /* because of block and grid sizing problems, there is a small */
+ /* amount of buffer at the end of the calculation */
+ if(id < size) {
+ potential_acoustic[id] = potential_acoustic[id]
+ + deltat*potential_dot_acoustic[id]
+ + deltatsqover2*potential_dot_dot_acoustic[id];
+
+ potential_dot_acoustic[id] = potential_dot_acoustic[id]
+ + deltatover2*potential_dot_dot_acoustic[id];
+
+ potential_dot_dot_acoustic[id] = 0.0f;
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 1
+// outer core
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(update_displacement_oc_cuda,
+ UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
+ realw* deltatover2_F,
+ int* FORWARD_OR_ADJOINT) {
+
+ TRACE("update_displacement_oc_cuda");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+ int size = mp->NGLOB_OUTER_CORE;
+
+ realw deltat = *deltat_F;
+ realw deltatsqover2 = *deltatsqover2_F;
+ realw deltatover2 = *deltatover2_F;
+
+ int blocksize = BLOCKSIZE_KERNEL1;
+ int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > MAXIMUM_GRID_DIM) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ //launch kernel
+ UpdatePotential_kernel<<<grid,threads>>>(mp->d_displ_outer_core,
+ mp->d_veloc_outer_core,
+ mp->d_accel_outer_core,
+ size,deltat,deltatsqover2,deltatover2);
+ }else if( *FORWARD_OR_ADJOINT == 1 ){
+ UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,
+ mp->d_b_veloc_outer_core,
+ mp->d_b_accel_outer_core,
+ size,deltat,deltatsqover2,deltatover2);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("update_displacement_oc_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 3
+//
+// crust/mantle and inner core regions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_cuda_device(realw* veloc,
+ realw* accel,
+ int size,
+ realw deltatover2,
+ realw two_omega_earth,
+ realw* rmassx,
+ realw* rmassy,
+ realw* rmassz) {
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+ /* because of block and grid sizing problems, there is a small */
+ /* amount of buffer at the end of the calculation */
+ if(id < size) {
+ // note: update adds rotational acceleration in case two_omega_earth is non-zero
+ accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
+ accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
+ accel[3*id+2] = accel[3*id+2]*rmassz[id];
+
+ veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+ veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+ veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_accel_cuda_device(realw* accel,
+ realw* veloc,
+ int size,
+ realw two_omega_earth,
+ realw* rmassx,
+ realw* rmassy,
+ realw* rmassz) {
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+ /* because of block and grid sizing problems, there is a small */
+ /* amount of buffer at the end of the calculation */
+ if(id < size) {
+ // note: update adds rotational acceleration in case two_omega_earth is non-zero
+ accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
+ accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
+ accel[3*id+2] = accel[3*id+2]*rmassz[id];
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_veloc_cuda_device(realw* veloc,
+ realw* accel,
+ int size,
+ realw deltatover2) {
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+ /* because of block and grid sizing problems, there is a small */
+ /* amount of buffer at the end of the calculation */
+ if(id < size) {
+ veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+ veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+ veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(update_accel_3_a_cuda,
+ UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_F,
+ int* NCHUNKS_VAL,
+ int* FORWARD_OR_ADJOINT) {
+ TRACE("update_accel_3_a_cuda");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+ realw deltatover2 = *deltatover2_F;
+
+ int blocksize = BLOCKSIZE_KERNEL3;
+ int size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
+
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > MAXIMUM_GRID_DIM) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ // crust/mantle region only
+ // check whether we can update accel and veloc, or only accel at this point
+ if( mp->oceans == 0 ){
+ // updates both, accel and veloc
+
+ if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
+ // uses corrected mass matrices rmassx,rmassy,rmassz
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+ mp->d_accel_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ deltatover2,
+ mp->two_omega_earth,
+ mp->d_rmassx_crust_mantle,
+ mp->d_rmassy_crust_mantle,
+ mp->d_rmassz_crust_mantle);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
+ mp->d_b_accel_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ deltatover2,
+ mp->b_two_omega_earth,
+ mp->d_rmassx_crust_mantle,
+ mp->d_rmassy_crust_mantle,
+ mp->d_rmassz_crust_mantle);
+ }
+ }else{
+ // uses only rmassz
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+ mp->d_accel_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ deltatover2,
+ mp->two_omega_earth,
+ mp->d_rmassz_crust_mantle,
+ mp->d_rmassz_crust_mantle,
+ mp->d_rmassz_crust_mantle);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
+ mp->d_b_accel_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ deltatover2,
+ mp->b_two_omega_earth,
+ mp->d_rmassz_crust_mantle,
+ mp->d_rmassz_crust_mantle,
+ mp->d_rmassz_crust_mantle);
+ }
+ }
+
+ }else{
+ // updates only accel
+
+ if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
+ // uses corrected mass matrices
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+ mp->d_veloc_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ mp->two_omega_earth,
+ mp->d_rmassx_crust_mantle,
+ mp->d_rmassy_crust_mantle,
+ mp->d_rmassz_crust_mantle);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
+ mp->d_b_veloc_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ mp->b_two_omega_earth,
+ mp->d_rmassx_crust_mantle,
+ mp->d_rmassy_crust_mantle,
+ mp->d_rmassz_crust_mantle);
+ }
+ }else{
+ // uses only rmassz
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+ mp->d_veloc_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ mp->two_omega_earth,
+ mp->d_rmassz_crust_mantle,
+ mp->d_rmassz_crust_mantle,
+ mp->d_rmassz_crust_mantle);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
+ mp->d_b_veloc_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ mp->b_two_omega_earth,
+ mp->d_rmassz_crust_mantle,
+ mp->d_rmassz_crust_mantle,
+ mp->d_rmassz_crust_mantle);
+ }
+ }
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+ exit_on_cuda_error("after update_accel_3_a");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(update_veloc_3_b_cuda,
+ UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_F,
+ int* FORWARD_OR_ADJOINT) {
+ TRACE("update_veloc_3_b_cuda");
+ int size_padded,num_blocks_x,num_blocks_y;
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+ realw deltatover2 = *deltatover2_F;
+
+ int blocksize = BLOCKSIZE_KERNEL3;
+
+ // crust/mantle region
+ // in case of ocean loads, we still have to update the velocity for crust/mantle region
+ if( mp->oceans ){
+ size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
+ num_blocks_x = size_padded/blocksize;
+ num_blocks_y = 1;
+ while(num_blocks_x > MAXIMUM_GRID_DIM) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+ dim3 grid1(num_blocks_x,num_blocks_y);
+ dim3 threads1(blocksize,1,1);
+
+ // updates only veloc at this point
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_veloc_crust_mantle,
+ mp->d_accel_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ deltatover2);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_b_veloc_crust_mantle,
+ mp->d_b_accel_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ deltatover2);
+ }
+ }
+
+ // inner core region
+ size_padded = ((int)ceil(((double)mp->NGLOB_INNER_CORE)/((double)blocksize)))*blocksize;
+ num_blocks_x = size_padded/blocksize;
+ num_blocks_y = 1;
+ while(num_blocks_x > MAXIMUM_GRID_DIM) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ // updates both, accel and veloc
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
+ mp->d_accel_inner_core,
+ mp->NGLOB_INNER_CORE,
+ deltatover2,
+ mp->two_omega_earth,
+ mp->d_rmassx_inner_core,
+ mp->d_rmassy_inner_core,
+ mp->d_rmassz_inner_core);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
+ mp->d_b_accel_inner_core,
+ mp->NGLOB_INNER_CORE,
+ deltatover2,
+ mp->b_two_omega_earth,
+ mp->d_rmassx_inner_core,
+ mp->d_rmassy_inner_core,
+ mp->d_rmassz_inner_core);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+ exit_on_cuda_error("after update_veloc_3_b");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 3
+//
+// for outer_core
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void kernel_3_outer_core_cuda_device(realw* veloc,
+ realw* accel,int size,
+ realw deltatover2,
+ realw* rmass) {
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+ /* because of block and grid sizing problems, there is a small */
+ /* amount of buffer at the end of the calculation */
+ if(id < size) {
+ // multiplies pressure with the inverse of the mass matrix
+ accel[id] = accel[id]*rmass[id];
+
+ // Newmark time scheme: corrector term
+ veloc[id] = veloc[id] + deltatover2*accel[id];
+ }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(kernel_3_outer_core_cuda,
+ KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_F,
+ int* FORWARD_OR_ADJOINT) {
+
+ TRACE("kernel_3_outer_core_cuda");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+ realw deltatover2 = *deltatover2_F;
+
+ int blocksize = BLOCKSIZE_KERNEL3;
+
+ int size_padded = ((int)ceil(((double)mp->NGLOB_OUTER_CORE)/((double)blocksize)))*blocksize;
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > MAXIMUM_GRID_DIM) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
+ mp->d_accel_outer_core,
+ mp->NGLOB_OUTER_CORE,
+ deltatover2,mp->d_rmass_outer_core);
+ }else if( *FORWARD_OR_ADJOINT == 3){
+ kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
+ mp->d_b_accel_outer_core,
+ mp->NGLOB_OUTER_CORE,
+ deltatover2,mp->d_rmass_outer_core);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+ exit_on_cuda_error("after kernel_3_outer_core");
+#endif
+}
+
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90 2013-09-09 15:42:03 UTC (rev 22780)
@@ -34,7 +34,7 @@
use specfem_par,only: &
NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
- nrec,nrec_local, &
+ nrec_local, &
nu,hxir_store,hetar_store,hgammar_store, &
ispec_selected_rec,number_receiver_global, &
scale_displ
@@ -117,7 +117,7 @@
use specfem_par,only: &
NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS,UNDO_ATTENUATION, &
- NSOURCES,nrec_local, &
+ nrec_local, &
nu_source,Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
hxir_store,hpxir_store,hetar_store,hpetar_store,hgammar_store,hpgammar_store, &
tshift_cmt,hdur_gaussian, &
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2013-09-09 15:42:03 UTC (rev 22780)
@@ -1829,7 +1829,7 @@
! user output
if(myrank == 0 ) then
- write(IMAIN,*) "preparing Fields and Constants on GPU Device"
+ write(IMAIN,*) "preparing fields and constants on GPU devices:"
call flush_IMAIN()
endif
@@ -1841,33 +1841,31 @@
! prepares general fields on GPU
call prepare_constants_device(Mesh_pointer,myrank,NGLLX, &
- hprime_xx, &
- hprimewgll_xx, &
- wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
- NSOURCES, nsources_local, &
- sourcearrays, &
- islice_selected_source,ispec_selected_source, &
- number_receiver_global, &
- islice_selected_rec,ispec_selected_rec, &
- nrec, nrec_local, nadj_rec_local, &
- NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
- NSPEC_CRUST_MANTLE_STRAIN_ONLY, &
- NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
- NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
- NSPEC_INNER_CORE_STRAIN_ONLY, &
- SIMULATION_TYPE,NOISE_TOMOGRAPHY, &
- SAVE_FORWARD,ABSORBING_CONDITIONS, &
- OCEANS_VAL, &
- GRAVITY_VAL, &
- ROTATION_VAL,EXACT_MASS_MATRIX_FOR_ROTATION, &
- ATTENUATION_VAL,UNDO_ATTENUATION, &
- PARTIAL_PHYS_DISPERSION_ONLY,USE_3D_ATTENUATION_ARRAYS, &
- COMPUTE_AND_STORE_STRAIN, &
- ANISOTROPIC_3D_MANTLE_VAL,ANISOTROPIC_INNER_CORE_VAL, &
- SAVE_BOUNDARY_MESH, &
- USE_MESH_COLORING_GPU, &
- ANISOTROPIC_KL,APPROXIMATE_HESS_KL, &
- deltat,b_deltat)
+ hprime_xx,hprimewgll_xx, &
+ wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+ NSOURCES, nsources_local, &
+ sourcearrays, &
+ islice_selected_source,ispec_selected_source, &
+ nrec, nrec_local, nadj_rec_local, &
+ number_receiver_global, &
+ islice_selected_rec,ispec_selected_rec, &
+ NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
+ NSPEC_CRUST_MANTLE_STRAIN_ONLY, &
+ NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
+ NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
+ NSPEC_INNER_CORE_STRAIN_ONLY, &
+ SIMULATION_TYPE,NOISE_TOMOGRAPHY, &
+ SAVE_FORWARD,ABSORBING_CONDITIONS, &
+ OCEANS_VAL,GRAVITY_VAL, &
+ ROTATION_VAL,EXACT_MASS_MATRIX_FOR_ROTATION, &
+ ATTENUATION_VAL,UNDO_ATTENUATION, &
+ PARTIAL_PHYS_DISPERSION_ONLY,USE_3D_ATTENUATION_ARRAYS, &
+ COMPUTE_AND_STORE_STRAIN, &
+ ANISOTROPIC_3D_MANTLE_VAL,ANISOTROPIC_INNER_CORE_VAL, &
+ SAVE_BOUNDARY_MESH, &
+ USE_MESH_COLORING_GPU, &
+ ANISOTROPIC_KL,APPROXIMATE_HESS_KL, &
+ deltat,b_deltat)
! prepares rotation arrays
if( ROTATION_VAL ) then
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk 2013-09-09 15:42:03 UTC (rev 22780)
@@ -155,10 +155,10 @@
$O/compute_stacey_acoustic_cuda.cuda.o \
$O/compute_stacey_elastic_cuda.cuda.o \
$O/initialize_cuda.cuda.o \
- $O/it_update_displacement_cuda.cuda.o \
$O/noise_tomography_cuda.cuda.o \
$O/prepare_mesh_constants_cuda.cuda.o \
$O/transfer_fields_cuda.cuda.o \
+ $O/update_displacement_cuda.cuda.o \
$O/write_seismograms_cuda.cuda.o \
$O/save_and_compare_cpu_vs_gpu.cudacc.o \
$(EMPTY_MACRO)
@@ -243,6 +243,8 @@
@echo ""
${NVCCLINK} -o $(cuda_DEVICE_OBJ) $(cuda_OBJECTS)
${FCLINK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(cuda_DEVICE_OBJ) $(MPILIBS) $(CUDA_LINK)
+ @echo ""
+
else
## cuda 4 version
@@ -251,6 +253,8 @@
@echo "building xspecfem3D with CUDA 4 support"
@echo ""
${FCLINK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(MPILIBS) $(CUDA_LINK)
+ @echo ""
+
endif
else
@@ -264,6 +268,7 @@
## DK DK add OpenMP compiler flag here if needed
# ${MPIFCCOMPILE_CHECK} -qsmp=omp -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(MPILIBS)
${MPIFCCOMPILE_CHECK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(MPILIBS)
+ @echo ""
endif
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90 2013-09-09 15:42:03 UTC (rev 22780)
@@ -87,11 +87,11 @@
! on GPU
! Includes FORWARD_OR_ADJOINT == 1
! outer core region
- call it_update_displacement_oc_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
+ call update_displacement_oc_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
! inner core region
- call it_update_displacement_ic_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
+ call update_displacement_ic_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
! crust/mantle region
- call it_update_displacement_cm_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
+ call update_displacement_cm_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
endif
end subroutine update_displacement_Newmark
@@ -130,11 +130,11 @@
! on GPU
! Includes FORWARD_OR_ADJOINT == 3
! outer core region
- call it_update_displacement_oc_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
+ call update_displacement_oc_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
! inner core region
- call it_update_displacement_ic_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
+ call update_displacement_ic_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
! crust/mantle region
- call it_update_displacement_cm_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
+ call update_displacement_cm_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
endif
end subroutine update_displacement_Newmark_backward
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_output_SAC.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_output_SAC.f90 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_output_SAC.f90 2013-09-09 15:42:03 UTC (rev 22780)
@@ -34,7 +34,6 @@
use specfem_par,only: &
ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI, &
- myrank,nrec, &
station_name,network_name,stlat,stlon,stele,stbur, &
DT,t0, &
seismo_offset,seismo_current,it_end, &
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90 2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90 2013-09-09 15:42:03 UTC (rev 22780)
@@ -334,15 +334,12 @@
use constants_solver
use specfem_par,only: &
- ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI, &
- myrank,nrec, &
- station_name,network_name,stlat,stlon,stele,stbur, &
- DT,t0, &
- seismo_offset,seismo_current,it_end, &
+ myrank, &
+ station_name,network_name,stlat,stlon, &
+ DT, &
+ seismo_current, &
OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM, &
- OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
- SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE, &
- MODEL,OUTPUT_FILES
+ OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS
use specfem_par,only: &
cmt_lat=>cmt_lat_SAC,cmt_lon=>cmt_lon_SAC
Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/utils/check_cuda_device.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/utils/check_cuda_device.cu (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/utils/check_cuda_device.cu 2013-09-09 15:42:03 UTC (rev 22780)
@@ -0,0 +1,288 @@
+/*
+**************************
+
+check_cuda_device utility
+
+**************************
+
+this utility program will output GPU device informations helpful for debugging CUDA.
+
+
+for compilation, see the command-line examples given here:
+
+- example without MPI support:
+
+nvcc -o check_cuda_device check_cuda_device.cu
+./check_cuda_device
+
+- example with MPI support:
+
+nvcc -DWITH_MPI -I/usr/lib/openmpi/include -o check_cuda_device check_cuda_device.cu -lmpi -L/usr/lib/openmpi/lib
+mpirun -np 2 ./check_cuda_device
+
+
+*/
+
+#include <stdio.h>
+#include <cuda.h>
+
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void get_free_memory(double* free_db, double* used_db, double* total_db) {
+
+ // gets memory usage in byte
+ size_t free_byte ;
+ size_t total_byte ;
+ cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
+ if ( cudaSuccess != cuda_status ){
+ printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
+ exit(EXIT_FAILURE);
+ }
+
+ *free_db = (double)free_byte ;
+ *total_db = (double)total_byte ;
+ *used_db = *total_db - *free_db ;
+ return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void exit_on_error(char* info) {
+ printf("\nERROR: %s\n",info);
+ fflush(stdout);
+
+ // stops program
+#ifdef WITH_MPI
+ MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+ //free(info);
+ exit(EXIT_FAILURE);
+ return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void exit_on_cuda_error(char* kernel_name) {
+ // sync and check to catch errors from previous async operations
+ cudaThreadSynchronize();
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess){
+ printf("Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
+
+ // releases previous contexts
+#if CUDA_VERSION < 4000
+ cudaThreadExit();
+#else
+ cudaDeviceReset();
+#endif
+
+ // stops program
+ //free(kernel_name);
+#ifdef WITH_MPI
+ MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+ exit(EXIT_FAILURE);
+ }
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// GPU initialization
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+void initialize_cuda_device(int* myrank_f,int* ncuda_devices) {
+
+ int device;
+ int device_count;
+
+ // Gets rank number of MPI process
+ int myrank = *myrank_f;
+
+ /*
+ // cuda initialization (needs -lcuda library)
+ // note: cuInit initializes the driver API.
+ // it is needed for any following CUDA driver API function call (format cuFUNCTION(..) )
+ // however, for the CUDA runtime API functions (format cudaFUNCTION(..) )
+ // the initialization is implicit, thus cuInit() here would not be needed...
+ CUresult status = cuInit(0);
+ if ( CUDA_SUCCESS != status ) exit_on_error("CUDA driver API device initialization failed\n");
+
+ // returns a handle to the first cuda compute device
+ CUdevice dev;
+ status = cuDeviceGet(&dev, 0);
+ if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device not found\n");
+
+ // gets device properties
+ int major,minor;
+ status = cuDeviceComputeCapability(&major,&minor,dev);
+ if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device information not found\n");
+
+ // make sure that the device has compute capability >= 1.3
+ if (major < 1){
+ fprintf(stderr,"Compute capability major number should be at least 1, got: %d \nexiting...\n",major);
+ exit_on_error("CUDA Compute capability major number should be at least 1\n");
+ }
+ if (major == 1 && minor < 3){
+ fprintf(stderr,"Compute capability should be at least 1.3, got: %d.%d \nexiting...\n",major,minor);
+ exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
+ }
+ */
+
+ // note: from here on we use the runtime API ...
+
+ // Gets number of GPU devices
+ device_count = 0;
+ cudaGetDeviceCount(&device_count);
+ exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\ncheck if driver and runtime libraries work together\nexiting...\n");
+
+ // returns device count to fortran
+ if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
+ *ncuda_devices = device_count;
+
+ // Sets the active device
+ if(device_count >= 1) {
+ // generalized for more GPUs per node
+ // note: without previous context release, cudaSetDevice will complain with the cuda error
+ // "setting the device when a process is active is not allowed"
+
+ // releases previous contexts
+#if CUDA_VERSION < 4000
+ cudaThreadExit();
+#else
+ cudaDeviceReset();
+#endif
+
+ //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count);
+ //MPI_Barrier(MPI_COMM_WORLD);
+
+ // sets active device
+ device = myrank % device_count;
+ cudaSetDevice( device );
+ exit_on_cuda_error("cudaSetDevice has invalid device");
+
+ // double check that device was properly selected
+ cudaGetDevice(&device);
+ if( device != (myrank % device_count) ){
+ printf("error rank: %d devices: %d \n",myrank,device_count);
+ printf(" cudaSetDevice()=%d\n cudaGetDevice()=%d\n",myrank%device_count,device);
+ exit_on_error("CUDA set/get device error: device id conflict \n");
+ }
+ }
+
+ // returns a handle to the active device
+ cudaGetDevice(&device);
+
+ // get device properties
+ struct cudaDeviceProp deviceProp;
+ cudaGetDeviceProperties(&deviceProp,device);
+
+ // exit if the machine has no CUDA-enabled device
+ if (deviceProp.major == 9999 && deviceProp.minor == 9999){
+ printf("No CUDA-enabled device found, exiting...\n\n");
+ exit_on_error("CUDA runtime error: there is no CUDA-enabled device found\n");
+ }
+
+ // outputs device infos to file
+
+ printf("GPU device for rank: %d\n\n",myrank);
+
+ // display device properties
+ printf("Device Name = %s\n",deviceProp.name);
+ printf("multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
+ printf("totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
+ printf("totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
+ printf("sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
+ printf("Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
+ printf("Maximum size of each dimension of a block: %d x %d x %d\n",
+ deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
+ printf("Maximum sizes of each dimension of a grid: %d x %d x %d\n",
+ deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
+ printf("Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
+ if(deviceProp.canMapHostMemory){
+ printf("canMapHostMemory: TRUE\n");
+ }else{
+ printf("canMapHostMemory: FALSE\n");
+ }
+ if(deviceProp.deviceOverlap){
+ printf("deviceOverlap: TRUE\n");
+ }else{
+ printf("deviceOverlap: FALSE\n");
+ }
+
+ // outputs initial memory infos via cudaMemGetInfo()
+ double free_db,used_db,total_db;
+ get_free_memory(&free_db,&used_db,&total_db);
+ printf("\n%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n\n",myrank,
+ used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+
+ // make sure that the device has compute capability >= 1.3
+ if (deviceProp.major < 1){
+ printf("Compute capability major number should be at least 1, exiting...\n\n");
+ exit_on_error("CUDA Compute capability major number should be at least 1\n");
+ }
+ if (deviceProp.major == 1 && deviceProp.minor < 3){
+ printf("Compute capability should be at least 1.3, exiting...\n");
+ exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
+ }
+ // we use pinned memory for asynchronous copy
+ if( ! deviceProp.canMapHostMemory){
+ printf("Device capability should allow to map host memory, exiting...\n");
+ exit_on_error("CUDA Device capability canMapHostMemory should be TRUE\n");
+ }
+}
+
+
+int main(int argc, char **argv)
+{
+
+ int myrank,ndevices;
+
+ // initialize
+#ifdef WITH_MPI
+ int size;
+ int rc;
+ rc = MPI_Init(&argc,&argv);
+ if (rc != MPI_SUCCESS) {
+ printf ("Error starting MPI program. Terminating.\n");
+ MPI_Abort(MPI_COMM_WORLD, rc);
+ }
+ MPI_Comm_size(MPI_COMM_WORLD,&size);
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+ if( myrank == 0 ){ printf ("Number of MPI processes = %d \n",size); }
+#else
+ myrank = 0;
+#endif
+
+ ndevices = 0;
+
+ // initializes cuda devices
+ initialize_cuda_device(&myrank,&ndevices);
+
+ // releases previous contexts
+#if CUDA_VERSION < 4000
+ cudaThreadExit();
+#else
+ cudaDeviceReset();
+#endif
+
+ printf("number of total devices: %d\n\n",ndevices);
+
+#ifdef WITH_MPI
+ MPI_Finalize();
+#endif
+ return 0;
+}
+
More information about the CIG-COMMITS
mailing list