[cig-commits] r19050 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src: cuda generate_databases specfem3D
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Fri Oct 7 18:09:46 PDT 2011
Author: danielpeter
Date: 2011-10-07 18:09:46 -0700 (Fri, 07 Oct 2011)
New Revision: 19050
Added:
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
Modified:
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_mass_matrices.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_acoustic.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_elastic.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
Log:
updates acoustic/elastic coupling and oceans routines
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
@@ -657,9 +685,9 @@
i = threadIdx.x;
j = threadIdx.y;
k = threadIdx.z;
- iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
+ iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
- //kappal = kappastore[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+ //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
//potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
// pre_computed_irec_local_index[irec],
Added: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu (rev 0)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -0,0 +1,326 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+#include <mpi.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC - ELASTIC coupling
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_coupling_acoustic_el_kernel(float* displ,
+ float* potential_dot_dot_acoustic,
+ int num_coupling_ac_el_faces,
+ int* coupling_ac_el_ispec,
+ int* coupling_ac_el_ijk,
+ float* coupling_ac_el_normal,
+ float* coupling_ac_el_jacobian2Dw,
+ int* ibool,
+ int* ispec_is_inner,
+ int phase_is_inner) {
+
+ int igll = threadIdx.x;
+ int iface = blockIdx.x + gridDim.x*blockIdx.y;
+
+ int i,j,k,iglob,ispec;
+ realw displ_x,displ_y,displ_z,displ_n;
+ realw nx,ny,nz;
+ realw jacobianw;
+
+ if( iface < num_coupling_ac_el_faces){
+
+ // don't compute points outside NGLLSQUARE==NGLL2==25
+ // way 2: no further check needed since blocksize = 25
+ // if(igll<NGLL2) {
+
+ // "-1" from index values to convert from Fortran-> C indexing
+ ispec = coupling_ac_el_ispec[iface]-1;
+
+ if(ispec_is_inner[ispec] == phase_is_inner ) {
+
+ i = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1;
+ j = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+ k = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+ iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]- 1;
+
+ // elastic displacement on global point
+ displ_x = displ[iglob*3] ; // (1,iglob)
+ displ_y = displ[iglob*3+1] ; // (2,iglob)
+ displ_z = displ[iglob*3+2] ; // (3,iglob)
+
+ // gets associated normal on GLL point
+ nx = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; // (1,igll,iface)
+ ny = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; // (2,igll,iface)
+ nz = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; // (3,igll,iface)
+
+ // calculates displacement component along normal
+ // (normal points outwards of acoustic element)
+ displ_n = displ_x*nx + displ_y*ny + displ_z*nz;
+
+
+ // gets associated, weighted jacobian
+ jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
+
+ //daniel
+ //if( igll == 0 ) printf("gpu: %i %i %i %i %i %e \n",i,j,k,ispec,iglob,jacobianw);
+
+
+ // continuity of pressure and normal displacement on global point
+
+ // note: newark time scheme together with definition of scalar potential:
+ // pressure = - chi_dot_dot
+ // requires that this coupling term uses the updated displacement at time step [t+delta_t],
+ // which is done at the very beginning of the time loop
+ // (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...)
+ // it also means you have to calculate and update this here first before
+ // calculating the coupling on the elastic side for the acceleration...
+ atomicAdd(&potential_dot_dot_acoustic[iglob],+ jacobianw*displ_n);
+
+ }
+ // }
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_coupling_acoustic_el_cuda,
+ COMPUTE_COUPLING_ACOUSTIC_EL_CUDA)(
+ long* Mesh_pointer_f,
+ int* phase_is_innerf,
+ int* num_coupling_ac_el_facesf,
+ int* SIMULATION_TYPEf) {
+ TRACE("compute_coupling_acoustic_el_cuda");
+ //double start_time = get_time();
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ int phase_is_inner = *phase_is_innerf;
+ int num_coupling_ac_el_faces = *num_coupling_ac_el_facesf;
+ int SIMULATION_TYPE = *SIMULATION_TYPEf;
+
+ // way 1: exact blocksize to match NGLLSQUARE
+ int blocksize = 25;
+
+ int num_blocks_x = num_coupling_ac_el_faces;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_y = num_blocks_y*2;
+ }
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+//daniel
+// printf("gpu: %i %i %i \n",num_coupling_ac_el_faces,SIMULATION_TYPE,phase_is_inner);
+
+
+ compute_coupling_acoustic_el_kernel<<<grid,threads>>>(mp->d_displ,
+ mp->d_potential_dot_dot_acoustic,
+ num_coupling_ac_el_faces,
+ mp->d_coupling_ac_el_ispec,
+ mp->d_coupling_ac_el_ijk,
+ mp->d_coupling_ac_el_normal,
+ mp->d_coupling_ac_el_jacobian2Dw,
+ mp->d_ibool,
+ mp->d_ispec_is_inner,
+ phase_is_inner);
+
+ // adjoint simulations
+ if (SIMULATION_TYPE == 3 ){
+ compute_coupling_acoustic_el_kernel<<<grid,threads>>>(mp->d_b_displ,
+ mp->d_b_potential_dot_dot_acoustic,
+ num_coupling_ac_el_faces,
+ mp->d_coupling_ac_el_ispec,
+ mp->d_coupling_ac_el_ijk,
+ mp->d_coupling_ac_el_normal,
+ mp->d_coupling_ac_el_jacobian2Dw,
+ mp->d_ibool,
+ mp->d_ispec_is_inner,
+ phase_is_inner);
+
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("compute_coupling_acoustic_el_kernel");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC - ACOUSTIC coupling
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_coupling_elastic_ac_kernel(float* potential_dot_dot_acoustic,
+ float* accel,
+ int num_coupling_ac_el_faces,
+ int* coupling_ac_el_ispec,
+ int* coupling_ac_el_ijk,
+ float* coupling_ac_el_normal,
+ float* coupling_ac_el_jacobian2Dw,
+ int* ibool,
+ int* ispec_is_inner,
+ int phase_is_inner) {
+
+ int igll = threadIdx.x;
+ int iface = blockIdx.x + gridDim.x*blockIdx.y;
+
+ int i,j,k,iglob,ispec;
+ realw pressure;
+ realw nx,ny,nz;
+ realw jacobianw;
+
+ if( iface < num_coupling_ac_el_faces){
+
+ // don't compute points outside NGLLSQUARE==NGLL2==25
+ // way 2: no further check needed since blocksize = 25
+ // if(igll<NGLL2) {
+
+ // "-1" from index values to convert from Fortran-> C indexing
+ ispec = coupling_ac_el_ispec[iface]-1;
+
+ if(ispec_is_inner[ispec] == phase_is_inner ) {
+
+ i = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1;
+ j = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+ k = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+ iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]- 1;
+
+ // acoustic pressure on global point
+ pressure = - potential_dot_dot_acoustic[iglob];
+
+ // gets associated normal on GLL point
+ nx = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; // (1,igll,iface)
+ ny = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; // (2,igll,iface)
+ nz = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; // (3,igll,iface)
+
+ // gets associated, weighted jacobian
+ jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
+
+ //daniel
+ //if( igll == 0 ) printf("gpu: %i %i %i %i %i %e \n",i,j,k,ispec,iglob,jacobianw);
+
+
+ // continuity of displacement and pressure on global point
+ //
+ // note: newark time scheme together with definition of scalar potential:
+ // pressure = - chi_dot_dot
+ // requires that this coupling term uses the *UPDATED* pressure (chi_dot_dot), i.e.
+ // pressure at time step [t + delta_t]
+ // (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...)
+ // it means you have to calculate and update the acoustic pressure first before
+ // calculating this term...
+ atomicAdd(&accel[iglob*3],+ jacobianw*nx*pressure);
+ atomicAdd(&accel[iglob*3+1],+ jacobianw*ny*pressure);
+ atomicAdd(&accel[iglob*3+2],+ jacobianw*nz*pressure);
+ }
+ // }
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_coupling_elastic_ac_cuda,
+ COMPUTE_COUPLING_ELASTIC_AC_CUDA)(
+ long* Mesh_pointer_f,
+ int* phase_is_innerf,
+ int* num_coupling_ac_el_facesf,
+ int* SIMULATION_TYPEf) {
+ TRACE("compute_coupling_elastic_ac_cuda");
+ //double start_time = get_time();
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ int phase_is_inner = *phase_is_innerf;
+ int num_coupling_ac_el_faces = *num_coupling_ac_el_facesf;
+ int SIMULATION_TYPE = *SIMULATION_TYPEf;
+
+ // way 1: exact blocksize to match NGLLSQUARE
+ int blocksize = 25;
+
+ int num_blocks_x = num_coupling_ac_el_faces;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_y = num_blocks_y*2;
+ }
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ //daniel
+ // printf("gpu: %i %i %i \n",num_coupling_ac_el_faces,SIMULATION_TYPE,phase_is_inner);
+
+
+ compute_coupling_elastic_ac_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+ mp->d_accel,
+ num_coupling_ac_el_faces,
+ mp->d_coupling_ac_el_ispec,
+ mp->d_coupling_ac_el_ijk,
+ mp->d_coupling_ac_el_normal,
+ mp->d_coupling_ac_el_jacobian2Dw,
+ mp->d_ibool,
+ mp->d_ispec_is_inner,
+ phase_is_inner);
+
+ // adjoint simulations
+ if (SIMULATION_TYPE == 3 ){
+ compute_coupling_elastic_ac_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
+ mp->d_b_accel,
+ num_coupling_ac_el_faces,
+ mp->d_coupling_ac_el_ispec,
+ mp->d_coupling_ac_el_ijk,
+ mp->d_coupling_ac_el_normal,
+ mp->d_coupling_ac_el_jacobian2Dw,
+ mp->d_ibool,
+ mp->d_ispec_is_inner,
+ phase_is_inner);
+
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("compute_coupling_elastic_ac_cuda");
+#endif
+}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,32 +1,31 @@
/*
-!=====================================================================
-!
-! S p e c f e m 3 D V e r s i o n 2 . 0
-! ---------------------------------------
-!
-! Main authors: Dimitri Komatitsch and Jeroen Tromp
-! Princeton University, USA and University of Pau / CNRS / INRIA
-! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
-! April 2011
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-*/
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
-
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
@@ -51,8 +50,6 @@
int* d_ibool_interfaces_ext_mesh) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- //int bx = blockIdx.y*gridDim.x+blockIdx.x;
- //int tx = threadIdx.x;
int iinterface=0;
for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
@@ -144,8 +141,6 @@
int* d_ibool_interfaces_ext_mesh) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- //int bx = blockIdx.y*gridDim.x+blockIdx.x;
- //int tx = threadIdx.x;
int iinterface=0;
for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
@@ -186,11 +181,19 @@
TRACE("transfer_and_assemble_potential_to_device");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+ //double start_time = get_time();
+ // cudaEvent_t start, stop;
+ // float time;
+ // cudaEventCreate(&start);
+ // cudaEventCreate(&stop);
+ // cudaEventRecord( start, 0 );
+ // copies buffer onto GPU
cudaMemcpy(mp->d_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,
*max_nibool_interfaces_ext_mesh* *num_interfaces_ext_mesh*sizeof(real), cudaMemcpyHostToDevice);
- int blocksize = 256;
+ // assembles on GPU
+ int blocksize = 256;
int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
@@ -201,12 +204,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- //double start_time = get_time();
- // cudaEvent_t start, stop;
- // float time;
- // cudaEventCreate(&start);
- // cudaEventCreate(&stop);
- // cudaEventRecord( start, 0 );
+
if(*FORWARD_OR_ADJOINT == 1) {
//assemble forward field
assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
@@ -226,13 +224,15 @@
mp->d_ibool_interfaces_ext_mesh);
}
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
// cudaEventRecord( stop, 0 );
// cudaEventSynchronize( stop );
// cudaEventElapsedTime( &time, start, stop );
// cudaEventDestroy( start );
// cudaEventDestroy( stop );
// printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
exit_on_cuda_error("transfer_and_assemble_potential_to_device");
#endif
}
@@ -267,6 +267,7 @@
int* SIMULATION_TYPE) {
TRACE("compute_forces_acoustic_cuda");
+ //double start_time = get_time();
Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
@@ -284,12 +285,19 @@
Kernel_2_acoustic(num_elements, mp, *iphase, *SIMULATION_TYPE);
cudaThreadSynchronize();
-/* MPI_Barrier(MPI_COMM_WORLD); */
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ /* MPI_Barrier(MPI_COMM_WORLD); */
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+#endif
}
/* ----------------------------------------------------------------------------------------------- */
+
/* KERNEL 2 */
+
/* ----------------------------------------------------------------------------------------------- */
void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase, int SIMULATION_TYPE)
@@ -358,12 +366,13 @@
#endif
}
+/* ----------------------------------------------------------------------------------------------- */
/* KERNEL 2 on device*/
-//typedef double reald;
-//typedef float reald;
+/* ----------------------------------------------------------------------------------------------- */
+
__global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
int* d_phase_ispec_inner_acoustic,
int num_phase_ispec_acoustic, int d_iphase,
@@ -374,15 +383,9 @@
float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
float* d_rhostore){
- /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
int bx = blockIdx.y*gridDim.x+blockIdx.x;
- /* int bx = blockIdx.x; */
int tx = threadIdx.x;
-
-
- //const int NGLLX = 5;
- // const int NGLL2 = 25;
const int NGLL3 = 125;
const int NGLL3_ALIGN = 128;
@@ -466,23 +469,23 @@
}
#else
- temp1l = s_dummy_loc[K*NGLL2+J*NGLLX]*hprime_xx[I]
- + s_dummy_loc[K*NGLL2+J*NGLLX+1]*hprime_xx[NGLLX+I]
- + s_dummy_loc[K*NGLL2+J*NGLLX+2]*hprime_xx[2*NGLLX+I]
- + s_dummy_loc[K*NGLL2+J*NGLLX+3]*hprime_xx[3*NGLLX+I]
- + s_dummy_loc[K*NGLL2+J*NGLLX+4]*hprime_xx[4*NGLLX+I];
-
- temp2l = s_dummy_loc[K*NGLL2+I]*hprime_xx[J]
- + s_dummy_loc[K*NGLL2+NGLLX+I]*hprime_xx[NGLLX+J]
- + s_dummy_loc[K*NGLL2+2*NGLLX+I]*hprime_xx[2*NGLLX+J]
- + s_dummy_loc[K*NGLL2+3*NGLLX+I]*hprime_xx[3*NGLLX+J]
- + s_dummy_loc[K*NGLL2+4*NGLLX+I]*hprime_xx[4*NGLLX+J];
+ temp1l = s_dummy_loc[K*NGLL2+J*NGLLX]*hprime_xx[I]
+ + s_dummy_loc[K*NGLL2+J*NGLLX+1]*hprime_xx[NGLLX+I]
+ + s_dummy_loc[K*NGLL2+J*NGLLX+2]*hprime_xx[2*NGLLX+I]
+ + s_dummy_loc[K*NGLL2+J*NGLLX+3]*hprime_xx[3*NGLLX+I]
+ + s_dummy_loc[K*NGLL2+J*NGLLX+4]*hprime_xx[4*NGLLX+I];
+
+ temp2l = s_dummy_loc[K*NGLL2+I]*hprime_xx[J]
+ + s_dummy_loc[K*NGLL2+NGLLX+I]*hprime_xx[NGLLX+J]
+ + s_dummy_loc[K*NGLL2+2*NGLLX+I]*hprime_xx[2*NGLLX+J]
+ + s_dummy_loc[K*NGLL2+3*NGLLX+I]*hprime_xx[3*NGLLX+J]
+ + s_dummy_loc[K*NGLL2+4*NGLLX+I]*hprime_xx[4*NGLLX+J];
- temp3l = s_dummy_loc[J*NGLLX+I]*hprime_xx[K]
- + s_dummy_loc[NGLL2+J*NGLLX+I]*hprime_xx[NGLLX+K]
- + s_dummy_loc[2*NGLL2+J*NGLLX+I]*hprime_xx[2*NGLLX+K]
- + s_dummy_loc[3*NGLL2+J*NGLLX+I]*hprime_xx[3*NGLLX+K]
- + s_dummy_loc[4*NGLL2+J*NGLLX+I]*hprime_xx[4*NGLLX+K];
+ temp3l = s_dummy_loc[J*NGLLX+I]*hprime_xx[K]
+ + s_dummy_loc[NGLL2+J*NGLLX+I]*hprime_xx[NGLLX+K]
+ + s_dummy_loc[2*NGLL2+J*NGLLX+I]*hprime_xx[2*NGLLX+K]
+ + s_dummy_loc[3*NGLL2+J*NGLLX+I]*hprime_xx[3*NGLLX+K]
+ + s_dummy_loc[4*NGLL2+J*NGLLX+I]*hprime_xx[4*NGLLX+K];
#endif
@@ -546,25 +549,25 @@
}
#else
- temp1l = s_temp1[K*NGLL2+J*NGLLX]*hprimewgll_xx[I*NGLLX]
- + s_temp1[K*NGLL2+J*NGLLX+1]*hprimewgll_xx[I*NGLLX+1]
- + s_temp1[K*NGLL2+J*NGLLX+2]*hprimewgll_xx[I*NGLLX+2]
- + s_temp1[K*NGLL2+J*NGLLX+3]*hprimewgll_xx[I*NGLLX+3]
- + s_temp1[K*NGLL2+J*NGLLX+4]*hprimewgll_xx[I*NGLLX+4];
-
+ temp1l = s_temp1[K*NGLL2+J*NGLLX]*hprimewgll_xx[I*NGLLX]
+ + s_temp1[K*NGLL2+J*NGLLX+1]*hprimewgll_xx[I*NGLLX+1]
+ + s_temp1[K*NGLL2+J*NGLLX+2]*hprimewgll_xx[I*NGLLX+2]
+ + s_temp1[K*NGLL2+J*NGLLX+3]*hprimewgll_xx[I*NGLLX+3]
+ + s_temp1[K*NGLL2+J*NGLLX+4]*hprimewgll_xx[I*NGLLX+4];
+
- temp2l = s_temp2[K*NGLL2+I]*hprimewgll_xx[J*NGLLX]
- + s_temp2[K*NGLL2+NGLLX+I]*hprimewgll_xx[J*NGLLX+1]
- + s_temp2[K*NGLL2+2*NGLLX+I]*hprimewgll_xx[J*NGLLX+2]
- + s_temp2[K*NGLL2+3*NGLLX+I]*hprimewgll_xx[J*NGLLX+3]
- + s_temp2[K*NGLL2+4*NGLLX+I]*hprimewgll_xx[J*NGLLX+4];
+ temp2l = s_temp2[K*NGLL2+I]*hprimewgll_xx[J*NGLLX]
+ + s_temp2[K*NGLL2+NGLLX+I]*hprimewgll_xx[J*NGLLX+1]
+ + s_temp2[K*NGLL2+2*NGLLX+I]*hprimewgll_xx[J*NGLLX+2]
+ + s_temp2[K*NGLL2+3*NGLLX+I]*hprimewgll_xx[J*NGLLX+3]
+ + s_temp2[K*NGLL2+4*NGLLX+I]*hprimewgll_xx[J*NGLLX+4];
- temp3l = s_temp3[J*NGLLX+I]*hprimewgll_xx[K*NGLLX]
- + s_temp3[NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+1]
- + s_temp3[2*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+2]
- + s_temp3[3*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+3]
- + s_temp3[4*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+4];
+ temp3l = s_temp3[J*NGLLX+I]*hprimewgll_xx[K*NGLLX]
+ + s_temp3[NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+1]
+ + s_temp3[2*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+2]
+ + s_temp3[3*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+3]
+ + s_temp3[4*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+4];
#endif
@@ -582,7 +585,7 @@
// d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
// d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
- atomicAdd(&d_potential_dot_dot_acoustic[iglob],-(fac1*temp1l + fac2*temp2l + fac3*temp3l));
+ atomicAdd(&d_potential_dot_dot_acoustic[iglob],-(fac1*temp1l + fac2*temp2l + fac3*temp3l));
#endif
}
@@ -594,7 +597,9 @@
/* ----------------------------------------------------------------------------------------------- */
+
/* KERNEL 3 */
+
/* ----------------------------------------------------------------------------------------------- */
@@ -717,11 +722,11 @@
/* ----------------------------------------------------------------------------------------------- */
+
/* KERNEL for enforce free surface */
+
/* ----------------------------------------------------------------------------------------------- */
-#define INDEX(i,j,k,ispec) i + (j)*5 + (k)*25 + (ispec)*125
-#define INDEX_IJK(x,y,z) x + (y)*3 + (z)*3*25
__global__ void enforce_free_surface_cuda_kernel(
float* potential_acoustic,
@@ -748,17 +753,17 @@
if( ispec_is_acoustic[ispec] == 1 ){
// gets global point index
- int tx = threadIdx.x + threadIdx.y*blockDim.x;
+ int igll = threadIdx.x + threadIdx.y*blockDim.x;
//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-// if( tx > 25-1 ){printf("device tx: %i \n",tx);}
+// if( igll > 25-1 ){printf("device igll: %i \n",igll);}
//#endif
- int i = free_surface_ijk[INDEX_IJK(0,tx,iface)] - 1;
- int j = free_surface_ijk[INDEX_IJK(1,tx,iface)] - 1;
- int k = free_surface_ijk[INDEX_IJK(2,tx,iface)] - 1;
+ int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
+ int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+ int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
- int iglob = ibool[INDEX(i,j,k,ispec)] - 1;
+ int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
// sets potentials to zero at free surface
potential_acoustic[iglob] = 0;
@@ -766,7 +771,7 @@
potential_dot_dot_acoustic[iglob] = 0;
//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-// if( ispec == 160 && tx < 25 ){printf("device: %i %i %i %i %i \n",tx,i,j,k,iglob);}
+// if( ispec == 160 && igll < 25 ){printf("device: %i %i %i %i %i \n",igll,i,j,k,iglob);}
//#endif
}
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
@@ -44,9 +72,6 @@
float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
float* alphaval,float* betaval,float* gammaval);
-__global__ void kernel_3_cuda_device(real* veloc,
- real* accel, int size,
- real deltatover2, real* rmass);
/* ----------------------------------------------------------------------------------------------- */
@@ -264,6 +289,7 @@
TRACE("compute_forces_elastic_cuda");
// EPIK_TRACER("compute_forces_elastic_cuda");
//printf("Running compute_forces\n");
+ //double start_time = get_time();
Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
@@ -278,11 +304,15 @@
/* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
/* if(myrank==0) { */
- Kernel_2(num_elements,mp,*iphase,*COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION);
+ Kernel_2(num_elements,mp,*iphase,*COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION);
+ cudaThreadSynchronize();
- cudaThreadSynchronize();
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
/* MPI_Barrier(MPI_COMM_WORLD); */
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+#endif
}
@@ -1037,64 +1067,155 @@
/* ----------------------------------------------------------------------------------------------- */
+__global__ void kernel_3_cuda_device(real* veloc,
+ real* accel, int size,
+ real deltatover2, real* rmass) {
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+ /* because of block and grid sizing problems, there is a small */
+ /* amount of buffer at the end of the calculation */
+ if(id < size) {
+ accel[3*id] = accel[3*id]*rmass[id];
+ accel[3*id+1] = accel[3*id+1]*rmass[id];
+ accel[3*id+2] = accel[3*id+2]*rmass[id];
+
+ veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+ veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+ veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_accel_cuda_device(real* accel,
+ int size,
+ real* rmass) {
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+ /* because of block and grid sizing problems, there is a small */
+ /* amount of buffer at the end of the calculation */
+ if(id < size) {
+ accel[3*id] = accel[3*id]*rmass[id];
+ accel[3*id+1] = accel[3*id+1]*rmass[id];
+ accel[3*id+2] = accel[3*id+2]*rmass[id];
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_veloc_cuda_device(real* veloc,
+ real* accel,
+ int size,
+ real deltatover2) {
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+ /* because of block and grid sizing problems, there is a small */
+ /* amount of buffer at the end of the calculation */
+ if(id < size) {
+ veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+ veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+ veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
extern "C"
-void FC_FUNC_(kernel_3_cuda,
- KERNEL_3_CUDA)(long* Mesh_pointer,
- int* size_F,
- float* deltatover2_F,
- int* SIMULATION_TYPE_f,
- float* b_deltatover2) {
-TRACE("kernel_3_cuda");
+void FC_FUNC_(kernel_3_a_cuda,
+ KERNEL_3_A_CUDA)(long* Mesh_pointer,
+ int* size_F,
+ float* deltatover2_F,
+ int* SIMULATION_TYPE_f,
+ float* b_deltatover2_F,
+ int* OCEANS) {
+TRACE("kernel_3_a_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
int size = *size_F;
int SIMULATION_TYPE = *SIMULATION_TYPE_f;
real deltatover2 = *deltatover2_F;
+ real b_deltatover2 = *b_deltatover2_F;
+
int blocksize=128;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
num_blocks_x = ceil(num_blocks_x/2.0);
num_blocks_y = num_blocks_y*2;
}
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc, mp->d_accel, size, deltatover2, mp->d_rmass);
+ // check whether we can update accel and veloc, or only accel at this point
+ if( *OCEANS == 0 ){
+ // updates both, accel and veloc
+ kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc, mp->d_accel, size, deltatover2, mp->d_rmass);
- if(SIMULATION_TYPE == 3) {
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc, mp->d_b_accel, size, *b_deltatover2,mp->d_rmass);
+ if(SIMULATION_TYPE == 3) {
+ kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc, mp->d_b_accel, size, b_deltatover2,mp->d_rmass);
+ }
+ }else{
+ // updates only accel
+ kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel, size, mp->d_rmass);
+
+ if(SIMULATION_TYPE == 3) {
+ kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel, size, mp->d_rmass);
+ }
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
- exit_on_cuda_error("after kernel 3");
+ exit_on_cuda_error("after kernel 3 a");
#endif
}
/* ----------------------------------------------------------------------------------------------- */
- __global__ void kernel_3_cuda_device(real* veloc,
- real* accel, int size,
- real deltatover2, real* rmass) {
- int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+extern "C"
+void FC_FUNC_(kernel_3_b_cuda,
+ KERNEL_3_B_CUDA)(long* Mesh_pointer,
+ int* size_F,
+ float* deltatover2_F,
+ int* SIMULATION_TYPE_f,
+ float* b_deltatover2_F) {
+ TRACE("kernel_3_b_cuda");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+ int size = *size_F;
+ int SIMULATION_TYPE = *SIMULATION_TYPE_f;
+ real deltatover2 = *deltatover2_F;
+ real b_deltatover2 = *b_deltatover2_F;
+
+ int blocksize=128;
+ int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
- if(id < size) {
- accel[3*id] = accel[3*id]*rmass[id];
- accel[3*id+1] = accel[3*id+1]*rmass[id];
- accel[3*id+2] = accel[3*id+2]*rmass[id];
-
- veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
- veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
- veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
- }
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_y = num_blocks_y*2;
}
-/* ----------------------------------------------------------------------------------------------- */
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ // updates only veloc at this point
+ kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_veloc,mp->d_accel,size,deltatover2);
+
+ if(SIMULATION_TYPE == 3) {
+ kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_b_veloc,mp->d_b_accel,size,b_deltatover2);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+ exit_on_cuda_error("after kernel 3 b");
+#endif
+}
+
/* ----------------------------------------------------------------------------------------------- */
/* note:
@@ -1241,3 +1362,129 @@
}
}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* KERNEL for ocean load on free surface */
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void elastic_ocean_load_cuda_kernel(float* accel,
+ float* rmass,
+ float* rmass_ocean_load,
+ int num_free_surface_faces,
+ int* free_surface_ispec,
+ int* free_surface_ijk,
+ float* free_surface_normal,
+ int* ibool,
+ int* updated_dof_ocean_load) {
+ // gets spectral element face id
+ int igll = threadIdx.x ; // threadIdx.y*blockDim.x will be always = 0 for thread block (25,1,1)
+ int iface = blockIdx.x + gridDim.x*blockIdx.y;
+ realw nx,ny,nz;
+ realw force_normal_comp,additional_term;
+
+ // for all faces on free surface
+ if( iface < num_free_surface_faces ){
+
+ int ispec = free_surface_ispec[iface]-1;
+
+ // gets global point index
+ int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
+ int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+ int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+
+ int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+
+ // only update this global point once
+ // daniel: todo - workaround to not use the temporary update array
+ // atomicExch returns the old value, i.e. 0 indicates that we still have to do this point
+ if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){
+
+ // get normal
+ nx = free_surface_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; //(1,igll,iface)
+ ny = free_surface_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
+ nz = free_surface_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];
+
+ // make updated component of right-hand side
+ // we divide by rmass() which is 1 / M
+ // we use the total force which includes the Coriolis term above
+ force_normal_comp = ( accel[iglob*3]*nx + accel[iglob*3+1]*ny + accel[iglob*3+2]*nz ) / rmass[iglob];
+
+ additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp;
+
+ // daniel: probably wouldn't need atomicAdd anymore, but just to be sure...
+ atomicAdd(&accel[iglob*3], + additional_term * nx);
+ atomicAdd(&accel[iglob*3+1], + additional_term * ny);
+ atomicAdd(&accel[iglob*3+2], + additional_term * nz);
+ }
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(elastic_ocean_load_cuda,
+ ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
+ int* SIMULATION_TYPE) {
+
+TRACE("elastic_ocean_load_cuda");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ // checks if anything to do
+ if( mp->num_free_surface_faces == 0 ) return;
+
+ // block sizes: exact blocksize to match NGLLSQUARE
+ int blocksize = 25;
+
+ int num_blocks_x = mp->num_free_surface_faces;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_y = num_blocks_y*2;
+ }
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ // temporary global array: used to synchronize updates on global accel array
+ int* d_updated_dof_ocean_load;
+ print_CUDA_error_if_any(cudaMalloc((void**)&(d_updated_dof_ocean_load),sizeof(int)*mp->NGLOB_AB),88501);
+ // initializes array
+ cudaMemset((void*)d_updated_dof_ocean_load,0,sizeof(int)*mp->NGLOB_AB);
+
+ elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_accel,
+ mp->d_rmass,
+ mp->d_rmass_ocean_load,
+ mp->num_free_surface_faces,
+ mp->d_free_surface_ispec,
+ mp->d_free_surface_ijk,
+ mp->d_free_surface_normal,
+ mp->d_ibool,
+ d_updated_dof_ocean_load);
+ // for backward/reconstructed potentials
+ if(*SIMULATION_TYPE == 3) {
+ // re-initializes array
+ cudaMemset(d_updated_dof_ocean_load,0,sizeof(int)*mp->NGLOB_AB);
+
+ elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
+ mp->d_rmass,
+ mp->d_rmass_ocean_load,
+ mp->num_free_surface_faces,
+ mp->d_free_surface_ispec,
+ mp->d_free_surface_ijk,
+ mp->d_free_surface_normal,
+ mp->d_ibool,
+ d_updated_dof_ocean_load);
+
+ }
+
+ cudaFree(d_updated_dof_ocean_load);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("enforce_free_surface_cuda");
+#endif
+}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
@@ -185,15 +213,15 @@
int ispec = free_surface_ispec[iface]-1;
int igll = threadIdx.x;
int ipoin = igll + 25*iface;
- int i = free_surface_ijk[INDEX3(3,25,0,igll,iface)]-1;
- int j = free_surface_ijk[INDEX3(3,25,0,igll,iface)]-1;
- int k = free_surface_ijk[INDEX3(3,25,0,igll,iface)]-1;
+ int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+ int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+ int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
- float eta = (noise_surface_movie[INDEX3(3,25,0,igll,iface)]*normal_x_noise[ipoin]+
- noise_surface_movie[INDEX3(3,25,1,igll,iface)]*normal_y_noise[ipoin]+
- noise_surface_movie[INDEX3(3,25,2,igll,iface)]*normal_z_noise[ipoin]);
+ float eta = ( noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x_noise[ipoin]+
+ noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y_noise[ipoin]+
+ noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z_noise[ipoin]);
// if(ijk_ispec == 78496) {
// d_debug[0] = Sigma_kl[ijk_ispec];
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
@@ -16,30 +44,33 @@
float* potential_dot_dot_acoustic,
int* abs_boundary_ispec,
int* abs_boundary_ijk,
+ real* abs_boundary_jacobian2Dw,
int* ibool,
float* rhostore,
float* kappastore,
- real* abs_boundary_jacobian2Dw,
int* ispec_is_inner,
int* ispec_is_acoustic,
int phase_is_inner,
int SIMULATION_TYPE, int SAVE_FORWARD,
+ int num_abs_boundary_faces,
float* b_potential_dot_acoustic,
float* b_potential_dot_dot_acoustic,
- float* b_absorb_potential // ,float* debug_val,int* debug_val_int
+ float* b_absorb_potential
) {
- int igll = threadIdx.x; // tx
- int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx
-
+ int igll = threadIdx.x;
+ int iface = blockIdx.x + gridDim.x*blockIdx.y;
int i,j,k,iglob,ispec;
realw rhol,kappal,cpl;
realw jacobianw;
+
// don't compute points outside NGLLSQUARE==NGLL2==25
// way 2: no further check needed since blocksize = 25
- // if(igll<NGLL2) {
+ if( iface < num_abs_boundary_faces){
+
+ // if(igll<NGLL2 && iface < num_abs_boundary_faces) {
// "-1" from index values to convert from Fortran-> C indexing
ispec = abs_boundary_ispec[iface]-1;
@@ -49,12 +80,12 @@
i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
- iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
+ iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
// determines bulk sound speed
rhol = rhostore[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
- kappal = kappastore[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+ kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
cpl = sqrt( kappal / rhol );
@@ -77,7 +108,7 @@
}
// }
-
+ }
}
/* ----------------------------------------------------------------------------------------------- */
@@ -87,7 +118,6 @@
COMPUTE_STACEY_ACOUSTIC_CUDA)(
long* Mesh_pointer_f,
int* phase_is_innerf,
- int* num_abs_boundary_facesf,
int* SIMULATION_TYPEf,
int* SAVE_FORWARDf,
float* h_b_absorb_potential) {
@@ -96,7 +126,6 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
int phase_is_inner = *phase_is_innerf;
- int num_abs_boundary_faces = *num_abs_boundary_facesf;
int SIMULATION_TYPE = *SIMULATION_TYPEf;
int SAVE_FORWARD = *SAVE_FORWARDf;
@@ -108,7 +137,7 @@
// > NGLLSQUARE==NGLL2==25, no further check inside kernel
int blocksize = 25;
- int num_blocks_x = num_abs_boundary_faces;
+ int num_blocks_x = mp->d_num_abs_boundary_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
num_blocks_x = ceil(num_blocks_x/2.0);
@@ -119,33 +148,34 @@
dim3 threads(blocksize,1,1);
// adjoint simulations: reads in absorbing boundary
- if (SIMULATION_TYPE == 3 && num_abs_boundary_faces > 0 ){
+ if (SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0 ){
// copies array to GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,h_b_absorb_potential,
- mp->d_b_reclen_potential,cudaMemcpyHostToDevice),700);
+ mp->d_b_reclen_potential,cudaMemcpyHostToDevice),7700);
}
compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_acoustic,
mp->d_potential_dot_dot_acoustic,
mp->d_abs_boundary_ispec,
mp->d_abs_boundary_ijk,
+ mp->d_abs_boundary_jacobian2Dw,
mp->d_ibool,
mp->d_rhostore,
mp->d_kappastore,
- mp->d_abs_boundary_jacobian2Dw,
mp->d_ispec_is_inner,
mp->d_ispec_is_acoustic,
phase_is_inner,
SIMULATION_TYPE,SAVE_FORWARD,
+ mp->d_num_abs_boundary_faces,
mp->d_b_potential_dot_acoustic,
mp->d_b_potential_dot_dot_acoustic,
mp->d_b_absorb_potential);
// adjoint simulations: stores absorbed wavefield part
- if (SIMULATION_TYPE == 1 && SAVE_FORWARD == 1 && num_abs_boundary_faces > 0 ){
+ if (SIMULATION_TYPE == 1 && SAVE_FORWARD == 1 && mp->d_num_abs_boundary_faces > 0 ){
// copies array to CPU
print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_potential,mp->d_b_absorb_potential,
- mp->d_b_reclen_potential,cudaMemcpyDeviceToHost),701);
+ mp->d_b_reclen_potential,cudaMemcpyDeviceToHost),7701);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
@@ -14,17 +42,23 @@
__global__ void compute_stacey_elastic_kernel(real* veloc,
real* accel,
- real* b_accel,
int* abs_boundary_ispec,
- int* abs_boundary_ijk, int* ibool,
+ int* abs_boundary_ijk,
real* abs_boundary_normal,
- real* rho_vp, real* rho_vs,
real* abs_boundary_jacobian2Dw,
- real* b_absorb_field,
- int* ispec_is_inner, int* ispec_is_elastic,
- int phase_is_inner,float* debug_val,int* debug_val_int,
+ int* ibool,
+ real* rho_vp,
+ real* rho_vs,
+ int* ispec_is_inner,
+ int* ispec_is_elastic,
+ int phase_is_inner,
+ int SIMULATION_TYPE,int SAVE_FORWARD,
int num_abs_boundary_faces,
- int SAVE_FORWARD,int SIMULATION_TYPE) {
+ real* b_accel,
+ real* b_absorb_field,
+ float* debug_val,
+ int* debug_val_int
+ ) {
int igll = threadIdx.x; // tx
int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx
@@ -39,9 +73,13 @@
realw tx,ty,tz;
realw jacobianw;
- // don't compute points outside NGLLSQUARE == NGLL2 == 25
- if(igll < NGLL2 && iface < num_abs_boundary_faces) {
+
+ // don't compute points outside NGLLSQUARE==NGLL2==25
+ // way 2: no further check needed since blocksize = 25
+ if( iface < num_abs_boundary_faces){
+ //if(igll < NGLL2 && iface < num_abs_boundary_faces) {
+
// "-1" from index values to convert from Fortran-> C indexing
ispec = abs_boundary_ispec[iface]-1;
i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
@@ -82,19 +120,18 @@
atomicAdd(&accel[iglob*3+2],-tz*jacobianw);
if(SIMULATION_TYPE == 3) {
- atomicAdd(&b_accel[iglob*3 ],-b_absorb_field[0+3*(igll+25*(iface))]);
- atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[1+3*(igll+25*(iface))]);
- atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[2+3*(igll+25*(iface))]);
+ atomicAdd(&b_accel[iglob*3 ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
+ atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
+ atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
}
else if(SAVE_FORWARD && SIMULATION_TYPE == 1) {
- b_absorb_field[0+3*(igll+25*(iface))] = tx*jacobianw;
- b_absorb_field[1+3*(igll+25*(iface))] = ty*jacobianw;
- b_absorb_field[2+3*(igll+25*(iface))] = tz*jacobianw;
+ b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)] = tx*jacobianw;
+ b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)] = ty*jacobianw;
+ b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)] = tz*jacobianw;
}
}
}
-
}
/* ----------------------------------------------------------------------------------------------- */
@@ -103,38 +140,28 @@
extern "C"
void FC_FUNC_(compute_stacey_elastic_cuda,
COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
- int* NSPEC_ABf,
- int* NGLOB_ABf,
int* phase_is_innerf,
- int* num_abs_boundary_facesf,
int* SIMULATION_TYPEf,
- int* NSTEPf,
- int* NGLOB_ADJOINTf,
- int* b_num_abs_boundary_facesf,
- int* b_reclen_fieldf,
- float* b_absorb_field,
- int* SAVE_FORWARDf,
- int* itf) {
+ int* SAVE_FORWARDf,
+ float* h_b_absorb_field) {
TRACE("compute_stacey_elastic_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- //int fid = 0;
- //int it = *itf;
- //int NSPEC_AB = *NSPEC_ABf;
- //int NGLOB_AB = *NGLOB_ABf;
- int phase_is_inner = *phase_is_innerf;
- int num_abs_boundary_faces = *num_abs_boundary_facesf;
- int SIMULATION_TYPE = *SIMULATION_TYPEf;
- //int NSTEP = *NSTEPf;
- int myrank; MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
- //int NGLOB_ADJOINT = *NGLOB_ADJOINTf;
- //int b_num_abs_boundary_faces = *b_num_abs_boundary_facesf;
- int b_reclen_field = *b_reclen_fieldf;
- int SAVE_FORWARD = *SAVE_FORWARDf;
- int blocksize = 32; // > NGLL2=25, but we handle this inside kernel
- int num_blocks_x = num_abs_boundary_faces;
+ int phase_is_inner = *phase_is_innerf;
+ int SIMULATION_TYPE = *SIMULATION_TYPEf;
+ int SAVE_FORWARD = *SAVE_FORWARDf;
+
+ // way 1
+ // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+ //int blocksize = 32;
+
+ // way 2: seems sligthly faster
+ // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+ int blocksize = 25;
+
+ int num_blocks_x = mp->d_num_abs_boundary_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
num_blocks_x = ceil(num_blocks_x/2.0);
@@ -147,30 +174,36 @@
float* d_debug_val;
int* d_debug_val_int;
- if(SIMULATION_TYPE == 3 && num_abs_boundary_faces > 0) {
+ if(SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0) {
// int val = NSTEP-it+1;
// read_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&val);
// The read is done in fortran
- cudaMemcpy(mp->d_b_absorb_field,b_absorb_field,b_reclen_field,cudaMemcpyHostToDevice);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,h_b_absorb_field,
+ mp->d_b_reclen_field,cudaMemcpyHostToDevice),7700);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("between cudamemcpy and compute_stacey_elastic_kernel");
#endif
- compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc,mp->d_accel,mp->d_b_accel,
- mp->d_abs_boundary_ispec, mp->d_abs_boundary_ijk,
- mp->d_ibool,
+ compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc,
+ mp->d_accel,
+ mp->d_abs_boundary_ispec,
+ mp->d_abs_boundary_ijk,
mp->d_abs_boundary_normal,
- mp->d_rho_vp, mp->d_rho_vs,
mp->d_abs_boundary_jacobian2Dw,
- mp->d_b_absorb_field,
+ mp->d_ibool,
+ mp->d_rho_vp,
+ mp->d_rho_vs,
mp->d_ispec_is_inner,
mp->d_ispec_is_elastic,
phase_is_inner,
- d_debug_val,d_debug_val_int,
- num_abs_boundary_faces,
- SAVE_FORWARD,SIMULATION_TYPE);
+ SIMULATION_TYPE,SAVE_FORWARD,
+ mp->d_num_abs_boundary_faces,
+ mp->d_b_accel,
+ mp->d_b_absorb_field,
+ d_debug_val,
+ d_debug_val_int);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_stacey_elastic_kernel");
@@ -180,8 +213,9 @@
// if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) &
// write(IOABS,rec=it) b_reclen_field,b_absorb_field,b_reclen_field
- if(SIMULATION_TYPE==1 && SAVE_FORWARD && num_abs_boundary_faces>0) {
- cudaMemcpy(b_absorb_field,mp->d_b_absorb_field,b_reclen_field,cudaMemcpyDeviceToHost);
+ if(SIMULATION_TYPE==1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces>0) {
+ print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_field,mp->d_b_absorb_field,
+ mp->d_b_reclen_field,cudaMemcpyDeviceToHost),7701);
// The write is done in fortran
// write_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&it);
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#ifndef GPU_MESH_
#define GPU_MESH_
#include <sys/types.h>
@@ -28,8 +56,13 @@
#define PRINT5(var,offset) // for(i=0;i<10;i++) printf("var(%d)=%f\n",i,var[offset+i]);
#endif
+
#define ENABLE_VERY_SLOW_ERROR_CHECKING
+/* ----------------------------------------------------------------------------------------------- */
+
+// indexing
+
#define INDEX2(xsize,x,y) x + (y)*xsize
#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
@@ -44,6 +77,8 @@
//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*j)))
+/* ----------------------------------------------------------------------------------------------- */
+
#define MAX(x,y) (((x) < (y)) ? (y) : (x))
double get_time();
@@ -122,15 +157,18 @@
int* d_ibool_interfaces_ext_mesh;
//used for absorbing stacey boundaries
+ int d_num_abs_boundary_faces;
int* d_abs_boundary_ispec;
int* d_abs_boundary_ijk;
float* d_abs_boundary_normal;
- float* d_rho_vp;
- float* d_rho_vs;
float* d_abs_boundary_jacobian2Dw;
+
float* d_b_absorb_field;
- int b_num_abs_boundary_faces;
+ int d_b_reclen_field;
+ float* d_rho_vp;
+ float* d_rho_vs;
+
// inner / outer elements
int* d_ispec_is_inner;
int* d_ispec_is_elastic;
@@ -214,6 +252,10 @@
// noise sensitivity kernel
float* d_Sigma_kl;
+ // oceans
+ float* d_rmass_ocean_load;
+ float* d_free_surface_normal;
+
// ------------------------------------------------------------------ //
// acoustic wavefield
// ------------------------------------------------------------------ //
@@ -244,6 +286,13 @@
float* d_rho_ac_kl;
float* d_kappa_ac_kl;
+ // coupling acoustic-elastic
+ int* d_coupling_ac_el_ispec;
+ int* d_coupling_ac_el_ijk;
+ float* d_coupling_ac_el_normal;
+ float* d_coupling_ac_el_jacobian2Dw;
+
+
} Mesh;
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
@@ -21,15 +49,15 @@
if(iface < num_free_surface_faces) {
int ispec = free_surface_ispec[iface]-1; //-1 for C-based indexing
- int i = free_surface_ijk[0+3*(igll + 25*(iface))]-1;
- int j = free_surface_ijk[1+3*(igll + 25*(iface))]-1;
- int k = free_surface_ijk[2+3*(igll + 25*(iface))]-1;
+ int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+ int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+ int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
- noise_surface_movie[INDEX3(3,25,0,igll,iface)] = displ[iglob*3];
- noise_surface_movie[INDEX3(3,25,1,igll,iface)] = displ[iglob*3+1];
- noise_surface_movie[INDEX3(3,25,2,igll,iface)] = displ[iglob*3+2];
+ noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)] = displ[iglob*3];
+ noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)] = displ[iglob*3+1];
+ noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)] = displ[iglob*3+2];
}
}
@@ -141,9 +169,9 @@
int igll = threadIdx.x;
int ipoin = 25*iface + igll;
- int i=free_surface_ijk[0+3*(igll + 25*(iface))]-1;
- int j=free_surface_ijk[1+3*(igll + 25*(iface))]-1;
- int k=free_surface_ijk[2+3*(igll + 25*(iface))]-1;
+ int i=free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+ int j=free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+ int k=free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
@@ -151,9 +179,9 @@
real normal_y = normal_y_noise[ipoin];
real normal_z = normal_z_noise[ipoin];
- real eta = (noise_surface_movie[INDEX3(3,25,0,igll,iface)]*normal_x +
- noise_surface_movie[INDEX3(3,25,1,igll,iface)]*normal_y +
- noise_surface_movie[INDEX3(3,25,2,igll,iface)]*normal_z);
+ real eta = (noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x +
+ noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y +
+ noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z);
// error from cuda-memcheck and ddt seems "incorrect", because we
// are passing a __constant__ variable pointer around like it was
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,21 +1,45 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#ifndef CUDA_HEADER_H
#define CUDA_HEADER_H
+
/* CUDA specific things from specfem3D_kernels.cu */
-//#define NGLL2 25
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
#ifdef USE_TEXTURES
-// declaration of textures
-texture<float, 1, cudaReadModeElementType> tex_displ;
-texture<float, 1, cudaReadModeElementType> tex_accel;
+ // declaration of textures
+ texture<float, 1, cudaReadModeElementType> tex_displ;
+ texture<float, 1, cudaReadModeElementType> tex_accel;
-texture<float, 1, cudaReadModeElementType> tex_potential_acoustic;
-texture<float, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
+ texture<float, 1, cudaReadModeElementType> tex_potential_acoustic;
+ texture<float, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
-// for binding the textures
+ // for binding the textures
void bindTexturesDispl(float* d_displ)
{
@@ -92,7 +116,5 @@
void setConst_wgllwgll_xz(float* array, Mesh* mp);
void setConst_wgllwgll_yz(float* array, Mesh* mp);
-//void exit_on_cuda_error(char* kernel_name);
-//void show_free_memory(char* info_str);
#endif //CUDA_HEADER_H
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
@@ -87,7 +115,7 @@
{
if (cudaSuccess != err)
{
- printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call # %d\n",cudaGetErrorString(err),num);
+ printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
fflush(stdout);
#ifdef USE_MPI
MPI_Abort(MPI_COMM_WORLD,1);
@@ -217,12 +245,12 @@
float* h_hprimewgll_xx,
float* h_wgllwgll_xy,
float* h_wgllwgll_xz,
- float* h_wgllwgll_yz,
+ float* h_wgllwgll_yz,
+ int* ABSORBING_CONDITIONS,
int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
float* h_abs_boundary_normal,
float* h_abs_boundary_jacobian2Dw,
- float* h_b_absorb_field,
- int* num_abs_boundary_faces, int* b_num_abs_boundary_faces,
+ int* h_num_abs_boundary_faces,
int* h_ispec_is_inner,
int* NSOURCES,
float* h_sourcearrays,
@@ -328,29 +356,30 @@
// absorbing boundaries
- if( *num_abs_boundary_faces > 0 ){
+ mp->d_num_abs_boundary_faces = *h_num_abs_boundary_faces;
+ if( *ABSORBING_CONDITIONS == 1 && mp->d_num_abs_boundary_faces > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_ispec,
- (*num_abs_boundary_faces)*sizeof(int)),1101);
+ (mp->d_num_abs_boundary_faces)*sizeof(int)),1101);
print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ispec, h_abs_boundary_ispec,
- (*num_abs_boundary_faces)*sizeof(int),
+ (mp->d_num_abs_boundary_faces)*sizeof(int),
cudaMemcpyHostToDevice),1102);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_ijk,
- 3*25*(*num_abs_boundary_faces)*sizeof(int)),1103);
+ 3*25*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103);
print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk,
- 3*25*(*num_abs_boundary_faces)*sizeof(int),
+ 3*25*(mp->d_num_abs_boundary_faces)*sizeof(int),
cudaMemcpyHostToDevice),1104);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_normal,
- 3*25*(*num_abs_boundary_faces)*sizeof(int)),1105);
+ 3*25*(mp->d_num_abs_boundary_faces)*sizeof(int)),1105);
print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal,
- 3*25*(*num_abs_boundary_faces)*sizeof(int),
+ 3*25*(mp->d_num_abs_boundary_faces)*sizeof(int),
cudaMemcpyHostToDevice),1106);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_jacobian2Dw,
- 25*(*num_abs_boundary_faces)*sizeof(float)),1107);
+ 25*(mp->d_num_abs_boundary_faces)*sizeof(float)),1107);
print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw,
- 25*(*num_abs_boundary_faces)*sizeof(float),
+ 25*(mp->d_num_abs_boundary_faces)*sizeof(float),
cudaMemcpyHostToDevice),1108);
}
@@ -476,7 +505,13 @@
float* b_absorb_potential,
int* SIMULATION_TYPE,
float* rho_ac_kl,
- float* kappa_ac_kl
+ float* kappa_ac_kl,
+ int* ELASTIC_SIMULATION,
+ int* num_coupling_ac_el_faces,
+ int* coupling_ac_el_ispec,
+ int* coupling_ac_el_ijk,
+ float* coupling_ac_el_normal,
+ float* coupling_ac_el_jacobian2Dw
) {
TRACE("prepare_fields_acoustic_device");
@@ -567,7 +602,32 @@
mp->nrec_local*125*sizeof(float)),9107);
mp->h_station_seismo_potential = (float*)malloc(mp->nrec_local*125*sizeof(float));
if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
+
+
+ // coupling with elastic parts
+ if( *ELASTIC_SIMULATION == 1 && *num_coupling_ac_el_faces > 0 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ispec),
+ (*num_coupling_ac_el_faces)*sizeof(int)),9601);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec,
+ (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9602);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk),
+ 3*25*(*num_coupling_ac_el_faces)*sizeof(int)),9603);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,
+ 3*25*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9604);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal),
+ 3*25*(*num_coupling_ac_el_faces)*sizeof(float)),9605);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
+ 3*25*(*num_coupling_ac_el_faces)*sizeof(float),cudaMemcpyHostToDevice),9606);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw),
+ 25*(*num_coupling_ac_el_faces)*sizeof(float)),9607);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
+ 25*(*num_coupling_ac_el_faces)*sizeof(float),cudaMemcpyHostToDevice),9608);
+ }
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("prepare_fields_acoustic_device");
#endif
@@ -592,7 +652,7 @@
int* ispec_is_elastic,
int* ABSORBING_CONDITIONS,
float* h_b_absorb_field,
- int* b_num_abs_boundary_faces,
+ int* h_b_reclen_field,
int* SIMULATION_TYPE,
float* rho_kl,
float* mu_kl,
@@ -609,8 +669,9 @@
float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
float* one_minus_sum_beta,float* factor_common,
float* alphaval,float* betaval,float* gammaval,
- float* b_alphaval,float* b_betaval,float* b_gammaval
- ){
+ float* b_alphaval,float* b_betaval,float* b_gammaval,
+ int* OCEANS,float* rmass_ocean_load,
+ float* free_surface_normal,int* num_free_surface_faces){
TRACE("prepare_fields_elastic_device");
@@ -656,12 +717,12 @@
mp->h_station_seismo_field = (float*)malloc(3*125*(mp->nrec_local)*sizeof(float));
// absorbing conditions
- if( *ABSORBING_CONDITIONS == 1 ){
- mp->b_num_abs_boundary_faces = *b_num_abs_boundary_faces;
+ if( *ABSORBING_CONDITIONS == 1 && mp->d_num_abs_boundary_faces > 0){
+ mp->d_b_reclen_field = *h_b_reclen_field;
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field),
- 3*25*mp->b_num_abs_boundary_faces*sizeof(float)),8016);
+ mp->d_b_reclen_field),8016);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
- 3*25*mp->b_num_abs_boundary_faces*sizeof(float),cudaMemcpyHostToDevice),8017);
+ mp->d_b_reclen_field,cudaMemcpyHostToDevice),8017);
}
// kernel simulations
@@ -846,11 +907,21 @@
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
N_SLS*sizeof(float),cudaMemcpyHostToDevice),8439);
}
+ }
+
+ if( *OCEANS == 1 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),sizeof(float)*mp->NGLOB_AB),8501);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
+ sizeof(float)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8502);
+
+ mp->num_free_surface_faces = *num_free_surface_faces;
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
+ 3*25*(mp->num_free_surface_faces)*sizeof(float)),8503);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
+ 3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice),8504);
-
}
-
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("prepare_fields_elastic_device");
#endif
@@ -963,7 +1034,6 @@
extern "C"
void FC_FUNC_(prepare_cleanup_device,
PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
- int* num_abs_boundary_faces,
int* SIMULATION_TYPE,
int* ACOUSTIC_SIMULATION,
int* ELASTIC_SIMULATION,
@@ -991,7 +1061,7 @@
cudaFree(mp->d_muv);
// absorbing boundaries
- if( *num_abs_boundary_faces > 0 ){
+ if( *ABSORBING_CONDITIONS == 1 && mp->d_num_abs_boundary_faces > 0 ){
cudaFree(mp->d_abs_boundary_ispec);
cudaFree(mp->d_abs_boundary_ijk);
cudaFree(mp->d_abs_boundary_normal);
@@ -1064,7 +1134,7 @@
cudaFree(mp->d_ispec_is_elastic);
cudaFree(mp->d_station_seismo_field);
- if( *ABSORBING_CONDITIONS == 1 ) cudaFree(mp->d_b_absorb_field);
+ if( *ABSORBING_CONDITIONS == 1 && mp->d_num_abs_boundary_faces > 0) cudaFree(mp->d_b_absorb_field);
if( *SIMULATION_TYPE == 3 ) {
cudaFree(mp->d_b_displ);
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
@@ -8,12 +36,6 @@
#include "config.h"
#include "mesh_constants_cuda.h"
-//#define INDEX2(xsize,x,y) x + (y)*xsize
-//#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
-//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
-//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize
-//
-//#define ENABLE_VERY_SLOW_ERROR_CHECKING
/* ----------------------------------------------------------------------------------------------- */
@@ -49,10 +71,7 @@
}
}
-/* ----------------------------------------------------------------------------------------------- */
-//extern "C" void pause_for_debuger(int);
-
/* ----------------------------------------------------------------------------------------------- */
void transfer_field_from_device(Mesh* mp, float* d_field,float* h_field,
@@ -223,11 +242,7 @@
mp->d_station_seismo_potential,
d_potential);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("transfer_stations_fields_acoustic_from_device_kernel");
-#endif
-
+
print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_potential,mp->d_station_seismo_potential,
mp->nrec_local*125*sizeof(float),cudaMemcpyDeviceToHost),500);
@@ -249,6 +264,9 @@
//memcpy(&(h_potential[iglob]),&(mp->h_station_seismo_potential[irec_local*125]),125*sizeof(float));
}
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("transfer_field_acoustic_from_device");
+#endif
}
/* ----------------------------------------------------------------------------------------------- */
@@ -313,6 +331,7 @@
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//double end_time = get_time();
//printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("transfer_station_fields_acoustic_from_device");
#endif
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_mass_matrices.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_mass_matrices.f90 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_mass_matrices.f90 2011-10-08 01:09:46 UTC (rev 19050)
@@ -184,8 +184,8 @@
do igll=1,NGLLSQUARE
ix_oceans = free_surface_ijk(1,igll,ispec2D)
- iy_oceans = free_surface_ijk(1,igll,ispec2D)
- iz_oceans = free_surface_ijk(1,igll,ispec2D)
+ iy_oceans = free_surface_ijk(2,igll,ispec2D)
+ iz_oceans = free_surface_ijk(3,igll,ispec2D)
iglobnum=ibool(ix_oceans,iy_oceans,iz_oceans,ispec_oceans)
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in 2011-10-08 01:09:46 UTC (rev 19050)
@@ -134,6 +134,7 @@
CUDA_OBJECTS = \
$O/check_fields_cuda.cuda.o \
$O/compute_add_sources_cuda.cuda.o \
+ $O/compute_coupling_cuda.cuda.o \
$O/compute_forces_acoustic_cuda.cuda.o \
$O/compute_forces_elastic_cuda.cuda.o \
$O/compute_kernels_cuda.cuda.o \
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90 2011-10-08 01:09:46 UTC (rev 19050)
@@ -215,46 +215,30 @@
endif
! elastic coupling
- if(ELASTIC_SIMULATION ) then
-
- ! transfers potentials to CPU
- if(GPU_MODE) then
- call transfer_fields_acoustic_from_device(NGLOB_AB,potential_acoustic, &
- potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
- call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc,accel, Mesh_pointer)
- ! backward simulation
- if( SIMULATION_TYPE == 3 ) then
- call transfer_fields_acoustic_from_device(NGLOB_AB,b_potential_acoustic, &
- b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
- call transfer_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer)
- endif
- endif
-
- call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, &
+ if(ELASTIC_SIMULATION ) then
+ if( .NOT. GPU_MODE ) then
+ call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, &
ibool,displ,potential_dot_dot_acoustic, &
num_coupling_ac_el_faces, &
coupling_ac_el_ispec,coupling_ac_el_ijk, &
coupling_ac_el_normal, &
coupling_ac_el_jacobian2Dw, &
ispec_is_inner,phase_is_inner)
- ! adjoint simulations
- if( SIMULATION_TYPE == 3 ) &
- call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, &
+ ! adjoint simulations
+ if( SIMULATION_TYPE == 3 ) &
+ call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, &
ibool,b_displ,b_potential_dot_dot_acoustic, &
num_coupling_ac_el_faces, &
coupling_ac_el_ispec,coupling_ac_el_ijk, &
coupling_ac_el_normal, &
coupling_ac_el_jacobian2Dw, &
ispec_is_inner,phase_is_inner)
-
- ! transfers potentials to CPU
- if(GPU_MODE) then
- ! only potential_dot_dot_acoustic/b_potential_dot_dot_acoustic is updated above
- call transfer_fields_acoustic_to_device(NGLOB_AB,potential_acoustic, &
- potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
- if( SIMULATION_TYPE == 3 ) &
- call transfer_fields_acoustic_to_device(NGLOB_AB,b_potential_acoustic, &
- b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
+ else
+ ! on GPU
+ if( num_coupling_ac_el_faces > 0 ) &
+ call compute_coupling_acoustic_el_cuda(Mesh_pointer,phase_is_inner, &
+ num_coupling_ac_el_faces,SIMULATION_TYPE)
+
endif
endif
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90 2011-10-08 01:09:46 UTC (rev 19050)
@@ -452,23 +452,9 @@
GPU_MODE,Mesh_pointer)
! acoustic coupling
- if( ACOUSTIC_SIMULATION ) then
-
- ! daniel: workaround - todo on GPU
- ! transfers potentials to CPU
- if(GPU_MODE) then
- call transfer_fields_acoustic_from_device(NGLOB_AB,potential_acoustic, &
- potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
- call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc,accel,Mesh_pointer)
- ! backward simulation
- if( SIMULATION_TYPE == 3 ) then
- call transfer_fields_acoustic_from_device(NGLOB_AB,b_potential_acoustic, &
- b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
- call transfer_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer)
- endif
- endif
-
- call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, &
+ if( ACOUSTIC_SIMULATION ) then
+ if( .NOT. GPU_MODE ) then
+ call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, &
ibool,accel,potential_dot_dot_acoustic, &
num_coupling_ac_el_faces, &
coupling_ac_el_ispec,coupling_ac_el_ijk, &
@@ -476,25 +462,22 @@
coupling_ac_el_jacobian2Dw, &
ispec_is_inner,phase_is_inner)
- ! adjoint simulations
- if( SIMULATION_TYPE == 3 ) &
- call compute_coupling_elastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
+ ! adjoint simulations
+ if( SIMULATION_TYPE == 3 ) &
+ call compute_coupling_elastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
ibool,b_accel,b_potential_dot_dot_acoustic, &
num_coupling_ac_el_faces, &
coupling_ac_el_ispec,coupling_ac_el_ijk, &
coupling_ac_el_normal, &
coupling_ac_el_jacobian2Dw, &
ispec_is_inner,phase_is_inner)
-
- ! daniel: workaround - todo on GPU
- ! transfers potentials to CPU
- if(GPU_MODE) then
- ! only accel/b_accel is updated above
- call transfer_fields_to_device(NDIM*NGLOB_AB,displ,veloc,accel, Mesh_pointer)
- if( SIMULATION_TYPE == 3 ) &
- call transfer_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
+ else
+ ! on GPU
+ if( num_coupling_ac_el_faces > 0 ) &
+ call compute_coupling_elastic_ac_cuda(Mesh_pointer,phase_is_inner, &
+ num_coupling_ac_el_faces,SIMULATION_TYPE)
+
endif
-
endif
@@ -615,16 +598,21 @@
b_accel(3,:) = b_accel(3,:)*rmass(:)
endif !adjoint
else ! GPU_MODE == 1
- call kernel_3_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2)
+ call kernel_3_a_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2,OCEANS)
endif
! updates acceleration with ocean load term
if(OCEANS) then
- call elastic_ocean_load(NSPEC_AB,NGLOB_AB, &
+ if( .NOT. GPU_MODE ) then
+ call elastic_ocean_load(NSPEC_AB,NGLOB_AB, &
ibool,rmass,rmass_ocean_load,accel, &
free_surface_normal,free_surface_ijk,free_surface_ispec, &
num_free_surface_faces,SIMULATION_TYPE, &
NGLOB_ADJOINT,b_accel)
+ else
+ ! on GPU
+ call elastic_ocean_load_cuda(Mesh_pointer,SIMULATION_TYPE)
+ endif
endif
! updates velocities
@@ -648,6 +636,8 @@
veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
! adjoint simulations
if (SIMULATION_TYPE == 3) b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
+ else ! GPU_MODE == 1
+ if( OCEANS ) call kernel_3_b_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2)
endif
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_acoustic.f90 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_acoustic.f90 2011-10-08 01:09:46 UTC (rev 19050)
@@ -76,6 +76,9 @@
real(kind=CUSTOM_REAL) :: rhol,cpl,jacobianw,absorbl
integer :: ispec,iglob,i,j,k,iface,igll
!integer:: reclen1,reclen2
+
+ ! checks if anything to do
+ if( num_abs_boundary_faces == 0 ) return
! adjoint simulations:
if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0) then
@@ -144,8 +147,7 @@
else
! GPU_MODE == .true.
call compute_stacey_acoustic_cuda(Mesh_pointer, phase_is_inner, &
- num_abs_boundary_faces, SIMULATION_TYPE, &
- SAVE_FORWARD,b_absorb_potential)
+ SIMULATION_TYPE,SAVE_FORWARD,b_absorb_potential)
endif
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_elastic.f90 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_elastic.f90 2011-10-08 01:09:46 UTC (rev 19050)
@@ -84,6 +84,8 @@
integer :: ispec,iglob,i,j,k,iface,igll
!integer:: reclen1,reclen2
+ ! checks if anything to do
+ if( num_abs_boundary_faces == 0 ) return
! adjoint simulations:
if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0) then
@@ -162,10 +164,8 @@
else
! GPU_MODE == .true.
- call compute_stacey_elastic_cuda(Mesh_pointer, NSPEC_AB, NGLOB_AB, phase_is_inner,&
- num_abs_boundary_faces, SIMULATION_TYPE, NSTEP, NGLOB_ADJOINT,&
- b_num_abs_boundary_faces, b_reclen_field,b_absorb_field, &
- SAVE_FORWARD,it)
+ call compute_stacey_elastic_cuda(Mesh_pointer,phase_is_inner, &
+ SIMULATION_TYPE,SAVE_FORWARD,b_absorb_field)
endif
! adjoint simulations: stores absorbed wavefield part
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90 2011-10-08 01:09:46 UTC (rev 19050)
@@ -659,7 +659,7 @@
! frees allocated memory on GPU
- call prepare_cleanup_device(Mesh_pointer,num_abs_boundary_faces, &
+ call prepare_cleanup_device(Mesh_pointer, &
SIMULATION_TYPE,ACOUSTIC_SIMULATION,ELASTIC_SIMULATION, &
ABSORBING_CONDITIONS,NOISE_TOMOGRAPHY,COMPUTE_AND_STORE_STRAIN, &
ATTENUATION)
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90 2011-10-08 01:09:46 UTC (rev 19050)
@@ -397,22 +397,31 @@
endif
if (NOISE_TOMOGRAPHY/=0) then
- ! save/read the surface movie using the same c routine as we do for absorbing boundaries (file ID is 2)
+ ! save/read the surface movie using the same c routine as we do for absorbing boundaries (file ID is 2)
- ! size of single record
- reclen=CUSTOM_REAL*NDIM*NGLLSQUARE*NSPEC_TOP
- ! total file size
- filesize = reclen
- filesize = filesize*NSTEP
+ ! size of single record
+ reclen=CUSTOM_REAL*NDIM*NGLLSQUARE*NSPEC_TOP
- write(outputname,"('/proc',i6.6,'_surface_movie')") myrank
- if (NOISE_TOMOGRAPHY==1) call open_file_abs_w(2,trim(LOCAL_PATH)//trim(outputname), &
+ ! check integer size limit: size of b_reclen_field must fit onto an 4-byte integer
+ if( NSPEC_TOP > 2147483647 / (CUSTOM_REAL * NGLLSQUARE * NDIM) ) then
+ print *,'reclen of noise surface_movie needed exceeds integer 4-byte limit: ',reclen
+ print *,' ',CUSTOM_REAL, NDIM, NGLLSQUARE, NSPEC_TOP
+ print*,'bit size fortran: ',bit_size(NSPEC_TOP)
+ call exit_MPI(myrank,"error NSPEC_TOP integer limit")
+ endif
+
+ ! total file size
+ filesize = reclen
+ filesize = filesize*NSTEP
+
+ write(outputname,"('/proc',i6.6,'_surface_movie')") myrank
+ if (NOISE_TOMOGRAPHY==1) call open_file_abs_w(2,trim(LOCAL_PATH)//trim(outputname), &
len_trim(trim(LOCAL_PATH)//trim(outputname)), &
filesize)
- if (NOISE_TOMOGRAPHY==2) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
+ if (NOISE_TOMOGRAPHY==2) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
len_trim(trim(LOCAL_PATH)//trim(outputname)), &
filesize)
- if (NOISE_TOMOGRAPHY==3) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
+ if (NOISE_TOMOGRAPHY==3) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
len_trim(trim(LOCAL_PATH)//trim(outputname)), &
filesize)
endif
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-10-08 01:09:46 UTC (rev 19050)
@@ -541,6 +541,14 @@
! size of single record
b_reclen_field = CUSTOM_REAL * NDIM * NGLLSQUARE * num_abs_boundary_faces
+
+ ! check integer size limit: size of b_reclen_field must fit onto an 4-byte integer
+ if( num_abs_boundary_faces > 2147483647 / (CUSTOM_REAL * NDIM * NGLLSQUARE) ) then
+ print *,'reclen needed exceeds integer 4-byte limit: ',b_reclen_field
+ print *,' ',CUSTOM_REAL, NDIM, NGLLSQUARE, num_abs_boundary_faces
+ print*,'bit size fortran: ',bit_size(b_reclen_field)
+ call exit_MPI(myrank,"error b_reclen_field integer limit")
+ endif
! total file size
filesize = b_reclen_field
@@ -582,13 +590,21 @@
! size of single record
b_reclen_potential = CUSTOM_REAL * NGLLSQUARE * num_abs_boundary_faces
+
+ ! check integer size limit: size of b_reclen_potential must fit onto an 4-byte integer
+ if( num_abs_boundary_faces > 2147483647 / (CUSTOM_REAL * NGLLSQUARE) ) then
+ print *,'reclen needed exceeds integer 4-byte limit: ',b_reclen_potential
+ print *,' ',CUSTOM_REAL, NGLLSQUARE, num_abs_boundary_faces
+ print*,'bit size fortran: ',bit_size(b_reclen_potential)
+ call exit_MPI(myrank,"error b_reclen_potential integer limit")
+ endif
! total file size (two lines to implicitly convert to 8-byte integers)
filesize = b_reclen_potential
filesize = filesize*NSTEP
! daniel: debug check size limit
- !if( NSTEP > 2147483648 / b_reclen_potential ) then
+ !if( NSTEP > 2147483647 / b_reclen_potential ) then
! print *,'file size needed exceeds integer 4-byte limit: ',b_reclen_potential,NSTEP
! print *,' ',CUSTOM_REAL, NGLLSQUARE, num_abs_boundary_faces,NSTEP
! print*,'file size fortran: ',filesize
@@ -739,10 +755,11 @@
nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
hprime_xx, hprime_yy, hprime_zz, &
hprimewgll_xx, wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
+ ABSORBING_CONDITIONS, &
abs_boundary_ispec, abs_boundary_ijk, &
abs_boundary_normal, &
abs_boundary_jacobian2Dw, &
- b_absorb_field, num_abs_boundary_faces, b_num_abs_boundary_faces, &
+ num_abs_boundary_faces, &
ispec_is_inner, &
NSOURCES, sourcearrays, islice_selected_source, ispec_selected_source, &
number_receiver_global, ispec_selected_rec, nrec, nrec_local, &
@@ -755,7 +772,10 @@
ispec_is_acoustic, &
NOISE_TOMOGRAPHY,num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
ABSORBING_CONDITIONS,b_reclen_potential,b_absorb_potential, &
- SIMULATION_TYPE,rho_ac_kl,kappa_ac_kl)
+ SIMULATION_TYPE,rho_ac_kl,kappa_ac_kl, &
+ ELASTIC_SIMULATION, num_coupling_ac_el_faces, &
+ coupling_ac_el_ispec,coupling_ac_el_ijk, &
+ coupling_ac_el_normal,coupling_ac_el_jacobian2Dw)
! prepares fields on GPU for elastic simulations
if( ELASTIC_SIMULATION ) &
@@ -763,7 +783,7 @@
rmass,rho_vp,rho_vs, &
num_phase_ispec_elastic,phase_ispec_inner_elastic, &
ispec_is_elastic, &
- ABSORBING_CONDITIONS,b_absorb_field,b_num_abs_boundary_faces, &
+ ABSORBING_CONDITIONS,b_absorb_field,b_reclen_field, &
SIMULATION_TYPE,rho_kl,mu_kl,kappa_kl, &
COMPUTE_AND_STORE_STRAIN, &
epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
@@ -775,7 +795,9 @@
b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
one_minus_sum_beta,factor_common, &
alphaval,betaval,gammaval, &
- b_alphaval,b_betaval,b_gammaval)
+ b_alphaval,b_betaval,b_gammaval, &
+ OCEANS,rmass_ocean_load, &
+ free_surface_normal,num_free_surface_faces)
! prepares needed receiver array for adjoint runs
if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) &
@@ -819,9 +841,9 @@
! outputs usage
if( myrank == 0 ) then
call get_free_device_memory(free_mb,used_mb,total_mb)
- write(IMAIN,*)" GPU usage: free =",free_mb," MB"
- write(IMAIN,*)" used =",used_mb," MB"
- write(IMAIN,*)" total =",total_mb," MB"
+ write(IMAIN,*)" GPU usage: free =",free_mb," MB",nint(free_mb/total_mb*100.0),"%"
+ write(IMAIN,*)" used =",used_mb," MB",nint(used_mb/total_mb*100.0),"%"
+ write(IMAIN,*)" total =",total_mb," MB",nint(total_mb/total_mb*100.0),"%"
write(IMAIN,*)
endif
More information about the CIG-COMMITS
mailing list