[cig-commits] r21240 - seismo/3D/SPECFEM3D/trunk/src/specfem3D

Wed Jan 16 10:19:53 PST 2013

Author: dkomati1
Date: 2013-01-16 10:19:52 -0800 (Wed, 16 Jan 2013)
New Revision: 21240

Added:
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_po.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_Dev.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_noDev.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/older_not_maintained_compute_forces_viscoelastic_Dev_openmp.f90
Removed:
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_elastic.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_elastic_ac.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_elastic_po.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_Dev.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_calling_routine.F90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_noDev.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_elastic.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/older_compute_forces_elastic_Dev_openmp.f90
Modified:
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
Log:
renamed a few more files and subroutines for clarity; now done renaming


Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in
===================================================================

--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in	2013-01-16 18:19:52 UTC (rev 21240)
@@ -145,13 +145,13 @@
 CUDA_OBJECTS = \
 	$O/check_fields_cuda.cuda.o \
 	$O/compute_add_sources_acoustic_cuda.cuda.o \
-	$O/compute_add_sources_elastic_cuda.cuda.o \
+	$O/compute_add_sources_viscoelastic_cuda.cuda.o \
 	$O/compute_coupling_cuda.cuda.o \
 	$O/compute_forces_acoustic_cuda.cuda.o \
-	$O/compute_forces_elastic_cuda.cuda.o \
+	$O/compute_forces_viscoelastic_cuda.cuda.o \
 	$O/compute_kernels_cuda.cuda.o \
 	$O/compute_stacey_acoustic_cuda.cuda.o \
-	$O/compute_stacey_elastic_cuda.cuda.o \
+	$O/compute_stacey_viscoelastic_cuda.cuda.o \
 	$O/initialize_cuda.cuda.o \
 	$O/it_update_displacement_cuda.cuda.o \
 	$O/noise_tomography_cuda.cuda.o \
@@ -179,21 +179,21 @@
 	$O/fault_solver_dynamic.o \
 	$O/fault_solver_kinematic.o \
 	$O/compute_add_sources_acoustic.o \
-	$O/compute_add_sources_elastic.o \
+	$O/compute_add_sources_viscoelastic.o \
 	$O/compute_add_sources_poroelastic.o \
 	$O/compute_boundary_kernel.o \
 	$O/compute_coupling_acoustic_el.o \
 	$O/compute_coupling_acoustic_po.o \
-	$O/compute_coupling_elastic_ac.o \
-	$O/compute_coupling_elastic_po.o \
+	$O/compute_coupling_viscoelastic_ac.o \
+	$O/compute_coupling_viscoelastic_po.o \
 	$O/compute_coupling_poroelastic_ac.o \
 	$O/compute_coupling_poroelastic_el.o \
 	$O/compute_forces_acoustic_calling_routine.o \
 	$O/compute_forces_acoustic_PML.o \
 	$O/compute_forces_acoustic_pot_noDev.o \
-	$O/compute_forces_elastic_calling_routine.o \
-	$O/compute_forces_elastic_Dev.o \
-	$O/compute_forces_elastic_noDev.o \
+	$O/compute_forces_viscoelastic_calling_routine.o \
+	$O/compute_forces_viscoelastic_Dev.o \
+	$O/compute_forces_viscoelastic_noDev.o \
 	$O/compute_forces_poro_fluid_part.o \
 	$O/compute_forces_poroelastic.o \
 	$O/compute_forces_poro_solid_part.o \
@@ -201,7 +201,7 @@
 	$O/compute_interpolated_dva.o \
 	$O/compute_kernels.o \
 	$O/compute_stacey_acoustic.o \
-	$O/compute_stacey_elastic.o \
+	$O/compute_stacey_viscoelastic.o \
 	$O/compute_stacey_poroelastic.o \
 	$O/create_color_image.o \
 	$O/detect_mesh_surfaces.o \
@@ -264,7 +264,7 @@
 @COND_MPI_FALSE at COND_MPI_OBJECTS = $O/serial.o
 
 # objects toggled between openmp and non-openmp version
- at COND_OPENMP_TRUE@COND_OPENMP_OBJECTS = $O/older_compute_forces_elastic_Dev_openmp.openmp.o
+ at COND_OPENMP_TRUE@COND_OPENMP_OBJECTS = $O/older_not_maintained_compute_forces_viscoelastic_Dev_openmp.openmp.o
 @COND_OPENMP_FALSE at COND_OPENMP_OBJECTS =
 
 LIBSPECFEM = $L/libspecfem.a

Deleted: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_elastic.f90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_elastic.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -1,583 +0,0 @@
-!=====================================================================
-!
-!               S p e c f e m 3 D  V e r s i o n  2 . 1
-!               ---------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and CNRS / INRIA / University of Pau
-! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
-!                             July 2012
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-! for elastic solver
-
-  subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, &
-                        ibool,ispec_is_inner,phase_is_inner, &
-                        NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,&
-                        hdur,hdur_gaussian,tshift_src,dt,t0,sourcearrays, &
-                        ispec_is_elastic,SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, &
-                        nrec,islice_selected_rec,ispec_selected_rec, &
-                        nadj_rec_local,adj_sourcearrays,b_accel, &
-                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY,GPU_MODE,Mesh_pointer  )
-
-  use specfem_par,only: PRINT_SOURCE_TIME_FUNCTION,stf_used_total, &
-                        xigll,yigll,zigll,xi_receiver,eta_receiver,gamma_receiver,&
-                        station_name,network_name,adj_source_file, &
-                        num_free_surface_faces,free_surface_ispec, &
-                        free_surface_ijk,free_surface_jacobian2Dw, &
-                        noise_sourcearray,irec_master_noise, &
-                        normal_x_noise,normal_y_noise,normal_z_noise, &
-                        mask_noise,noise_surface_movie, &
-                        nrec_local,number_receiver_global, &
-                        nsources_local,USE_FORCE_POINT_SOURCE, &
-                        USE_RICKER_TIME_FUNCTION
-
-  implicit none
-
-  include "constants.h"
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: accel
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-
-! communication overlap
-  logical, dimension(NSPEC_AB) :: ispec_is_inner
-  logical :: phase_is_inner
-
-! source
-  integer :: NSOURCES,myrank,it
-  integer, dimension(NSOURCES) :: islice_selected_source,ispec_selected_source
-  double precision, dimension(NSOURCES) :: hdur,hdur_gaussian,hdur_tiny,tshift_src
-  double precision :: dt,t0
-  real(kind=CUSTOM_REAL), dimension(NSOURCES,NDIM,NGLLX,NGLLY,NGLLZ) :: sourcearrays
-
-  double precision, external :: comp_source_time_function,comp_source_time_function_gauss,comp_source_time_function_rickr
-
-  logical, dimension(NSPEC_AB) :: ispec_is_elastic
-
-!adjoint simulations
-  integer:: SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT
-  logical:: GPU_MODE
-  integer(kind=8) :: Mesh_pointer
-  integer:: nrec
-  integer,dimension(nrec) :: islice_selected_rec,ispec_selected_rec
-  integer:: nadj_rec_local
-  real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel
-  logical :: ibool_read_adj_arrays
-  integer :: it_sub_adj,itime,NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY
-  real(kind=CUSTOM_REAL),dimension(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ):: &
-    adj_sourcearrays
-
-! local parameters
-  double precision :: stf
-  real(kind=CUSTOM_REAL),dimension(:,:,:,:,:),allocatable:: adj_sourcearray
-  real(kind=CUSTOM_REAL) stf_used,stf_used_total_all,time_source
-  ! for GPU_MODE
-  double precision, dimension(NSOURCES) :: stf_pre_compute
-  integer :: isource,iglob,i,j,k,ispec
-  integer :: irec_local,irec, ier
-
-! adjoint sources in SU format
-  integer :: it_start,it_end
-  real(kind=CUSTOM_REAL) :: adj_temp(NSTEP)
-  real(kind=CUSTOM_REAL) :: adj_src(NTSTEP_BETWEEN_READ_ADJSRC,NDIM)
-  character(len=256) :: procname
-  integer,parameter :: nheader=240      ! 240 bytes
-  !integer(kind=2) :: i2head(nheader/2)  ! 2-byte-integer
-  !integer(kind=4) :: i4head(nheader/4)  ! 4-byte-integer
-  real(kind=4)    :: r4head(nheader/4)  ! 4-byte-real
-  !equivalence (i2head,i4head,r4head)    ! share the same 240-byte-memory
-  double precision :: hxir(NGLLX),hpxir(NGLLX),hetar(NGLLY),hpetar(NGLLY),hgammar(NGLLZ),hpgammar(NGLLZ)
-
-! plotting source time function
-  if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then
-     ! initializes total
-     stf_used_total = 0.0_CUSTOM_REAL
-  endif
-
-  ! forward simulations
-  if (SIMULATION_TYPE == 1 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then
-
-    if(GPU_MODE) then
-       if( NSOURCES > 0 ) then
-          do isource = 1,NSOURCES
-             ! precomputes source time function factor
-             if(USE_FORCE_POINT_SOURCE) then
-                if( USE_RICKER_TIME_FUNCTION ) then
-                   stf_pre_compute(isource) = &
-                        comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_src(isource),hdur(isource))
-                else
-                   stf_pre_compute(isource) = &
-                        comp_source_time_function_gauss(dble(it-1)*DT-t0-tshift_src(isource),hdur_tiny(isource))
-                endif
-             else
-                if( USE_RICKER_TIME_FUNCTION ) then
-                   stf_pre_compute(isource) = &
-                        comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_src(isource),hdur(isource))
-                else
-                   stf_pre_compute(isource) = &
-                        comp_source_time_function(dble(it-1)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
-                endif
-             endif
-          enddo
-          ! only implements SIMTYPE=1 and NOISE_TOM=0
-          ! write(*,*) "fortran dt = ", dt
-          ! change dt -> DT
-          call compute_add_sources_el_cuda(Mesh_pointer, phase_is_inner, &
-                                          NSOURCES, stf_pre_compute, myrank)
-
-       endif
-
-    else ! .NOT. GPU_MODE
-
-      do isource = 1,NSOURCES
-
-         !   add the source (only if this proc carries the source)
-         if(myrank == islice_selected_source(isource)) then
-
-            ispec = ispec_selected_source(isource)
-
-            if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
-
-               if( ispec_is_elastic(ispec) ) then
-
-                  if(USE_FORCE_POINT_SOURCE) then
-
-                    !f0 = hdur(isource) !! using hdur as a FREQUENCY
-                    !if (it == 1 .and. myrank == 0) then
-                    !  write(IMAIN,*) 'using a source of dominant frequency ',f0
-                    !  write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
-                    !  write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
-                    !endif
-
-                    if( USE_RICKER_TIME_FUNCTION) then
-                       stf = comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_src(isource),hdur(isource))
-                    else
-                       stf = comp_source_time_function_gauss(dble(it-1)*DT-t0-tshift_src(isource),hdur_tiny(isource))
-                    endif
-
-                    ! add the inclined force source array
-                    ! distinguish between single and double precision for reals
-                    if(CUSTOM_REAL == SIZE_REAL) then
-                       stf_used = sngl(stf)
-                    else
-                       stf_used = stf
-                    endif
-
-                    do k=1,NGLLZ
-                       do j=1,NGLLY
-                          do i=1,NGLLX
-                             iglob = ibool(i,j,k,ispec)
-                             accel(:,iglob) = accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
-                          enddo
-                       enddo
-                    enddo
-
-                  else
-
-                    if( USE_RICKER_TIME_FUNCTION) then
-                      stf = comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_src(isource),hdur(isource))
-                    else
-                      stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
-                    endif
-
-                    !     distinguish between single and double precision for reals
-                    if(CUSTOM_REAL == SIZE_REAL) then
-                      stf_used = sngl(stf)
-                    else
-                      stf_used = stf
-                    endif
-
-                    !     add source array
-                    do k=1,NGLLZ
-                      do j=1,NGLLY
-                        do i=1,NGLLX
-                          iglob = ibool(i,j,k,ispec)
-                          accel(:,iglob) = accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
-                        enddo
-                      enddo
-                    enddo
-
-                  endif ! USE_FORCE_POINT_SOURCE
-
-                  stf_used_total = stf_used_total + stf_used
-
-               endif ! ispec_is_elastic
-            endif ! ispec_is_inner
-         endif ! myrank
-
-      enddo ! NSOURCES
-    endif ! GPU_MODE
-  endif ! forward
-
-! NOTE: adjoint sources and backward wavefield timing:
-!             idea is to start with the backward field b_displ,.. at time (T)
-!             and convolve with the adjoint field at time (T-t)
-!
-! backward/reconstructed wavefields:
-!       time for b_displ( it ) would correspond to (NSTEP - it - 1 )*DT - t0
-!       if we read in saved wavefields b_displ() before Newmark time scheme
-!       (see sources for simulation_type 1 and seismograms)
-!       since at the beginning of the time loop, the numerical Newmark time scheme updates
-!       the wavefields, that is b_displ( it=1) would correspond to time (NSTEP -1 - 1)*DT - t0
-!
-!       b_displ is now read in after Newmark time scheme:
-!       we read the backward/reconstructed wavefield at the end of the first time loop,
-!       such that b_displ(it=1) corresponds to -t0 + (NSTEP-1)*DT.
-!       assuming that until that end the backward/reconstructed wavefield and adjoint fields
-!       have a zero contribution to adjoint kernels.
-!       thus the correct indexing is NSTEP - it + 1, instead of NSTEP - it
-!
-! adjoint wavefields:
-!       since the adjoint source traces were derived from the seismograms,
-!       it follows that for the adjoint wavefield, the time equivalent to ( T - t ) uses the time-reversed
-!       adjoint source traces which start at -t0 and end at time (NSTEP-1)*DT - t0
-!       for step it=1: (NSTEP -it + 1)*DT - t0 for backward wavefields corresponds to time T
-
-! adjoint simulations
-  if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
-
-    ! adds adjoint source in this partitions
-    if( nadj_rec_local > 0 ) then
-
-      ! read in adjoint sources block by block (for memory consideration)
-      ! e.g., in exploration experiments, both the number of receivers (nrec) and
-      ! the number of time steps (NSTEP) are huge,
-      ! which may cause problems since we have a large array:
-      !   adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ)
-
-      ! figure out if we need to read in a chunk of the adjoint source at this timestep
-      it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )   !chunk_number
-      ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0))
-
-      ! needs to read in a new chunk/block of the adjoint source
-      ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner'
-      ! we first do calculations for the boudaries, and then start communication
-      ! with other partitions while calculate for the inner part
-      ! this must be done carefully, otherwise the adjoint sources may be added twice
-      if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then
-
-        ! allocates temporary source array
-        allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier)
-        if( ier /= 0 ) stop 'error allocating array adj_sourcearray'
-
-        if (.not. SU_FORMAT) then
-          !!! read ascii adjoint sources
-          irec_local = 0
-          do irec = 1, nrec
-            ! compute source arrays
-            if (myrank == islice_selected_rec(irec)) then
-              irec_local = irec_local + 1
-              ! reads in **sta**.**net**.**LH**.adj files
-              adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
-              call compute_arrays_adjoint_source(myrank,adj_source_file, &
-                    xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                    adj_sourcearray, xigll,yigll,zigll, &
-                    it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
-              do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
-                adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
-              enddo
-            endif
-          enddo
-        else
-          !!! read SU adjoint sources
-          ! range of the block we need to read
-          it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
-          it_end   = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
-          write(procname,"(i4)") myrank
-          ! read adjoint sources
-          open(unit=IIN_SU1, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj', &
-                            status='old',access='direct',recl=240+4*NSTEP,iostat = ier)
-          if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
-                                    //'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj does not exit')
-          open(unit=IIN_SU2, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dy_SU.adj', &
-                            status='old',access='direct',recl=240+4*NSTEP,iostat = ier)
-          if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
-                                    //'../SEM/'//trim(adjustl(procname))//'_dy_SU.adj does not exit')
-          open(unit=IIN_SU3, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dz_SU.adj', &
-                            status='old',access='direct',recl=240+4*NSTEP,iostat = ier)
-          if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
-                                    //'../SEM/'//trim(adjustl(procname))//'_dz_SU.adj does not exit')
-
-          do irec_local = 1,nrec_local
-            irec = number_receiver_global(irec_local)
-            read(IIN_SU1,rec=irec_local) r4head, adj_temp
-            adj_src(:,1)=adj_temp(it_start:it_end)
-            read(IIN_SU2,rec=irec_local) r4head, adj_temp
-            adj_src(:,2)=adj_temp(it_start:it_end)
-            read(IIN_SU3,rec=irec_local) r4head, adj_temp
-            adj_src(:,3)=adj_temp(it_start:it_end)
-            ! lagrange interpolators for receiver location
-            call lagrange_any(xi_receiver(irec),NGLLX,xigll,hxir,hpxir)
-            call lagrange_any(eta_receiver(irec),NGLLY,yigll,hetar,hpetar)
-            call lagrange_any(gamma_receiver(irec),NGLLZ,zigll,hgammar,hpgammar)
-            ! interpolates adjoint source onto GLL points within this element
-            do k = 1, NGLLZ
-              do j = 1, NGLLY
-                do i = 1, NGLLX
-                  adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:)
-                enddo
-              enddo
-            enddo
-            do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
-              adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
-            enddo
-          enddo
-          close(IIN_SU1)
-          close(IIN_SU2)
-          close(IIN_SU3)
-        endif !if(.not. SU_FORMAT)
-
-        deallocate(adj_sourcearray)
-      endif ! if(ibool_read_adj_arrays)
-
-
-      if( it < NSTEP ) then
-
-        if(.NOT. GPU_MODE) then
-
-          ! receivers act as sources
-          irec_local = 0
-          do irec = 1,nrec
-
-            ! add the source (only if this proc carries the source)
-            if (myrank == islice_selected_rec(irec)) then
-              irec_local = irec_local + 1
-
-              ispec = ispec_selected_rec(irec)
-              if( ispec_is_elastic(ispec) ) then
-
-                ! checks if element is in phase_is_inner run
-                if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then
-
-                  ! adds source array
-                  do k = 1,NGLLZ
-                    do j = 1,NGLLY
-                      do i = 1,NGLLX
-                        iglob = ibool(i,j,k,ispec_selected_rec(irec))
-
-                        accel(:,iglob) = accel(:,iglob)  &
-                                  + adj_sourcearrays(irec_local, &
-                                  NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
-                                  :,i,j,k)
-                      enddo
-                    enddo
-                  enddo
-                endif ! phase_is_inner
-              endif ! ispec_is_elastic
-            endif
-          enddo ! nrec
-        else ! GPU_MODE == .true.
-          call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
-                                            ispec_is_inner,ispec_is_elastic, &
-                                            ispec_selected_rec,myrank,nrec, &
-                                            NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
-                                            islice_selected_rec,nadj_rec_local, &
-                                            NTSTEP_BETWEEN_READ_ADJSRC)
-        endif ! GPU_MODE
-      endif ! it
-    endif ! nadj_rec_local
-  endif !adjoint
-
-! note:  b_displ() is read in after Newmark time scheme, thus
-!           b_displ(it=1) corresponds to -t0 + (NSTEP-1)*DT.
-!           thus indexing is NSTEP - it , instead of NSTEP - it - 1
-
-! adjoint simulations
-  if (SIMULATION_TYPE == 3 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then
-
-     if(GPU_MODE) then
-        if( NSOURCES > 0 ) then
-           do isource = 1,NSOURCES
-              ! precomputes source time function factors
-              if(USE_FORCE_POINT_SOURCE) then
-                 if( USE_RICKER_TIME_FUNCTION ) then
-                    stf_pre_compute(isource) = &
-                         comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
-                 else
-                    stf_pre_compute(isource) = &
-                         comp_source_time_function_gauss(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_tiny(isource))
-                 endif
-              else
-                 if( USE_RICKER_TIME_FUNCTION ) then
-                    stf_pre_compute(isource) = &
-                         comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
-                 else
-                    stf_pre_compute(isource) = &
-                         comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
-                 endif
-              endif
-           enddo
-           ! only implements SIMTYPE=3
-           call compute_add_sources_el_s3_cuda(Mesh_pointer,stf_pre_compute, &
-                NSOURCES,phase_is_inner,myrank)
-
-        endif
-
-    else ! .NOT. GPU_MODE
-
-      ! backward source reconstruction
-      do isource = 1,NSOURCES
-
-         ! add the source (only if this proc carries the source)
-         if(myrank == islice_selected_source(isource)) then
-
-            ispec = ispec_selected_source(isource)
-
-            if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
-
-               if( ispec_is_elastic(ispec) ) then
-
-                  if(USE_FORCE_POINT_SOURCE) then
-
-                     !f0 = hdur(isource) !! using hdur as a FREQUENCY 
-                     !if (it == 1 .and. myrank == 0) then
-                     !   write(IMAIN,*) 'using a source of dominant frequency ',f0
-                     !   write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
-                     !   write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
-                     !endif
-
-                     if( USE_RICKER_TIME_FUNCTION ) then
-                        stf = comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
-                     else
-                        stf = comp_source_time_function_gauss(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_tiny(isource))
-                     endif
-
-                    ! add the inclined force source array
-                    ! distinguish between single and double precision for reals
-                     if(CUSTOM_REAL == SIZE_REAL) then
-                        stf_used = sngl(stf)
-                     else
-                        stf_used = stf
-                     endif
-
-                     do k=1,NGLLZ
-                        do j=1,NGLLY
-                           do i=1,NGLLX
-                              iglob = ibool(i,j,k,ispec)
-                              b_accel(:,iglob) = b_accel(:,iglob) + sourcearrays(isource,:,i,j,k) * stf_used
-                           enddo
-                        enddo
-                     enddo
-
-                  else
-
-                     ! see note above: time step corresponds now to NSTEP-it
-                     ! (also compare to it-1 for forward simulation)
-                     if( USE_RICKER_TIME_FUNCTION ) then
-                       stf = comp_source_time_function_rickr( &
-                                      dble(it-1)*DT-t0-tshift_src(isource),hdur(isource))
-                     else
-                       stf = comp_source_time_function( &
-                                      dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
-                     endif
-
-                     ! distinguish between single and double precision for reals
-                     if(CUSTOM_REAL == SIZE_REAL) then
-                        stf_used = sngl(stf)
-                     else
-                        stf_used = stf
-                     endif
-
-                     !  add source array
-                     do k=1,NGLLZ
-                        do j=1,NGLLY
-                           do i=1,NGLLX
-                              iglob = ibool(i,j,k,ispec_selected_source(isource))
-                              b_accel(:,iglob) = b_accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
-                           enddo
-                        enddo
-                     enddo
-                  endif ! USE_FORCE_POINT_SOURCE
-
-                  stf_used_total = stf_used_total + stf_used
-
-               endif ! elastic
-            endif ! phase_inner
-         endif ! myrank
-
-      enddo ! NSOURCES
-    endif ! GPU_MODE
-  endif ! adjoint
-
-  ! master prints out source time function to file
-  if(PRINT_SOURCE_TIME_FUNCTION .and. phase_is_inner) then
-    time_source = (it-1)*DT - t0
-    call sum_all_cr(stf_used_total,stf_used_total_all)
-    if( myrank == 0 ) write(IOSTF,*) time_source,stf_used_total_all
-  endif
-
-  ! for noise simulations
-  ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
-  ! here, we only add those noise sources once, when we are calculating for boudanry points (phase_is_inner==.false.),
-  ! because boundary points are claculated first!
-  if( .not. phase_is_inner ) then
-    if ( NOISE_TOMOGRAPHY == 1 ) then
-       ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
-       ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
-       ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
-       ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
-       if(.NOT. GPU_MODE) then
-          call add_source_master_rec_noise(myrank,nrec, &
-               NSTEP,accel,noise_sourcearray, &
-               ibool,islice_selected_rec,ispec_selected_rec, &
-               it,irec_master_noise, &
-               NSPEC_AB,NGLOB_AB)
-       else ! GPU_MODE == .true.
-          call add_source_master_rec_noise_cu(Mesh_pointer, myrank, it, irec_master_noise, islice_selected_rec)
-       endif
-    elseif ( NOISE_TOMOGRAPHY == 2 ) then
-       ! second step of noise tomography, i.e., read the surface movie saved at every timestep
-       ! use the movie to drive the ensemble forward wavefield
-       call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces, &
-                              accel, &
-                              normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
-                              ibool,noise_surface_movie, &
-                              NSTEP-it+1, &
-                              NSPEC_AB,NGLOB_AB, &
-                              num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
-                              free_surface_jacobian2Dw,&
-                              Mesh_pointer,GPU_MODE,NOISE_TOMOGRAPHY)
-        ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
-        ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
-        ! note the ensemble forward sources are generally distributed on the surface of the earth
-        ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
-        ! therefore, we must add it here, before applying the inverse of mass matrix
-    elseif ( NOISE_TOMOGRAPHY == 3 ) then
-        ! third step of noise tomography, i.e., read the surface movie saved at every timestep
-        ! use the movie to reconstruct the ensemble forward wavefield
-        ! the ensemble adjoint wavefield is done as usual
-        ! note instead of "NSTEP-it+1", now we us "it", since reconstruction is a reversal of reversal
-        call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces, &
-                              b_accel, &
-                              normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
-                              ibool,noise_surface_movie, &
-                              it, &
-                              NSPEC_AB,NGLOB_AB, &
-                              num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
-                              free_surface_jacobian2Dw,&
-                               Mesh_pointer,GPU_MODE,NOISE_TOMOGRAPHY)
-    endif
-  endif
-
-
-  end subroutine compute_add_sources_elastic

Copied: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90 (from rev 21238, seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_elastic.f90)
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -0,0 +1,583 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 1
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                             July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+! for elastic solver
+
+  subroutine compute_add_sources_viscoelastic( NSPEC_AB,NGLOB_AB,accel, &
+                        ibool,ispec_is_inner,phase_is_inner, &
+                        NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,&
+                        hdur,hdur_gaussian,tshift_src,dt,t0,sourcearrays, &
+                        ispec_is_elastic,SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, &
+                        nrec,islice_selected_rec,ispec_selected_rec, &
+                        nadj_rec_local,adj_sourcearrays,b_accel, &
+                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY,GPU_MODE,Mesh_pointer  )
+
+  use specfem_par,only: PRINT_SOURCE_TIME_FUNCTION,stf_used_total, &
+                        xigll,yigll,zigll,xi_receiver,eta_receiver,gamma_receiver,&
+                        station_name,network_name,adj_source_file, &
+                        num_free_surface_faces,free_surface_ispec, &
+                        free_surface_ijk,free_surface_jacobian2Dw, &
+                        noise_sourcearray,irec_master_noise, &
+                        normal_x_noise,normal_y_noise,normal_z_noise, &
+                        mask_noise,noise_surface_movie, &
+                        nrec_local,number_receiver_global, &
+                        nsources_local,USE_FORCE_POINT_SOURCE, &
+                        USE_RICKER_TIME_FUNCTION
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+
+! communication overlap
+  logical, dimension(NSPEC_AB) :: ispec_is_inner
+  logical :: phase_is_inner
+
+! source
+  integer :: NSOURCES,myrank,it
+  integer, dimension(NSOURCES) :: islice_selected_source,ispec_selected_source
+  double precision, dimension(NSOURCES) :: hdur,hdur_gaussian,hdur_tiny,tshift_src
+  double precision :: dt,t0
+  real(kind=CUSTOM_REAL), dimension(NSOURCES,NDIM,NGLLX,NGLLY,NGLLZ) :: sourcearrays
+
+  double precision, external :: comp_source_time_function,comp_source_time_function_gauss,comp_source_time_function_rickr
+
+  logical, dimension(NSPEC_AB) :: ispec_is_elastic
+
+!adjoint simulations
+  integer:: SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT
+  logical:: GPU_MODE
+  integer(kind=8) :: Mesh_pointer
+  integer:: nrec
+  integer,dimension(nrec) :: islice_selected_rec,ispec_selected_rec
+  integer:: nadj_rec_local
+  real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel
+  logical :: ibool_read_adj_arrays
+  integer :: it_sub_adj,itime,NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY
+  real(kind=CUSTOM_REAL),dimension(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ):: &
+    adj_sourcearrays
+
+! local parameters
+  double precision :: stf
+  real(kind=CUSTOM_REAL),dimension(:,:,:,:,:),allocatable:: adj_sourcearray
+  real(kind=CUSTOM_REAL) stf_used,stf_used_total_all,time_source
+  ! for GPU_MODE
+  double precision, dimension(NSOURCES) :: stf_pre_compute
+  integer :: isource,iglob,i,j,k,ispec
+  integer :: irec_local,irec, ier
+
+! adjoint sources in SU format
+  integer :: it_start,it_end
+  real(kind=CUSTOM_REAL) :: adj_temp(NSTEP)
+  real(kind=CUSTOM_REAL) :: adj_src(NTSTEP_BETWEEN_READ_ADJSRC,NDIM)
+  character(len=256) :: procname
+  integer,parameter :: nheader=240      ! 240 bytes
+  !integer(kind=2) :: i2head(nheader/2)  ! 2-byte-integer
+  !integer(kind=4) :: i4head(nheader/4)  ! 4-byte-integer
+  real(kind=4)    :: r4head(nheader/4)  ! 4-byte-real
+  !equivalence (i2head,i4head,r4head)    ! share the same 240-byte-memory
+  double precision :: hxir(NGLLX),hpxir(NGLLX),hetar(NGLLY),hpetar(NGLLY),hgammar(NGLLZ),hpgammar(NGLLZ)
+
+! plotting source time function
+  if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then
+     ! initializes total
+     stf_used_total = 0.0_CUSTOM_REAL
+  endif
+
+  ! forward simulations
+  if (SIMULATION_TYPE == 1 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then
+
+    if(GPU_MODE) then
+       if( NSOURCES > 0 ) then
+          do isource = 1,NSOURCES
+             ! precomputes source time function factor
+             if(USE_FORCE_POINT_SOURCE) then
+                if( USE_RICKER_TIME_FUNCTION ) then
+                   stf_pre_compute(isource) = &
+                        comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_src(isource),hdur(isource))
+                else
+                   stf_pre_compute(isource) = &
+                        comp_source_time_function_gauss(dble(it-1)*DT-t0-tshift_src(isource),hdur_tiny(isource))
+                endif
+             else
+                if( USE_RICKER_TIME_FUNCTION ) then
+                   stf_pre_compute(isource) = &
+                        comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_src(isource),hdur(isource))
+                else
+                   stf_pre_compute(isource) = &
+                        comp_source_time_function(dble(it-1)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
+                endif
+             endif
+          enddo
+          ! only implements SIMTYPE=1 and NOISE_TOM=0
+          ! write(*,*) "fortran dt = ", dt
+          ! change dt -> DT
+          call compute_add_sources_el_cuda(Mesh_pointer, phase_is_inner, &
+                                          NSOURCES, stf_pre_compute, myrank)
+
+       endif
+
+    else ! .NOT. GPU_MODE
+
+      do isource = 1,NSOURCES
+
+         !   add the source (only if this proc carries the source)
+         if(myrank == islice_selected_source(isource)) then
+
+            ispec = ispec_selected_source(isource)
+
+            if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+
+               if( ispec_is_elastic(ispec) ) then
+
+                  if(USE_FORCE_POINT_SOURCE) then
+
+                    !f0 = hdur(isource) !! using hdur as a FREQUENCY
+                    !if (it == 1 .and. myrank == 0) then
+                    !  write(IMAIN,*) 'using a source of dominant frequency ',f0
+                    !  write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
+                    !  write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
+                    !endif
+
+                    if( USE_RICKER_TIME_FUNCTION) then
+                       stf = comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_src(isource),hdur(isource))
+                    else
+                       stf = comp_source_time_function_gauss(dble(it-1)*DT-t0-tshift_src(isource),hdur_tiny(isource))
+                    endif
+
+                    ! add the inclined force source array
+                    ! distinguish between single and double precision for reals
+                    if(CUSTOM_REAL == SIZE_REAL) then
+                       stf_used = sngl(stf)
+                    else
+                       stf_used = stf
+                    endif
+
+                    do k=1,NGLLZ
+                       do j=1,NGLLY
+                          do i=1,NGLLX
+                             iglob = ibool(i,j,k,ispec)
+                             accel(:,iglob) = accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
+                          enddo
+                       enddo
+                    enddo
+
+                  else
+
+                    if( USE_RICKER_TIME_FUNCTION) then
+                      stf = comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_src(isource),hdur(isource))
+                    else
+                      stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
+                    endif
+
+                    !     distinguish between single and double precision for reals
+                    if(CUSTOM_REAL == SIZE_REAL) then
+                      stf_used = sngl(stf)
+                    else
+                      stf_used = stf
+                    endif
+
+                    !     add source array
+                    do k=1,NGLLZ
+                      do j=1,NGLLY
+                        do i=1,NGLLX
+                          iglob = ibool(i,j,k,ispec)
+                          accel(:,iglob) = accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
+                        enddo
+                      enddo
+                    enddo
+
+                  endif ! USE_FORCE_POINT_SOURCE
+
+                  stf_used_total = stf_used_total + stf_used
+
+               endif ! ispec_is_elastic
+            endif ! ispec_is_inner
+         endif ! myrank
+
+      enddo ! NSOURCES
+    endif ! GPU_MODE
+  endif ! forward
+
+! NOTE: adjoint sources and backward wavefield timing:
+!             idea is to start with the backward field b_displ,.. at time (T)
+!             and convolve with the adjoint field at time (T-t)
+!
+! backward/reconstructed wavefields:
+!       time for b_displ( it ) would correspond to (NSTEP - it - 1 )*DT - t0
+!       if we read in saved wavefields b_displ() before Newmark time scheme
+!       (see sources for simulation_type 1 and seismograms)
+!       since at the beginning of the time loop, the numerical Newmark time scheme updates
+!       the wavefields, that is b_displ( it=1) would correspond to time (NSTEP -1 - 1)*DT - t0
+!
+!       b_displ is now read in after Newmark time scheme:
+!       we read the backward/reconstructed wavefield at the end of the first time loop,
+!       such that b_displ(it=1) corresponds to -t0 + (NSTEP-1)*DT.
+!       assuming that until that end the backward/reconstructed wavefield and adjoint fields
+!       have a zero contribution to adjoint kernels.
+!       thus the correct indexing is NSTEP - it + 1, instead of NSTEP - it
+!
+! adjoint wavefields:
+!       since the adjoint source traces were derived from the seismograms,
+!       it follows that for the adjoint wavefield, the time equivalent to ( T - t ) uses the time-reversed
+!       adjoint source traces which start at -t0 and end at time (NSTEP-1)*DT - t0
+!       for step it=1: (NSTEP -it + 1)*DT - t0 for backward wavefields corresponds to time T
+
+! adjoint simulations
+  if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
+
+    ! adds adjoint source in this partitions
+    if( nadj_rec_local > 0 ) then
+
+      ! read in adjoint sources block by block (for memory consideration)
+      ! e.g., in exploration experiments, both the number of receivers (nrec) and
+      ! the number of time steps (NSTEP) are huge,
+      ! which may cause problems since we have a large array:
+      !   adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ)
+
+      ! figure out if we need to read in a chunk of the adjoint source at this timestep
+      it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )   !chunk_number
+      ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0))
+
+      ! needs to read in a new chunk/block of the adjoint source
+      ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner'
+      ! we first do calculations for the boudaries, and then start communication
+      ! with other partitions while calculate for the inner part
+      ! this must be done carefully, otherwise the adjoint sources may be added twice
+      if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then
+
+        ! allocates temporary source array
+        allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array adj_sourcearray'
+
+        if (.not. SU_FORMAT) then
+          !!! read ascii adjoint sources
+          irec_local = 0
+          do irec = 1, nrec
+            ! compute source arrays
+            if (myrank == islice_selected_rec(irec)) then
+              irec_local = irec_local + 1
+              ! reads in **sta**.**net**.**LH**.adj files
+              adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
+              call compute_arrays_adjoint_source(myrank,adj_source_file, &
+                    xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                    adj_sourcearray, xigll,yigll,zigll, &
+                    it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+              do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
+                adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
+              enddo
+            endif
+          enddo
+        else
+          !!! read SU adjoint sources
+          ! range of the block we need to read
+          it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
+          it_end   = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
+          write(procname,"(i4)") myrank
+          ! read adjoint sources
+          open(unit=IIN_SU1, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj', &
+                            status='old',access='direct',recl=240+4*NSTEP,iostat = ier)
+          if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
+                                    //'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj does not exit')
+          open(unit=IIN_SU2, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dy_SU.adj', &
+                            status='old',access='direct',recl=240+4*NSTEP,iostat = ier)
+          if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
+                                    //'../SEM/'//trim(adjustl(procname))//'_dy_SU.adj does not exit')
+          open(unit=IIN_SU3, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dz_SU.adj', &
+                            status='old',access='direct',recl=240+4*NSTEP,iostat = ier)
+          if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
+                                    //'../SEM/'//trim(adjustl(procname))//'_dz_SU.adj does not exit')
+
+          do irec_local = 1,nrec_local
+            irec = number_receiver_global(irec_local)
+            read(IIN_SU1,rec=irec_local) r4head, adj_temp
+            adj_src(:,1)=adj_temp(it_start:it_end)
+            read(IIN_SU2,rec=irec_local) r4head, adj_temp
+            adj_src(:,2)=adj_temp(it_start:it_end)
+            read(IIN_SU3,rec=irec_local) r4head, adj_temp
+            adj_src(:,3)=adj_temp(it_start:it_end)
+            ! lagrange interpolators for receiver location
+            call lagrange_any(xi_receiver(irec),NGLLX,xigll,hxir,hpxir)
+            call lagrange_any(eta_receiver(irec),NGLLY,yigll,hetar,hpetar)
+            call lagrange_any(gamma_receiver(irec),NGLLZ,zigll,hgammar,hpgammar)
+            ! interpolates adjoint source onto GLL points within this element
+            do k = 1, NGLLZ
+              do j = 1, NGLLY
+                do i = 1, NGLLX
+                  adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:)
+                enddo
+              enddo
+            enddo
+            do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
+              adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
+            enddo
+          enddo
+          close(IIN_SU1)
+          close(IIN_SU2)
+          close(IIN_SU3)
+        endif !if(.not. SU_FORMAT)
+
+        deallocate(adj_sourcearray)
+      endif ! if(ibool_read_adj_arrays)
+
+
+      if( it < NSTEP ) then
+
+        if(.NOT. GPU_MODE) then
+
+          ! receivers act as sources
+          irec_local = 0
+          do irec = 1,nrec
+
+            ! add the source (only if this proc carries the source)
+            if (myrank == islice_selected_rec(irec)) then
+              irec_local = irec_local + 1
+
+              ispec = ispec_selected_rec(irec)
+              if( ispec_is_elastic(ispec) ) then
+
+                ! checks if element is in phase_is_inner run
+                if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then
+
+                  ! adds source array
+                  do k = 1,NGLLZ
+                    do j = 1,NGLLY
+                      do i = 1,NGLLX
+                        iglob = ibool(i,j,k,ispec_selected_rec(irec))
+
+                        accel(:,iglob) = accel(:,iglob)  &
+                                  + adj_sourcearrays(irec_local, &
+                                  NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
+                                  :,i,j,k)
+                      enddo
+                    enddo
+                  enddo
+                endif ! phase_is_inner
+              endif ! ispec_is_elastic
+            endif
+          enddo ! nrec
+        else ! GPU_MODE == .true.
+          call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
+                                            ispec_is_inner,ispec_is_elastic, &
+                                            ispec_selected_rec,myrank,nrec, &
+                                            NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
+                                            islice_selected_rec,nadj_rec_local, &
+                                            NTSTEP_BETWEEN_READ_ADJSRC)
+        endif ! GPU_MODE
+      endif ! it
+    endif ! nadj_rec_local
+  endif !adjoint
+
+! note:  b_displ() is read in after Newmark time scheme, thus
+!           b_displ(it=1) corresponds to -t0 + (NSTEP-1)*DT.
+!           thus indexing is NSTEP - it , instead of NSTEP - it - 1
+
+! adjoint simulations
+  if (SIMULATION_TYPE == 3 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then
+
+     if(GPU_MODE) then
+        if( NSOURCES > 0 ) then
+           do isource = 1,NSOURCES
+              ! precomputes source time function factors
+              if(USE_FORCE_POINT_SOURCE) then
+                 if( USE_RICKER_TIME_FUNCTION ) then
+                    stf_pre_compute(isource) = &
+                         comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
+                 else
+                    stf_pre_compute(isource) = &
+                         comp_source_time_function_gauss(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_tiny(isource))
+                 endif
+              else
+                 if( USE_RICKER_TIME_FUNCTION ) then
+                    stf_pre_compute(isource) = &
+                         comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
+                 else
+                    stf_pre_compute(isource) = &
+                         comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
+                 endif
+              endif
+           enddo
+           ! only implements SIMTYPE=3
+           call compute_add_sources_el_s3_cuda(Mesh_pointer,stf_pre_compute, &
+                NSOURCES,phase_is_inner,myrank)
+
+        endif
+
+    else ! .NOT. GPU_MODE
+
+      ! backward source reconstruction
+      do isource = 1,NSOURCES
+
+         ! add the source (only if this proc carries the source)
+         if(myrank == islice_selected_source(isource)) then
+
+            ispec = ispec_selected_source(isource)
+
+            if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+
+               if( ispec_is_elastic(ispec) ) then
+
+                  if(USE_FORCE_POINT_SOURCE) then
+
+                     !f0 = hdur(isource) !! using hdur as a FREQUENCY 
+                     !if (it == 1 .and. myrank == 0) then
+                     !   write(IMAIN,*) 'using a source of dominant frequency ',f0
+                     !   write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
+                     !   write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
+                     !endif
+
+                     if( USE_RICKER_TIME_FUNCTION ) then
+                        stf = comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
+                     else
+                        stf = comp_source_time_function_gauss(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_tiny(isource))
+                     endif
+
+                    ! add the inclined force source array
+                    ! distinguish between single and double precision for reals
+                     if(CUSTOM_REAL == SIZE_REAL) then
+                        stf_used = sngl(stf)
+                     else
+                        stf_used = stf
+                     endif
+
+                     do k=1,NGLLZ
+                        do j=1,NGLLY
+                           do i=1,NGLLX
+                              iglob = ibool(i,j,k,ispec)
+                              b_accel(:,iglob) = b_accel(:,iglob) + sourcearrays(isource,:,i,j,k) * stf_used
+                           enddo
+                        enddo
+                     enddo
+
+                  else
+
+                     ! see note above: time step corresponds now to NSTEP-it
+                     ! (also compare to it-1 for forward simulation)
+                     if( USE_RICKER_TIME_FUNCTION ) then
+                       stf = comp_source_time_function_rickr( &
+                                      dble(it-1)*DT-t0-tshift_src(isource),hdur(isource))
+                     else
+                       stf = comp_source_time_function( &
+                                      dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
+                     endif
+
+                     ! distinguish between single and double precision for reals
+                     if(CUSTOM_REAL == SIZE_REAL) then
+                        stf_used = sngl(stf)
+                     else
+                        stf_used = stf
+                     endif
+
+                     !  add source array
+                     do k=1,NGLLZ
+                        do j=1,NGLLY
+                           do i=1,NGLLX
+                              iglob = ibool(i,j,k,ispec_selected_source(isource))
+                              b_accel(:,iglob) = b_accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
+                           enddo
+                        enddo
+                     enddo
+                  endif ! USE_FORCE_POINT_SOURCE
+
+                  stf_used_total = stf_used_total + stf_used
+
+               endif ! elastic
+            endif ! phase_inner
+         endif ! myrank
+
+      enddo ! NSOURCES
+    endif ! GPU_MODE
+  endif ! adjoint
+
+  ! master prints out source time function to file
+  if(PRINT_SOURCE_TIME_FUNCTION .and. phase_is_inner) then
+    time_source = (it-1)*DT - t0
+    call sum_all_cr(stf_used_total,stf_used_total_all)
+    if( myrank == 0 ) write(IOSTF,*) time_source,stf_used_total_all
+  endif
+
+  ! for noise simulations
+  ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
+  ! here, we only add those noise sources once, when we are calculating for boudanry points (phase_is_inner==.false.),
+  ! because boundary points are claculated first!
+  if( .not. phase_is_inner ) then
+    if ( NOISE_TOMOGRAPHY == 1 ) then
+       ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
+       ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
+       ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
+       ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
+       if(.NOT. GPU_MODE) then
+          call add_source_master_rec_noise(myrank,nrec, &
+               NSTEP,accel,noise_sourcearray, &
+               ibool,islice_selected_rec,ispec_selected_rec, &
+               it,irec_master_noise, &
+               NSPEC_AB,NGLOB_AB)
+       else ! GPU_MODE == .true.
+          call add_source_master_rec_noise_cu(Mesh_pointer, myrank, it, irec_master_noise, islice_selected_rec)
+       endif
+    elseif ( NOISE_TOMOGRAPHY == 2 ) then
+       ! second step of noise tomography, i.e., read the surface movie saved at every timestep
+       ! use the movie to drive the ensemble forward wavefield
+       call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces, &
+                              accel, &
+                              normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
+                              ibool,noise_surface_movie, &
+                              NSTEP-it+1, &
+                              NSPEC_AB,NGLOB_AB, &
+                              num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
+                              free_surface_jacobian2Dw,&
+                              Mesh_pointer,GPU_MODE,NOISE_TOMOGRAPHY)
+        ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
+        ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
+        ! note the ensemble forward sources are generally distributed on the surface of the earth
+        ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
+        ! therefore, we must add it here, before applying the inverse of mass matrix
+    elseif ( NOISE_TOMOGRAPHY == 3 ) then
+        ! third step of noise tomography, i.e., read the surface movie saved at every timestep
+        ! use the movie to reconstruct the ensemble forward wavefield
+        ! the ensemble adjoint wavefield is done as usual
+        ! note instead of "NSTEP-it+1", now we us "it", since reconstruction is a reversal of reversal
+        call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces, &
+                              b_accel, &
+                              normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
+                              ibool,noise_surface_movie, &
+                              it, &
+                              NSPEC_AB,NGLOB_AB, &
+                              num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
+                              free_surface_jacobian2Dw,&
+                               Mesh_pointer,GPU_MODE,NOISE_TOMOGRAPHY)
+    endif
+  endif
+
+
+  end subroutine compute_add_sources_viscoelastic

Deleted: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_elastic_ac.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_elastic_ac.f90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_elastic_ac.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -1,229 +0,0 @@
-!=====================================================================
-!
-!               S p e c f e m 3 D  V e r s i o n  2 . 1
-!               ---------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and CNRS / INRIA / University of Pau
-! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
-!                             July 2012
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-! for elastic solver
-
-  subroutine compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, &
-                        ibool,accel,potential_dot_dot_acoustic, &
-                        num_coupling_ac_el_faces, &
-                        coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                        coupling_ac_el_normal, &
-                        coupling_ac_el_jacobian2Dw, &
-                        ispec_is_inner,phase_is_inner)
-
-! returns the updated acceleration array: accel
-
-  implicit none
-  include 'constants.h'
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and pressure
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: accel
-  real(kind=CUSTOM_REAL), dimension(NGLOB_AB) :: potential_dot_dot_acoustic
-
-! global indexing
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-
-! acoustic-elastic coupling surface
-  integer :: num_coupling_ac_el_faces
-  real(kind=CUSTOM_REAL) :: coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces)
-  real(kind=CUSTOM_REAL) :: coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces)
-  integer :: coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces)
-  integer :: coupling_ac_el_ispec(num_coupling_ac_el_faces)
-
-! communication overlap
-  logical, dimension(NSPEC_AB) :: ispec_is_inner
-  logical :: phase_is_inner
-
-! local parameters
-  real(kind=CUSTOM_REAL) :: pressure
-  real(kind=CUSTOM_REAL) :: nx,ny,nz,jacobianw
-
-  integer :: iface,igll,ispec,iglob
-  integer :: i,j,k
-
-! loops on all coupling faces
-  do iface = 1,num_coupling_ac_el_faces
-
-    ! gets corresponding spectral element
-    ! (note: can be either acoustic or elastic element, no need to specify since
-    !           no material properties are needed for this coupling term)
-    ispec = coupling_ac_el_ispec(iface)
-
-    if( ispec_is_inner(ispec) .eqv. phase_is_inner ) then
-
-      ! loops over common GLL points
-      do igll = 1, NGLLSQUARE
-        i = coupling_ac_el_ijk(1,igll,iface)
-        j = coupling_ac_el_ijk(2,igll,iface)
-        k = coupling_ac_el_ijk(3,igll,iface)
-
-        ! gets global index of this common GLL point
-        ! (note: should be the same as for corresponding i',j',k',ispec_elastic or ispec_elastic )
-        iglob = ibool(i,j,k,ispec)
-
-        ! acoustic pressure on global point
-        pressure = - potential_dot_dot_acoustic(iglob)
-
-        ! gets associated normal on GLL point
-        ! (note convention: pointing outwards of acoustic element)
-        nx = coupling_ac_el_normal(1,igll,iface)
-        ny = coupling_ac_el_normal(2,igll,iface)
-        nz = coupling_ac_el_normal(3,igll,iface)
-
-        ! gets associated, weighted 2D jacobian
-        ! (note: should be the same for elastic and acoustic element)
-        jacobianw = coupling_ac_el_jacobian2Dw(igll,iface)
-
-        ! continuity of displacement and pressure on global point
-        !
-        ! note: Newmark time scheme together with definition of scalar potential:
-        !          pressure = - chi_dot_dot
-        !          requires that this coupling term uses the *UPDATED* pressure (chi_dot_dot), i.e.
-        !          pressure at time step [t + delta_t]
-        !          (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...)
-        !          it means you have to calculate and update the acoustic pressure first before
-        !          calculating this term...
-        accel(1,iglob) = accel(1,iglob) + jacobianw*nx*pressure
-        accel(2,iglob) = accel(2,iglob) + jacobianw*ny*pressure
-        accel(3,iglob) = accel(3,iglob) + jacobianw*nz*pressure
-
-      enddo ! igll
-
-    endif
-
-  enddo ! iface
-
-end subroutine compute_coupling_elastic_ac
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine compute_coupling_ocean(NSPEC_AB,NGLOB_AB, &
-                                    ibool,rmassx,rmassy,rmassz, &
-                                    rmass_ocean_load,accel, &
-                                    free_surface_normal,free_surface_ijk,free_surface_ispec, &
-                                    num_free_surface_faces,SIMULATION_TYPE, &
-                                    NGLOB_ADJOINT,b_accel)
-
-! updates acceleration with ocean load term:
-! approximates ocean-bottom continuity of pressure & displacement for longer period waves (> ~20s ),
-! assuming incompressible fluid column above bathymetry ocean bottom
-
-  implicit none
-
-  include 'constants.h'
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-  real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_AB),intent(inout) :: accel
-  real(kind=CUSTOM_REAL),dimension(NGLOB_AB),intent(in) :: rmassx,rmassy,rmassz
-  real(kind=CUSTOM_REAL),dimension(NGLOB_AB),intent(in) :: rmass_ocean_load
-
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB),intent(in) :: ibool
-
-  ! free surface
-  integer :: num_free_surface_faces
-  real(kind=CUSTOM_REAL) :: free_surface_normal(NDIM,NGLLSQUARE,num_free_surface_faces)
-  integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces)
-  integer :: free_surface_ispec(num_free_surface_faces)
-
-  ! adjoint simulations
-  integer :: SIMULATION_TYPE,NGLOB_ADJOINT
-  real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel
-
-! local parameters
-  real(kind=CUSTOM_REAL) :: nx,ny,nz
-  real(kind=CUSTOM_REAL) :: force_normal_comp
-  integer :: i,j,k,ispec,iglob
-  integer :: igll,iface
-  logical,dimension(NGLOB_AB) :: updated_dof_ocean_load
-  ! adjoint locals
-  real(kind=CUSTOM_REAL) :: b_force_normal_comp
-
-  !   initialize the updates
-  updated_dof_ocean_load(:) = .false.
-
-  ! for surface elements exactly at the top of the model (ocean bottom)
-  do iface = 1,num_free_surface_faces
-
-    ispec = free_surface_ispec(iface)
-    do igll = 1, NGLLSQUARE
-      i = free_surface_ijk(1,igll,iface)
-      j = free_surface_ijk(2,igll,iface)
-      k = free_surface_ijk(3,igll,iface)
-
-      ! get global point number
-      iglob = ibool(i,j,k,ispec)
-
-      ! only update once
-      if(.not. updated_dof_ocean_load(iglob)) then
-
-        ! get normal
-        nx = free_surface_normal(1,igll,iface)
-        ny = free_surface_normal(2,igll,iface)
-        nz = free_surface_normal(3,igll,iface)
-
-        ! make updated component of right-hand side
-        ! we divide by rmass() which is 1 / M
-        ! we use the total force which includes the Coriolis term above
-        force_normal_comp = accel(1,iglob)*nx / rmassx(iglob) &
-                            + accel(2,iglob)*ny / rmassy(iglob) &
-                            + accel(3,iglob)*nz / rmassz(iglob)
-
-        accel(1,iglob) = accel(1,iglob) &
-          + (rmass_ocean_load(iglob) - rmassx(iglob)) * force_normal_comp * nx
-        accel(2,iglob) = accel(2,iglob) &
-          + (rmass_ocean_load(iglob) - rmassy(iglob)) * force_normal_comp * ny
-        accel(3,iglob) = accel(3,iglob) &
-          + (rmass_ocean_load(iglob) - rmassz(iglob)) * force_normal_comp * nz
-
-        ! adjoint simulations
-        if (SIMULATION_TYPE == 3) then
-          b_force_normal_comp = b_accel(1,iglob)*nx / rmassx(iglob) &
-                                + b_accel(2,iglob)*ny / rmassy(iglob) &
-                                + b_accel(3,iglob)*nz / rmassz(iglob)
-
-          b_accel(1,iglob) = b_accel(1,iglob) &
-            + (rmass_ocean_load(iglob) - rmassx(iglob)) * b_force_normal_comp * nx
-          b_accel(2,iglob) = b_accel(2,iglob) &
-            + (rmass_ocean_load(iglob) - rmassy(iglob)) * b_force_normal_comp * ny
-          b_accel(3,iglob) = b_accel(3,iglob) &
-            + (rmass_ocean_load(iglob) - rmassz(iglob)) * b_force_normal_comp * nz
-        endif !adjoint
-
-        ! done with this point
-        updated_dof_ocean_load(iglob) = .true.
-
-      endif
-
-    enddo ! igll
-  enddo ! iface
-
-  end subroutine compute_coupling_ocean
-

Deleted: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_elastic_po.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_elastic_po.f90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_elastic_po.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -1,503 +0,0 @@
-!=====================================================================
-!
-!               S p e c f e m 3 D  V e r s i o n  2 . 1
-!               ---------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and CNRS / INRIA / University of Pau
-! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
-!                             July 2012
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-! for elastic solver
-
-  subroutine compute_coupling_elastic_po(NSPEC_AB,NGLOB_AB,ibool,&
-                        displs_poroelastic,displw_poroelastic,&
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        hprime_xx,hprime_yy,hprime_zz,&
-                        kappaarraystore,rhoarraystore,mustore, &
-                        phistore,tortstore,jacobian,&
-                        displ,accel,kappastore, &
-                        ANISOTROPY,NSPEC_ANISO, &
-                        c11store,c12store,c13store,c14store,c15store,c16store,&
-                        c22store,c23store,c24store,c25store,c26store,c33store,&
-                        c34store,c35store,c36store,c44store,c45store,c46store,&
-                        c55store,c56store,c66store, &
-                        SIMULATION_TYPE,NGLOB_ADJOINT,NSPEC_ADJOINT, &
-                        num_coupling_el_po_faces, &
-                        coupling_el_po_ispec,coupling_po_el_ispec, &
-                        coupling_el_po_ijk,coupling_po_el_ijk, &
-                        coupling_el_po_normal, &
-                        coupling_el_po_jacobian2Dw, &
-                        ispec_is_inner,phase_is_inner)
-
-! returns the updated accelerations array: accel
-
-  implicit none
-  include 'constants.h'
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacements, etc
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displs_poroelastic,&
-                                                      displw_poroelastic
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-! global indexing
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-
-  integer :: SIMULATION_TYPE
-  integer :: NGLOB_ADJOINT,NSPEC_ADJOINT
-
-! elastic-poroelastic coupling surface
-  integer :: num_coupling_el_po_faces
-  real(kind=CUSTOM_REAL) :: coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces)
-  real(kind=CUSTOM_REAL) :: coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces)
-  integer :: coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
-  integer :: coupling_po_el_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
-  integer :: coupling_el_po_ispec(num_coupling_el_po_faces)
-  integer :: coupling_po_el_ispec(num_coupling_el_po_faces)
-
-! properties
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        phistore,tortstore,jacobian
-  real(kind=CUSTOM_REAL), dimension(2,NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: rhoarraystore
-  real(kind=CUSTOM_REAL), dimension(3,NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: kappaarraystore
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy
-  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz
-
-! communication overlap
-  logical, dimension(NSPEC_AB) :: ispec_is_inner
-  logical :: phase_is_inner
-
-! local parameters
-  real(kind=CUSTOM_REAL) :: sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
-  real(kind=CUSTOM_REAL) :: rhol_s,rhol_f,phil,tortl,rhol_bar
-  real(kind=CUSTOM_REAL) :: nx,ny,nz,jacobianw
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-  real(kind=CUSTOM_REAL) :: kappal_s
-  real(kind=CUSTOM_REAL) :: kappal_f
-  real(kind=CUSTOM_REAL) :: mul_fr,kappal_fr
-  real(kind=CUSTOM_REAL) :: D_biot,H_biot,C_biot,M_biot
-  real(kind=CUSTOM_REAL) :: mul_G,lambdal_G,lambdalplus2mul_G
-
-! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer :: iface,igll,ispec_po,ispec_el,iglob,iglob_el,iglob_po
-  integer :: i,j,k,l
-
-  real(kind=CUSTOM_REAL) tempx1l,tempx2l,tempx3l
-  real(kind=CUSTOM_REAL) tempy1l,tempy2l,tempy3l
-  real(kind=CUSTOM_REAL) tempz1l,tempz2l,tempz3l
-  real(kind=CUSTOM_REAL) tempx1ls,tempx2ls,tempx3ls,tempx1lw,tempx2lw,tempx3lw
-  real(kind=CUSTOM_REAL) tempy1ls,tempy2ls,tempy3ls,tempy1lw,tempy2lw,tempy3lw
-  real(kind=CUSTOM_REAL) tempz1ls,tempz2ls,tempz3ls,tempz1lw,tempz2lw,tempz3lw
-
-  real(kind=CUSTOM_REAL) :: duxdxl,duydxl,duzdxl,duxdyl,duydyl,duzdyl,duxdzl,duydzl,duzdzl
-  real(kind=CUSTOM_REAL) :: dwxdxl,dwydxl,dwzdxl,dwxdyl,dwydyl,dwzdyl,dwxdzl,dwydzl,dwzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl_plus_duzdzl,dwxdxl_plus_dwydyl_plus_dwzdzl
-
-  real(kind=CUSTOM_REAL) hp1,hp2,hp3
-
-! Jacobian matrix and determinant
-  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-
-! loops on all coupling faces
-  do iface = 1,num_coupling_el_po_faces
-
-    ! gets corresponding poro/elastic spectral element
-    ispec_po = coupling_el_po_ispec(iface)
-    ispec_el = coupling_po_el_ispec(iface)
-
-    if( ispec_is_inner(ispec_el) .eqv. phase_is_inner ) then
-
-      ! loops over common GLL points
-      do igll = 1, NGLLSQUARE
-
-    !-----------------------
-    ! from the poroelastic side
-    !-----------------------
-        i = coupling_el_po_ijk(1,igll,iface)
-        j = coupling_el_po_ijk(2,igll,iface)
-        k = coupling_el_po_ijk(3,igll,iface)
-
-        iglob_po = ibool(i,j,k,ispec_po)
-
-! get poroelastic parameters of current local GLL
-    phil = phistore(i,j,k,ispec_po)
-    tortl = tortstore(i,j,k,ispec_po)
-!solid properties
-    kappal_s = kappaarraystore(1,i,j,k,ispec_po)
-    rhol_s = rhoarraystore(1,i,j,k,ispec_po)
-!fluid properties
-    kappal_f = kappaarraystore(2,i,j,k,ispec_po)
-    rhol_f = rhoarraystore(2,i,j,k,ispec_po)
-!frame properties
-    mul_fr = mustore(i,j,k,ispec_po)
-    kappal_fr = kappaarraystore(3,i,j,k,ispec_po)
-    rhol_bar =  (1._CUSTOM_REAL - phil)*rhol_s + phil*rhol_f
-!Biot coefficients for the input phi
-      D_biot = kappal_s*(1._CUSTOM_REAL + phil*(kappal_s/kappal_f - 1._CUSTOM_REAL))
-      H_biot = (kappal_s - kappal_fr)*(kappal_s - kappal_fr)/(D_biot - kappal_fr) + &
-                kappal_fr + 4._CUSTOM_REAL*mul_fr/3._CUSTOM_REAL
-      C_biot = kappal_s*(kappal_s - kappal_fr)/(D_biot - kappal_fr)
-      M_biot = kappal_s*kappal_s/(D_biot - kappal_fr)
-
-!T = G:grad u_s + C_biot div w I
-!and T_f = C_biot div u_s I + M_biot div w I
-      mul_G = mul_fr
-      lambdal_G = H_biot - 2._CUSTOM_REAL*mul_fr
-      lambdalplus2mul_G = lambdal_G + 2._CUSTOM_REAL*mul_G
-
-! derivative along x,y,z for u_s and w
-              tempx1ls = 0.
-              tempx2ls = 0.
-              tempx3ls = 0.
-
-              tempy1ls = 0.
-              tempy2ls = 0.
-              tempy3ls = 0.
-
-              tempz1ls = 0.
-              tempz2ls = 0.
-              tempz3ls = 0.
-
-              tempx1lw = 0.
-              tempx2lw = 0.
-              tempx3lw = 0.
-
-              tempy1lw = 0.
-              tempy2lw = 0.
-              tempy3lw = 0.
-
-              tempz1lw = 0.
-              tempz2lw = 0.
-              tempz3lw = 0.
-
-! first double loop over GLL points to compute and store gradients
-          do l = 1,NGLLX
-                hp1 = hprime_xx(i,l)
-                iglob = ibool(l,j,k,ispec_po)
-                tempx1ls = tempx1ls + displs_poroelastic(1,iglob)*hp1
-                tempy1ls = tempy1ls + displs_poroelastic(2,iglob)*hp1
-                tempz1ls = tempz1ls + displs_poroelastic(3,iglob)*hp1
-                tempx1lw = tempx1lw + displw_poroelastic(1,iglob)*hp1
-                tempy1lw = tempy1lw + displw_poroelastic(2,iglob)*hp1
-                tempz1lw = tempz1lw + displw_poroelastic(3,iglob)*hp1
-                ! adjoint simulations
-                if (SIMULATION_TYPE == 3) then
-                  ! to do
-                  stop 'compute_coupling_elastic_po() : adjoint run not implemented yet'
-
-                  ! dummy to avoid compiler warnings
-                  iglob = NGLOB_ADJOINT
-                  iglob = NSPEC_ADJOINT
-
-                endif ! adjoint
-    !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-    !!! can merge these loops because NGLLX = NGLLY = NGLLZ          do
-    !l=1,NGLLY
-                hp2 = hprime_yy(j,l)
-                iglob = ibool(i,l,k,ispec_po)
-                tempx2ls = tempx2ls + displs_poroelastic(1,iglob)*hp2
-                tempy2ls = tempy2ls + displs_poroelastic(2,iglob)*hp2
-                tempz2ls = tempz2ls + displs_poroelastic(3,iglob)*hp2
-                tempx2lw = tempx2lw + displw_poroelastic(1,iglob)*hp2
-                tempy2lw = tempy2lw + displw_poroelastic(2,iglob)*hp2
-                tempz2lw = tempz2lw + displw_poroelastic(3,iglob)*hp2
-                ! adjoint simulations
-                if (SIMULATION_TYPE == 3) then
-                endif ! adjoint
-    !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-    !!! can merge these loops because NGLLX = NGLLY = NGLLZ          do
-    !l=1,NGLLZ
-                hp3 = hprime_zz(k,l)
-                iglob = ibool(i,j,l,ispec_po)
-                tempx3ls = tempx3ls + displs_poroelastic(1,iglob)*hp3
-                tempy3ls = tempy3ls + displs_poroelastic(2,iglob)*hp3
-                tempz3ls = tempz3ls + displs_poroelastic(3,iglob)*hp3
-                tempx3lw = tempx3lw + displw_poroelastic(1,iglob)*hp3
-                tempy3lw = tempy3lw + displw_poroelastic(2,iglob)*hp3
-                tempz3lw = tempz3lw + displw_poroelastic(3,iglob)*hp3
-                ! adjoint simulations
-                if (SIMULATION_TYPE == 3) then
-                endif ! adjoint
-          enddo
-
-              xixl = xix(i,j,k,ispec_po)
-              xiyl = xiy(i,j,k,ispec_po)
-              xizl = xiz(i,j,k,ispec_po)
-              etaxl = etax(i,j,k,ispec_po)
-              etayl = etay(i,j,k,ispec_po)
-              etazl = etaz(i,j,k,ispec_po)
-              gammaxl = gammax(i,j,k,ispec_po)
-              gammayl = gammay(i,j,k,ispec_po)
-              gammazl = gammaz(i,j,k,ispec_po)
-              jacobianl = jacobian(i,j,k,ispec_po)
-
-! derivatives of displacement
-              duxdxl = xixl*tempx1ls + etaxl*tempx2ls + gammaxl*tempx3ls
-              duxdyl = xiyl*tempx1ls + etayl*tempx2ls + gammayl*tempx3ls
-              duxdzl = xizl*tempx1ls + etazl*tempx2ls + gammazl*tempx3ls
-
-              duydxl = xixl*tempy1ls + etaxl*tempy2ls + gammaxl*tempy3ls
-              duydyl = xiyl*tempy1ls + etayl*tempy2ls + gammayl*tempy3ls
-              duydzl = xizl*tempy1ls + etazl*tempy2ls + gammazl*tempy3ls
-
-              duzdxl = xixl*tempz1ls + etaxl*tempz2ls + gammaxl*tempz3ls
-              duzdyl = xiyl*tempz1ls + etayl*tempz2ls + gammayl*tempz3ls
-              duzdzl = xizl*tempz1ls + etazl*tempz2ls + gammazl*tempz3ls
-
-              dwxdxl = xixl*tempx1lw + etaxl*tempx2lw + gammaxl*tempx3lw
-              dwxdyl = xiyl*tempx1lw + etayl*tempx2lw + gammayl*tempx3lw
-              dwxdzl = xizl*tempx1lw + etazl*tempx2lw + gammazl*tempx3lw
-
-              dwydxl = xixl*tempy1lw + etaxl*tempy2lw + gammaxl*tempy3lw
-              dwydyl = xiyl*tempy1lw + etayl*tempy2lw + gammayl*tempy3lw
-              dwydzl = xizl*tempy1lw + etazl*tempy2lw + gammazl*tempy3lw
-
-              dwzdxl = xixl*tempz1lw + etaxl*tempz2lw + gammaxl*tempz3lw
-              dwzdyl = xiyl*tempz1lw + etayl*tempz2lw + gammayl*tempz3lw
-              dwzdzl = xizl*tempz1lw + etazl*tempz2lw + gammazl*tempz3lw
-
-    ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl_plus_duzdzl = duxdxl + duydyl + duzdzl
-              dwxdxl_plus_dwydyl_plus_dwzdzl = dwxdxl + dwydyl + dwzdzl
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-! compute stress tensor (include attenuation or anisotropy if needed)
-
-!  if(VISCOATTENUATION) then
-!chris:check
-
-! Dissipation only controlled by frame share attenuation in poroelastic (see
-! Morency & Tromp, GJI 2008).
-! attenuation is implemented following the memory variable formulation of
-! J. M. Carcione, Seismic modeling in viscoelastic media, Geophysics,
-! vol. 58(1), p. 110-120 (1993). More details can be found in
-! J. M. Carcione, D. Kosloff and R. Kosloff, Wave propagation simulation in a
-! linear
-! viscoelastic medium, Geophysical Journal International, vol. 95, p. 597-611
-! (1988).
-
-!  else
-
-! no attenuation
-    sigma_xx =  lambdalplus2mul_G*duxdxl + lambdal_G*duydyl_plus_duzdzl + C_biot*dwxdxl_plus_dwydyl_plus_dwzdzl
-    sigma_yy =  lambdalplus2mul_G*duydyl + lambdal_G*duxdxl_plus_duzdzl + C_biot*dwxdxl_plus_dwydyl_plus_dwzdzl
-    sigma_zz =  lambdalplus2mul_G*duzdzl + lambdal_G*duxdxl_plus_duydyl + C_biot*dwxdxl_plus_dwydyl_plus_dwzdzl
-
-    sigma_xy =  mul_G*duxdyl_plus_duydxl
-    sigma_xz =  mul_G*duzdxl_plus_duxdzl
-    sigma_yz =  mul_G*duzdyl_plus_duydzl
-
-
-    !-----------------------
-    ! from the elastic side
-    !-----------------------
-        i = coupling_po_el_ijk(1,igll,iface)
-        j = coupling_po_el_ijk(2,igll,iface)
-        k = coupling_po_el_ijk(3,igll,iface)
-
-        ! gets global index of this common GLL point
-        ! (note: should be the same as for corresponding
-        ! i',j',k',ispec_poroelastic or ispec_elastic )
-        iglob_el = ibool(i,j,k,ispec_el)
-        if (iglob_el .ne. iglob_po) stop 'poroelastic-elastic coupling error'
-          tempx1l = 0.
-          tempx2l = 0.
-          tempx3l = 0.
-
-          tempy1l = 0.
-          tempy2l = 0.
-          tempy3l = 0.
-
-          tempz1l = 0.
-          tempz2l = 0.
-          tempz3l = 0.
-
-          do l=1,NGLLX
-            hp1 = hprime_xx(i,l)
-            iglob = ibool(l,j,k,ispec_el)
-            tempx1l = tempx1l + displ(1,iglob)*hp1
-            tempy1l = tempy1l + displ(2,iglob)*hp1
-            tempz1l = tempz1l + displ(3,iglob)*hp1
-
-            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
-            hp2 = hprime_yy(j,l)
-            iglob = ibool(i,l,k,ispec_el)
-            tempx2l = tempx2l + displ(1,iglob)*hp2
-            tempy2l = tempy2l + displ(2,iglob)*hp2
-            tempz2l = tempz2l + displ(3,iglob)*hp2
-
-            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
-            hp3 = hprime_zz(k,l)
-            iglob = ibool(i,j,l,ispec_el)
-            tempx3l = tempx3l + displ(1,iglob)*hp3
-            tempy3l = tempy3l + displ(2,iglob)*hp3
-            tempz3l = tempz3l + displ(3,iglob)*hp3
-          enddo
-
-          ! get derivatives of ux, uy and uz with respect to x, y and z
-          xixl = xix(i,j,k,ispec_el)
-          xiyl = xiy(i,j,k,ispec_el)
-          xizl = xiz(i,j,k,ispec_el)
-          etaxl = etax(i,j,k,ispec_el)
-          etayl = etay(i,j,k,ispec_el)
-          etazl = etaz(i,j,k,ispec_el)
-          gammaxl = gammax(i,j,k,ispec_el)
-          gammayl = gammay(i,j,k,ispec_el)
-          gammazl = gammaz(i,j,k,ispec_el)
-          jacobianl = jacobian(i,j,k,ispec_el)
-
-          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
-
-          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
-          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
-          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
-
-          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
-          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
-          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
-
-          ! precompute some sums to save CPU time
-          duxdxl_plus_duydyl = duxdxl + duydyl
-          duxdxl_plus_duzdzl = duxdxl + duzdzl
-          duydyl_plus_duzdzl = duydyl + duzdzl
-          duxdyl_plus_duydxl = duxdyl + duydxl
-          duzdxl_plus_duxdzl = duzdxl + duxdzl
-          duzdyl_plus_duydzl = duzdyl + duydzl
-
-          kappal = kappastore(i,j,k,ispec_el)
-          mul = mustore(i,j,k,ispec_el)
-
-          ! full anisotropic case, stress calculations
-          if(ANISOTROPY) then
-            c11 = c11store(i,j,k,ispec_el)
-            c12 = c12store(i,j,k,ispec_el)
-            c13 = c13store(i,j,k,ispec_el)
-            c14 = c14store(i,j,k,ispec_el)
-            c15 = c15store(i,j,k,ispec_el)
-            c16 = c16store(i,j,k,ispec_el)
-            c22 = c22store(i,j,k,ispec_el)
-            c23 = c23store(i,j,k,ispec_el)
-            c24 = c24store(i,j,k,ispec_el)
-            c25 = c25store(i,j,k,ispec_el)
-            c26 = c26store(i,j,k,ispec_el)
-            c33 = c33store(i,j,k,ispec_el)
-            c34 = c34store(i,j,k,ispec_el)
-            c35 = c35store(i,j,k,ispec_el)
-            c36 = c36store(i,j,k,ispec_el)
-            c44 = c44store(i,j,k,ispec_el)
-            c45 = c45store(i,j,k,ispec_el)
-            c46 = c46store(i,j,k,ispec_el)
-            c55 = c55store(i,j,k,ispec_el)
-            c56 = c56store(i,j,k,ispec_el)
-            c66 = c66store(i,j,k,ispec_el)
-
-            sigma_xx = sigma_xx + c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                      c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-            sigma_yy = sigma_yy + c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                      c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-            sigma_zz = sigma_zz + c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                      c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-            sigma_xy = sigma_xy + c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                      c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-            sigma_xz = sigma_xz + c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                      c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-            sigma_yz = sigma_yz + c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                      c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-          else
-
-            ! isotropic case
-            lambdalplus2mul = kappal + FOUR_THIRDS * mul
-            lambdal = lambdalplus2mul - 2.*mul
-
-            ! compute stress sigma
-            sigma_xx = sigma_xx + lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-            sigma_yy = sigma_yy + lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-            sigma_zz = sigma_zz + lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-            sigma_xy = sigma_xy + mul*duxdyl_plus_duydxl
-            sigma_xz = sigma_xz + mul*duzdxl_plus_duxdzl
-            sigma_yz = sigma_yz + mul*duzdyl_plus_duydzl
-
-          endif ! ANISOTROPY
-
-        ! gets associated normal on GLL point
-        ! (note convention: pointing outwards of poroelastic element)
-        nx = coupling_el_po_normal(1,igll,iface)
-        ny = coupling_el_po_normal(2,igll,iface)
-        nz = coupling_el_po_normal(3,igll,iface)
-
-        ! gets associated, weighted 2D jacobian
-        ! (note: should be the same for poroelastic and elastic element)
-        jacobianw = coupling_el_po_jacobian2Dw(igll,iface)
-
-        ! continuity of displacement and traction on global point
-        !
-        ! note: continuity of displacement is enforced after the velocity update
-        accel(1,iglob_el) = accel(1,iglob_el) - jacobianw* &
-                 ( sigma_xx*nx + sigma_xy*ny + sigma_xz*nz )/2.d0
-
-        accel(2,iglob_el) = accel(2,iglob_el) - jacobianw* &
-                 ( sigma_xy*nx + sigma_yy*ny + sigma_yz*nz )/2.d0
-
-        accel(3,iglob_el) = accel(3,iglob_el) - jacobianw* &
-                 ( sigma_xz*nx + sigma_yz*ny + sigma_zz*nz )/2.d0
-
-      enddo ! igll
-
-    endif
-
-  enddo ! iface
-
-end subroutine compute_coupling_elastic_po
-

Copied: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90 (from rev 21238, seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_elastic_ac.f90)
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -0,0 +1,229 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 1
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                             July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+! for elastic solver
+
+  subroutine compute_coupling_viscoelastic_ac(NSPEC_AB,NGLOB_AB, &
+                        ibool,accel,potential_dot_dot_acoustic, &
+                        num_coupling_ac_el_faces, &
+                        coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                        coupling_ac_el_normal, &
+                        coupling_ac_el_jacobian2Dw, &
+                        ispec_is_inner,phase_is_inner)
+
+! returns the updated acceleration array: accel
+
+  implicit none
+  include 'constants.h'
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and pressure
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: accel
+  real(kind=CUSTOM_REAL), dimension(NGLOB_AB) :: potential_dot_dot_acoustic
+
+! global indexing
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+
+! acoustic-elastic coupling surface
+  integer :: num_coupling_ac_el_faces
+  real(kind=CUSTOM_REAL) :: coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces)
+  real(kind=CUSTOM_REAL) :: coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces)
+  integer :: coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces)
+  integer :: coupling_ac_el_ispec(num_coupling_ac_el_faces)
+
+! communication overlap
+  logical, dimension(NSPEC_AB) :: ispec_is_inner
+  logical :: phase_is_inner
+
+! local parameters
+  real(kind=CUSTOM_REAL) :: pressure
+  real(kind=CUSTOM_REAL) :: nx,ny,nz,jacobianw
+
+  integer :: iface,igll,ispec,iglob
+  integer :: i,j,k
+
+! loops on all coupling faces
+  do iface = 1,num_coupling_ac_el_faces
+
+    ! gets corresponding spectral element
+    ! (note: can be either acoustic or elastic element, no need to specify since
+    !           no material properties are needed for this coupling term)
+    ispec = coupling_ac_el_ispec(iface)
+
+    if( ispec_is_inner(ispec) .eqv. phase_is_inner ) then
+
+      ! loops over common GLL points
+      do igll = 1, NGLLSQUARE
+        i = coupling_ac_el_ijk(1,igll,iface)
+        j = coupling_ac_el_ijk(2,igll,iface)
+        k = coupling_ac_el_ijk(3,igll,iface)
+
+        ! gets global index of this common GLL point
+        ! (note: should be the same as for corresponding i',j',k',ispec_elastic or ispec_elastic )
+        iglob = ibool(i,j,k,ispec)
+
+        ! acoustic pressure on global point
+        pressure = - potential_dot_dot_acoustic(iglob)
+
+        ! gets associated normal on GLL point
+        ! (note convention: pointing outwards of acoustic element)
+        nx = coupling_ac_el_normal(1,igll,iface)
+        ny = coupling_ac_el_normal(2,igll,iface)
+        nz = coupling_ac_el_normal(3,igll,iface)
+
+        ! gets associated, weighted 2D jacobian
+        ! (note: should be the same for elastic and acoustic element)
+        jacobianw = coupling_ac_el_jacobian2Dw(igll,iface)
+
+        ! continuity of displacement and pressure on global point
+        !
+        ! note: Newmark time scheme together with definition of scalar potential:
+        !          pressure = - chi_dot_dot
+        !          requires that this coupling term uses the *UPDATED* pressure (chi_dot_dot), i.e.
+        !          pressure at time step [t + delta_t]
+        !          (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...)
+        !          it means you have to calculate and update the acoustic pressure first before
+        !          calculating this term...
+        accel(1,iglob) = accel(1,iglob) + jacobianw*nx*pressure
+        accel(2,iglob) = accel(2,iglob) + jacobianw*ny*pressure
+        accel(3,iglob) = accel(3,iglob) + jacobianw*nz*pressure
+
+      enddo ! igll
+
+    endif
+
+  enddo ! iface
+
+end subroutine compute_coupling_viscoelastic_ac
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine compute_coupling_ocean(NSPEC_AB,NGLOB_AB, &
+                                    ibool,rmassx,rmassy,rmassz, &
+                                    rmass_ocean_load,accel, &
+                                    free_surface_normal,free_surface_ijk,free_surface_ispec, &
+                                    num_free_surface_faces,SIMULATION_TYPE, &
+                                    NGLOB_ADJOINT,b_accel)
+
+! updates acceleration with ocean load term:
+! approximates ocean-bottom continuity of pressure & displacement for longer period waves (> ~20s ),
+! assuming incompressible fluid column above bathymetry ocean bottom
+
+  implicit none
+
+  include 'constants.h'
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+  real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_AB),intent(inout) :: accel
+  real(kind=CUSTOM_REAL),dimension(NGLOB_AB),intent(in) :: rmassx,rmassy,rmassz
+  real(kind=CUSTOM_REAL),dimension(NGLOB_AB),intent(in) :: rmass_ocean_load
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB),intent(in) :: ibool
+
+  ! free surface
+  integer :: num_free_surface_faces
+  real(kind=CUSTOM_REAL) :: free_surface_normal(NDIM,NGLLSQUARE,num_free_surface_faces)
+  integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces)
+  integer :: free_surface_ispec(num_free_surface_faces)
+
+  ! adjoint simulations
+  integer :: SIMULATION_TYPE,NGLOB_ADJOINT
+  real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel
+
+! local parameters
+  real(kind=CUSTOM_REAL) :: nx,ny,nz
+  real(kind=CUSTOM_REAL) :: force_normal_comp
+  integer :: i,j,k,ispec,iglob
+  integer :: igll,iface
+  logical,dimension(NGLOB_AB) :: updated_dof_ocean_load
+  ! adjoint locals
+  real(kind=CUSTOM_REAL) :: b_force_normal_comp
+
+  !   initialize the updates
+  updated_dof_ocean_load(:) = .false.
+
+  ! for surface elements exactly at the top of the model (ocean bottom)
+  do iface = 1,num_free_surface_faces
+
+    ispec = free_surface_ispec(iface)
+    do igll = 1, NGLLSQUARE
+      i = free_surface_ijk(1,igll,iface)
+      j = free_surface_ijk(2,igll,iface)
+      k = free_surface_ijk(3,igll,iface)
+
+      ! get global point number
+      iglob = ibool(i,j,k,ispec)
+
+      ! only update once
+      if(.not. updated_dof_ocean_load(iglob)) then
+
+        ! get normal
+        nx = free_surface_normal(1,igll,iface)
+        ny = free_surface_normal(2,igll,iface)
+        nz = free_surface_normal(3,igll,iface)
+
+        ! make updated component of right-hand side
+        ! we divide by rmass() which is 1 / M
+        ! we use the total force which includes the Coriolis term above
+        force_normal_comp = accel(1,iglob)*nx / rmassx(iglob) &
+                            + accel(2,iglob)*ny / rmassy(iglob) &
+                            + accel(3,iglob)*nz / rmassz(iglob)
+
+        accel(1,iglob) = accel(1,iglob) &
+          + (rmass_ocean_load(iglob) - rmassx(iglob)) * force_normal_comp * nx
+        accel(2,iglob) = accel(2,iglob) &
+          + (rmass_ocean_load(iglob) - rmassy(iglob)) * force_normal_comp * ny
+        accel(3,iglob) = accel(3,iglob) &
+          + (rmass_ocean_load(iglob) - rmassz(iglob)) * force_normal_comp * nz
+
+        ! adjoint simulations
+        if (SIMULATION_TYPE == 3) then
+          b_force_normal_comp = b_accel(1,iglob)*nx / rmassx(iglob) &
+                                + b_accel(2,iglob)*ny / rmassy(iglob) &
+                                + b_accel(3,iglob)*nz / rmassz(iglob)
+
+          b_accel(1,iglob) = b_accel(1,iglob) &
+            + (rmass_ocean_load(iglob) - rmassx(iglob)) * b_force_normal_comp * nx
+          b_accel(2,iglob) = b_accel(2,iglob) &
+            + (rmass_ocean_load(iglob) - rmassy(iglob)) * b_force_normal_comp * ny
+          b_accel(3,iglob) = b_accel(3,iglob) &
+            + (rmass_ocean_load(iglob) - rmassz(iglob)) * b_force_normal_comp * nz
+        endif !adjoint
+
+        ! done with this point
+        updated_dof_ocean_load(iglob) = .true.
+
+      endif
+
+    enddo ! igll
+  enddo ! iface
+
+  end subroutine compute_coupling_ocean
+

Copied: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_po.f90 (from rev 21238, seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_elastic_po.f90)
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_po.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_po.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -0,0 +1,503 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 1
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                             July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+! for elastic solver
+
+  subroutine compute_coupling_viscoelastic_po(NSPEC_AB,NGLOB_AB,ibool,&
+                        displs_poroelastic,displw_poroelastic,&
+                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                        hprime_xx,hprime_yy,hprime_zz,&
+                        kappaarraystore,rhoarraystore,mustore, &
+                        phistore,tortstore,jacobian,&
+                        displ,accel,kappastore, &
+                        ANISOTROPY,NSPEC_ANISO, &
+                        c11store,c12store,c13store,c14store,c15store,c16store,&
+                        c22store,c23store,c24store,c25store,c26store,c33store,&
+                        c34store,c35store,c36store,c44store,c45store,c46store,&
+                        c55store,c56store,c66store, &
+                        SIMULATION_TYPE,NGLOB_ADJOINT,NSPEC_ADJOINT, &
+                        num_coupling_el_po_faces, &
+                        coupling_el_po_ispec,coupling_po_el_ispec, &
+                        coupling_el_po_ijk,coupling_po_el_ijk, &
+                        coupling_el_po_normal, &
+                        coupling_el_po_jacobian2Dw, &
+                        ispec_is_inner,phase_is_inner)
+
+! returns the updated accelerations array: accel
+
+  implicit none
+  include 'constants.h'
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacements, etc
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displs_poroelastic,&
+                                                      displw_poroelastic
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! global indexing
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+
+  integer :: SIMULATION_TYPE
+  integer :: NGLOB_ADJOINT,NSPEC_ADJOINT
+
+! elastic-poroelastic coupling surface
+  integer :: num_coupling_el_po_faces
+  real(kind=CUSTOM_REAL) :: coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces)
+  real(kind=CUSTOM_REAL) :: coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces)
+  integer :: coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
+  integer :: coupling_po_el_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
+  integer :: coupling_el_po_ispec(num_coupling_el_po_faces)
+  integer :: coupling_po_el_ispec(num_coupling_el_po_faces)
+
+! properties
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        phistore,tortstore,jacobian
+  real(kind=CUSTOM_REAL), dimension(2,NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: rhoarraystore
+  real(kind=CUSTOM_REAL), dimension(3,NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: kappaarraystore
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy
+  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz
+
+! communication overlap
+  logical, dimension(NSPEC_AB) :: ispec_is_inner
+  logical :: phase_is_inner
+
+! local parameters
+  real(kind=CUSTOM_REAL) :: sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
+  real(kind=CUSTOM_REAL) :: rhol_s,rhol_f,phil,tortl,rhol_bar
+  real(kind=CUSTOM_REAL) :: nx,ny,nz,jacobianw
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+  real(kind=CUSTOM_REAL) :: kappal_s
+  real(kind=CUSTOM_REAL) :: kappal_f
+  real(kind=CUSTOM_REAL) :: mul_fr,kappal_fr
+  real(kind=CUSTOM_REAL) :: D_biot,H_biot,C_biot,M_biot
+  real(kind=CUSTOM_REAL) :: mul_G,lambdal_G,lambdalplus2mul_G
+
+! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer :: iface,igll,ispec_po,ispec_el,iglob,iglob_el,iglob_po
+  integer :: i,j,k,l
+
+  real(kind=CUSTOM_REAL) tempx1l,tempx2l,tempx3l
+  real(kind=CUSTOM_REAL) tempy1l,tempy2l,tempy3l
+  real(kind=CUSTOM_REAL) tempz1l,tempz2l,tempz3l
+  real(kind=CUSTOM_REAL) tempx1ls,tempx2ls,tempx3ls,tempx1lw,tempx2lw,tempx3lw
+  real(kind=CUSTOM_REAL) tempy1ls,tempy2ls,tempy3ls,tempy1lw,tempy2lw,tempy3lw
+  real(kind=CUSTOM_REAL) tempz1ls,tempz2ls,tempz3ls,tempz1lw,tempz2lw,tempz3lw
+
+  real(kind=CUSTOM_REAL) :: duxdxl,duydxl,duzdxl,duxdyl,duydyl,duzdyl,duxdzl,duydzl,duzdzl
+  real(kind=CUSTOM_REAL) :: dwxdxl,dwydxl,dwzdxl,dwxdyl,dwydyl,dwzdyl,dwxdzl,dwydzl,dwzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl_plus_duzdzl,dwxdxl_plus_dwydyl_plus_dwzdzl
+
+  real(kind=CUSTOM_REAL) hp1,hp2,hp3
+
+! Jacobian matrix and determinant
+  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+
+! loops on all coupling faces
+  do iface = 1,num_coupling_el_po_faces
+
+    ! gets corresponding poro/elastic spectral element
+    ispec_po = coupling_el_po_ispec(iface)
+    ispec_el = coupling_po_el_ispec(iface)
+
+    if( ispec_is_inner(ispec_el) .eqv. phase_is_inner ) then
+
+      ! loops over common GLL points
+      do igll = 1, NGLLSQUARE
+
+    !-----------------------
+    ! from the poroelastic side
+    !-----------------------
+        i = coupling_el_po_ijk(1,igll,iface)
+        j = coupling_el_po_ijk(2,igll,iface)
+        k = coupling_el_po_ijk(3,igll,iface)
+
+        iglob_po = ibool(i,j,k,ispec_po)
+
+! get poroelastic parameters of current local GLL
+    phil = phistore(i,j,k,ispec_po)
+    tortl = tortstore(i,j,k,ispec_po)
+!solid properties
+    kappal_s = kappaarraystore(1,i,j,k,ispec_po)
+    rhol_s = rhoarraystore(1,i,j,k,ispec_po)
+!fluid properties
+    kappal_f = kappaarraystore(2,i,j,k,ispec_po)
+    rhol_f = rhoarraystore(2,i,j,k,ispec_po)
+!frame properties
+    mul_fr = mustore(i,j,k,ispec_po)
+    kappal_fr = kappaarraystore(3,i,j,k,ispec_po)
+    rhol_bar =  (1._CUSTOM_REAL - phil)*rhol_s + phil*rhol_f
+!Biot coefficients for the input phi
+      D_biot = kappal_s*(1._CUSTOM_REAL + phil*(kappal_s/kappal_f - 1._CUSTOM_REAL))
+      H_biot = (kappal_s - kappal_fr)*(kappal_s - kappal_fr)/(D_biot - kappal_fr) + &
+                kappal_fr + 4._CUSTOM_REAL*mul_fr/3._CUSTOM_REAL
+      C_biot = kappal_s*(kappal_s - kappal_fr)/(D_biot - kappal_fr)
+      M_biot = kappal_s*kappal_s/(D_biot - kappal_fr)
+
+!T = G:grad u_s + C_biot div w I
+!and T_f = C_biot div u_s I + M_biot div w I
+      mul_G = mul_fr
+      lambdal_G = H_biot - 2._CUSTOM_REAL*mul_fr
+      lambdalplus2mul_G = lambdal_G + 2._CUSTOM_REAL*mul_G
+
+! derivative along x,y,z for u_s and w
+              tempx1ls = 0.
+              tempx2ls = 0.
+              tempx3ls = 0.
+
+              tempy1ls = 0.
+              tempy2ls = 0.
+              tempy3ls = 0.
+
+              tempz1ls = 0.
+              tempz2ls = 0.
+              tempz3ls = 0.
+
+              tempx1lw = 0.
+              tempx2lw = 0.
+              tempx3lw = 0.
+
+              tempy1lw = 0.
+              tempy2lw = 0.
+              tempy3lw = 0.
+
+              tempz1lw = 0.
+              tempz2lw = 0.
+              tempz3lw = 0.
+
+! first double loop over GLL points to compute and store gradients
+          do l = 1,NGLLX
+                hp1 = hprime_xx(i,l)
+                iglob = ibool(l,j,k,ispec_po)
+                tempx1ls = tempx1ls + displs_poroelastic(1,iglob)*hp1
+                tempy1ls = tempy1ls + displs_poroelastic(2,iglob)*hp1
+                tempz1ls = tempz1ls + displs_poroelastic(3,iglob)*hp1
+                tempx1lw = tempx1lw + displw_poroelastic(1,iglob)*hp1
+                tempy1lw = tempy1lw + displw_poroelastic(2,iglob)*hp1
+                tempz1lw = tempz1lw + displw_poroelastic(3,iglob)*hp1
+                ! adjoint simulations
+                if (SIMULATION_TYPE == 3) then
+                  ! to do
+                  stop 'compute_coupling_viscoelastic_po() : adjoint run not implemented yet'
+
+                  ! dummy to avoid compiler warnings
+                  iglob = NGLOB_ADJOINT
+                  iglob = NSPEC_ADJOINT
+
+                endif ! adjoint
+    !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+    !!! can merge these loops because NGLLX = NGLLY = NGLLZ          do
+    !l=1,NGLLY
+                hp2 = hprime_yy(j,l)
+                iglob = ibool(i,l,k,ispec_po)
+                tempx2ls = tempx2ls + displs_poroelastic(1,iglob)*hp2
+                tempy2ls = tempy2ls + displs_poroelastic(2,iglob)*hp2
+                tempz2ls = tempz2ls + displs_poroelastic(3,iglob)*hp2
+                tempx2lw = tempx2lw + displw_poroelastic(1,iglob)*hp2
+                tempy2lw = tempy2lw + displw_poroelastic(2,iglob)*hp2
+                tempz2lw = tempz2lw + displw_poroelastic(3,iglob)*hp2
+                ! adjoint simulations
+                if (SIMULATION_TYPE == 3) then
+                endif ! adjoint
+    !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+    !!! can merge these loops because NGLLX = NGLLY = NGLLZ          do
+    !l=1,NGLLZ
+                hp3 = hprime_zz(k,l)
+                iglob = ibool(i,j,l,ispec_po)
+                tempx3ls = tempx3ls + displs_poroelastic(1,iglob)*hp3
+                tempy3ls = tempy3ls + displs_poroelastic(2,iglob)*hp3
+                tempz3ls = tempz3ls + displs_poroelastic(3,iglob)*hp3
+                tempx3lw = tempx3lw + displw_poroelastic(1,iglob)*hp3
+                tempy3lw = tempy3lw + displw_poroelastic(2,iglob)*hp3
+                tempz3lw = tempz3lw + displw_poroelastic(3,iglob)*hp3
+                ! adjoint simulations
+                if (SIMULATION_TYPE == 3) then
+                endif ! adjoint
+          enddo
+
+              xixl = xix(i,j,k,ispec_po)
+              xiyl = xiy(i,j,k,ispec_po)
+              xizl = xiz(i,j,k,ispec_po)
+              etaxl = etax(i,j,k,ispec_po)
+              etayl = etay(i,j,k,ispec_po)
+              etazl = etaz(i,j,k,ispec_po)
+              gammaxl = gammax(i,j,k,ispec_po)
+              gammayl = gammay(i,j,k,ispec_po)
+              gammazl = gammaz(i,j,k,ispec_po)
+              jacobianl = jacobian(i,j,k,ispec_po)
+
+! derivatives of displacement
+              duxdxl = xixl*tempx1ls + etaxl*tempx2ls + gammaxl*tempx3ls
+              duxdyl = xiyl*tempx1ls + etayl*tempx2ls + gammayl*tempx3ls
+              duxdzl = xizl*tempx1ls + etazl*tempx2ls + gammazl*tempx3ls
+
+              duydxl = xixl*tempy1ls + etaxl*tempy2ls + gammaxl*tempy3ls
+              duydyl = xiyl*tempy1ls + etayl*tempy2ls + gammayl*tempy3ls
+              duydzl = xizl*tempy1ls + etazl*tempy2ls + gammazl*tempy3ls
+
+              duzdxl = xixl*tempz1ls + etaxl*tempz2ls + gammaxl*tempz3ls
+              duzdyl = xiyl*tempz1ls + etayl*tempz2ls + gammayl*tempz3ls
+              duzdzl = xizl*tempz1ls + etazl*tempz2ls + gammazl*tempz3ls
+
+              dwxdxl = xixl*tempx1lw + etaxl*tempx2lw + gammaxl*tempx3lw
+              dwxdyl = xiyl*tempx1lw + etayl*tempx2lw + gammayl*tempx3lw
+              dwxdzl = xizl*tempx1lw + etazl*tempx2lw + gammazl*tempx3lw
+
+              dwydxl = xixl*tempy1lw + etaxl*tempy2lw + gammaxl*tempy3lw
+              dwydyl = xiyl*tempy1lw + etayl*tempy2lw + gammayl*tempy3lw
+              dwydzl = xizl*tempy1lw + etazl*tempy2lw + gammazl*tempy3lw
+
+              dwzdxl = xixl*tempz1lw + etaxl*tempz2lw + gammaxl*tempz3lw
+              dwzdyl = xiyl*tempz1lw + etayl*tempz2lw + gammayl*tempz3lw
+              dwzdzl = xizl*tempz1lw + etazl*tempz2lw + gammazl*tempz3lw
+
+    ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl_plus_duzdzl = duxdxl + duydyl + duzdzl
+              dwxdxl_plus_dwydyl_plus_dwzdzl = dwxdxl + dwydyl + dwzdzl
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+! compute stress tensor (include attenuation or anisotropy if needed)
+
+!  if(VISCOATTENUATION) then
+!chris:check
+
+! Dissipation only controlled by frame share attenuation in poroelastic (see
+! Morency & Tromp, GJI 2008).
+! attenuation is implemented following the memory variable formulation of
+! J. M. Carcione, Seismic modeling in viscoelastic media, Geophysics,
+! vol. 58(1), p. 110-120 (1993). More details can be found in
+! J. M. Carcione, D. Kosloff and R. Kosloff, Wave propagation simulation in a
+! linear
+! viscoelastic medium, Geophysical Journal International, vol. 95, p. 597-611
+! (1988).
+
+!  else
+
+! no attenuation
+    sigma_xx =  lambdalplus2mul_G*duxdxl + lambdal_G*duydyl_plus_duzdzl + C_biot*dwxdxl_plus_dwydyl_plus_dwzdzl
+    sigma_yy =  lambdalplus2mul_G*duydyl + lambdal_G*duxdxl_plus_duzdzl + C_biot*dwxdxl_plus_dwydyl_plus_dwzdzl
+    sigma_zz =  lambdalplus2mul_G*duzdzl + lambdal_G*duxdxl_plus_duydyl + C_biot*dwxdxl_plus_dwydyl_plus_dwzdzl
+
+    sigma_xy =  mul_G*duxdyl_plus_duydxl
+    sigma_xz =  mul_G*duzdxl_plus_duxdzl
+    sigma_yz =  mul_G*duzdyl_plus_duydzl
+
+
+    !-----------------------
+    ! from the elastic side
+    !-----------------------
+        i = coupling_po_el_ijk(1,igll,iface)
+        j = coupling_po_el_ijk(2,igll,iface)
+        k = coupling_po_el_ijk(3,igll,iface)
+
+        ! gets global index of this common GLL point
+        ! (note: should be the same as for corresponding
+        ! i',j',k',ispec_poroelastic or ispec_elastic )
+        iglob_el = ibool(i,j,k,ispec_el)
+        if (iglob_el .ne. iglob_po) stop 'poroelastic-elastic coupling error'
+          tempx1l = 0.
+          tempx2l = 0.
+          tempx3l = 0.
+
+          tempy1l = 0.
+          tempy2l = 0.
+          tempy3l = 0.
+
+          tempz1l = 0.
+          tempz2l = 0.
+          tempz3l = 0.
+
+          do l=1,NGLLX
+            hp1 = hprime_xx(i,l)
+            iglob = ibool(l,j,k,ispec_el)
+            tempx1l = tempx1l + displ(1,iglob)*hp1
+            tempy1l = tempy1l + displ(2,iglob)*hp1
+            tempz1l = tempz1l + displ(3,iglob)*hp1
+
+            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+            hp2 = hprime_yy(j,l)
+            iglob = ibool(i,l,k,ispec_el)
+            tempx2l = tempx2l + displ(1,iglob)*hp2
+            tempy2l = tempy2l + displ(2,iglob)*hp2
+            tempz2l = tempz2l + displ(3,iglob)*hp2
+
+            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+            hp3 = hprime_zz(k,l)
+            iglob = ibool(i,j,l,ispec_el)
+            tempx3l = tempx3l + displ(1,iglob)*hp3
+            tempy3l = tempy3l + displ(2,iglob)*hp3
+            tempz3l = tempz3l + displ(3,iglob)*hp3
+          enddo
+
+          ! get derivatives of ux, uy and uz with respect to x, y and z
+          xixl = xix(i,j,k,ispec_el)
+          xiyl = xiy(i,j,k,ispec_el)
+          xizl = xiz(i,j,k,ispec_el)
+          etaxl = etax(i,j,k,ispec_el)
+          etayl = etay(i,j,k,ispec_el)
+          etazl = etaz(i,j,k,ispec_el)
+          gammaxl = gammax(i,j,k,ispec_el)
+          gammayl = gammay(i,j,k,ispec_el)
+          gammazl = gammaz(i,j,k,ispec_el)
+          jacobianl = jacobian(i,j,k,ispec_el)
+
+          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+
+          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
+          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
+          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
+
+          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
+          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
+          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
+
+          ! precompute some sums to save CPU time
+          duxdxl_plus_duydyl = duxdxl + duydyl
+          duxdxl_plus_duzdzl = duxdxl + duzdzl
+          duydyl_plus_duzdzl = duydyl + duzdzl
+          duxdyl_plus_duydxl = duxdyl + duydxl
+          duzdxl_plus_duxdzl = duzdxl + duxdzl
+          duzdyl_plus_duydzl = duzdyl + duydzl
+
+          kappal = kappastore(i,j,k,ispec_el)
+          mul = mustore(i,j,k,ispec_el)
+
+          ! full anisotropic case, stress calculations
+          if(ANISOTROPY) then
+            c11 = c11store(i,j,k,ispec_el)
+            c12 = c12store(i,j,k,ispec_el)
+            c13 = c13store(i,j,k,ispec_el)
+            c14 = c14store(i,j,k,ispec_el)
+            c15 = c15store(i,j,k,ispec_el)
+            c16 = c16store(i,j,k,ispec_el)
+            c22 = c22store(i,j,k,ispec_el)
+            c23 = c23store(i,j,k,ispec_el)
+            c24 = c24store(i,j,k,ispec_el)
+            c25 = c25store(i,j,k,ispec_el)
+            c26 = c26store(i,j,k,ispec_el)
+            c33 = c33store(i,j,k,ispec_el)
+            c34 = c34store(i,j,k,ispec_el)
+            c35 = c35store(i,j,k,ispec_el)
+            c36 = c36store(i,j,k,ispec_el)
+            c44 = c44store(i,j,k,ispec_el)
+            c45 = c45store(i,j,k,ispec_el)
+            c46 = c46store(i,j,k,ispec_el)
+            c55 = c55store(i,j,k,ispec_el)
+            c56 = c56store(i,j,k,ispec_el)
+            c66 = c66store(i,j,k,ispec_el)
+
+            sigma_xx = sigma_xx + c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                      c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+            sigma_yy = sigma_yy + c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                      c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+            sigma_zz = sigma_zz + c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                      c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+            sigma_xy = sigma_xy + c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                      c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+            sigma_xz = sigma_xz + c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                      c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+            sigma_yz = sigma_yz + c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                      c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+          else
+
+            ! isotropic case
+            lambdalplus2mul = kappal + FOUR_THIRDS * mul
+            lambdal = lambdalplus2mul - 2.*mul
+
+            ! compute stress sigma
+            sigma_xx = sigma_xx + lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+            sigma_yy = sigma_yy + lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+            sigma_zz = sigma_zz + lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+            sigma_xy = sigma_xy + mul*duxdyl_plus_duydxl
+            sigma_xz = sigma_xz + mul*duzdxl_plus_duxdzl
+            sigma_yz = sigma_yz + mul*duzdyl_plus_duydzl
+
+          endif ! ANISOTROPY
+
+        ! gets associated normal on GLL point
+        ! (note convention: pointing outwards of poroelastic element)
+        nx = coupling_el_po_normal(1,igll,iface)
+        ny = coupling_el_po_normal(2,igll,iface)
+        nz = coupling_el_po_normal(3,igll,iface)
+
+        ! gets associated, weighted 2D jacobian
+        ! (note: should be the same for poroelastic and elastic element)
+        jacobianw = coupling_el_po_jacobian2Dw(igll,iface)
+
+        ! continuity of displacement and traction on global point
+        !
+        ! note: continuity of displacement is enforced after the velocity update
+        accel(1,iglob_el) = accel(1,iglob_el) - jacobianw* &
+                 ( sigma_xx*nx + sigma_xy*ny + sigma_xz*nz )/2.d0
+
+        accel(2,iglob_el) = accel(2,iglob_el) - jacobianw* &
+                 ( sigma_xy*nx + sigma_yy*ny + sigma_yz*nz )/2.d0
+
+        accel(3,iglob_el) = accel(3,iglob_el) - jacobianw* &
+                 ( sigma_xz*nx + sigma_yz*ny + sigma_zz*nz )/2.d0
+
+      enddo ! igll
+
+    endif
+
+  enddo ! iface
+
+end subroutine compute_coupling_viscoelastic_po
+

Deleted: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_Dev.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_Dev.f90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_Dev.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -1,844 +0,0 @@
-!=====================================================================
-!
-!               S p e c f e m 3 D  V e r s i o n  2 . 1
-!               ---------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and CNRS / INRIA / University of Pau
-! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
-!                             July 2012
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-! Deville routine for NGLL == 5 (default)
-
-  subroutine compute_forces_elastic_Dev_5p( iphase ,NSPEC_AB,NGLOB_AB, &
-                                    displ,veloc,accel, &
-                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                    hprime_xx,hprime_xxT, &
-                                    hprimewgll_xx,hprimewgll_xxT, &
-                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                    kappastore,mustore,jacobian,ibool, &
-                                    ATTENUATION,deltat, &
-                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                                    NSPEC_ATTENUATION_AB, &
-                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                                    ANISOTROPY,NSPEC_ANISO, &
-                                    c11store,c12store,c13store,c14store,c15store,c16store,&
-                                    c22store,c23store,c24store,c25store,c26store,c33store,&
-                                    c34store,c35store,c36store,c44store,c45store,c46store,&
-                                    c55store,c56store,c66store, &
-                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                                    is_moho_top,is_moho_bot, &
-                                    dsdx_top,dsdx_bot, &
-                                    ispec2D_moho_top,ispec2D_moho_bot, &
-                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                                    phase_ispec_inner_elastic)
-
-
-! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-                      N_SLS,SAVE_MOHO_MESH, &
-                      ONE_THIRD,FOUR_THIRDS,m1,m2
-  use fault_solver_dynamic, only : Kelvin_Voigt_eta
-
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement, velocity and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,veloc,accel
-
-! time step
-  real(kind=CUSTOM_REAL) :: deltat
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,5) :: hprime_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX) :: hprime_xxT,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-      R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-    dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(5,5,5) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(5,5,5) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  real(kind=CUSTOM_REAL) duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att
-  real(kind=CUSTOM_REAL) duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(5,25) :: B1_m1_m2_5points,B2_m1_m2_5points,B3_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(5,25) :: C1_m1_m2_5points,C2_m1_m2_5points,C3_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(5,25) :: E1_m1_m2_5points,E2_m1_m2_5points,E3_m1_m2_5points
-
-  equivalence(dummyx_loc,B1_m1_m2_5points)
-  equivalence(dummyy_loc,B2_m1_m2_5points)
-  equivalence(dummyz_loc,B3_m1_m2_5points)
-  equivalence(tempx1,C1_m1_m2_5points)
-  equivalence(tempy1,C2_m1_m2_5points)
-  equivalence(tempz1,C3_m1_m2_5points)
-  equivalence(newtempx1,E1_m1_m2_5points)
-  equivalence(newtempy1,E2_m1_m2_5points)
-  equivalence(newtempz1,E3_m1_m2_5points)
-
-  real(kind=CUSTOM_REAL), dimension(5,5,5) :: &
-    tempx1_att,tempx2_att,tempx3_att,tempy1_att,tempy2_att,tempy3_att,tempz1_att,tempz2_att,tempz3_att
-  real(kind=CUSTOM_REAL), dimension(5,5,5) :: dummyx_loc_att,dummyy_loc_att,dummyz_loc_att
-  real(kind=CUSTOM_REAL), dimension(5,25) :: B1_m1_m2_5points_att,B2_m1_m2_5points_att,B3_m1_m2_5points_att
-  real(kind=CUSTOM_REAL), dimension(5,25) :: C1_m1_m2_5points_att,C2_m1_m2_5points_att,C3_m1_m2_5points_att
-
-  equivalence(dummyx_loc_att,B1_m1_m2_5points_att)
-  equivalence(dummyy_loc_att,B2_m1_m2_5points_att)
-  equivalence(dummyz_loc_att,B3_m1_m2_5points_att)
-  equivalence(tempx1_att,C1_m1_m2_5points_att)
-  equivalence(tempy1_att,C2_m1_m2_5points_att)
-  equivalence(tempz1_att,C3_m1_m2_5points_att)
-
-  real(kind=CUSTOM_REAL), dimension(25,5) :: &
-    A1_mxm_m2_m1_5points,A2_mxm_m2_m1_5points,A3_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(25,5) :: &
-    C1_mxm_m2_m1_5points,C2_mxm_m2_m1_5points,C3_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(25,5) :: &
-    E1_mxm_m2_m1_5points,E2_mxm_m2_m1_5points,E3_mxm_m2_m1_5points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_5points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_5points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_5points)
-  equivalence(tempx3,C1_mxm_m2_m1_5points)
-  equivalence(tempy3,C2_mxm_m2_m1_5points)
-  equivalence(tempz3,C3_mxm_m2_m1_5points)
-  equivalence(newtempx3,E1_mxm_m2_m1_5points)
-  equivalence(newtempy3,E2_mxm_m2_m1_5points)
-  equivalence(newtempz3,E3_mxm_m2_m1_5points)
-
-  real(kind=CUSTOM_REAL), dimension(25,5) :: &
-    A1_mxm_m2_m1_5points_att,A2_mxm_m2_m1_5points_att,A3_mxm_m2_m1_5points_att
-  real(kind=CUSTOM_REAL), dimension(25,5) :: &
-    C1_mxm_m2_m1_5points_att,C2_mxm_m2_m1_5points_att,C3_mxm_m2_m1_5points_att
-
-  equivalence(dummyx_loc_att,A1_mxm_m2_m1_5points_att)
-  equivalence(dummyy_loc_att,A2_mxm_m2_m1_5points_att)
-  equivalence(dummyz_loc_att,A3_mxm_m2_m1_5points_att)
-  equivalence(tempx3_att,C1_mxm_m2_m1_5points_att)
-  equivalence(tempy3_att,C2_mxm_m2_m1_5points_att)
-  equivalence(tempz3_att,C3_mxm_m2_m1_5points_att)
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-
-  real(kind=CUSTOM_REAL) :: eta
-
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-        ! returns element id from stored element list
-        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-        ! adjoint simulations: moho kernel
-        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-            ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-            ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-        endif ! adjoint
-
-       ! Kelvin Voigt damping: artificial viscosity around dynamic faults
-
-        ! stores displacment values in local array
-        if (allocated(Kelvin_Voigt_eta)) then
-          eta = Kelvin_Voigt_eta(ispec)   
-          do k=1,NGLLZ
-            do j=1,NGLLY
-              do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob) + eta*veloc(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob) + eta*veloc(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob) + eta*veloc(3,iglob)
-              enddo
-            enddo
-          enddo
-
-        else
-          do k=1,NGLLZ
-            do j=1,NGLLY
-              do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob)
-              enddo
-            enddo
-          enddo
-        endif
-
-        ! use first order Taylor expansion of displacement for local storage of stresses 
-        ! at this current time step, to fix attenuation in a consistent way
-        if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
-           do k=1,NGLLZ
-              do j=1,NGLLY
-                 do i=1,NGLLX
-                    iglob = ibool(i,j,k,ispec)
-                    dummyx_loc_att(i,j,k) = deltat*veloc(1,iglob)
-                    dummyy_loc_att(i,j,k) = deltat*veloc(2,iglob)
-                    dummyz_loc_att(i,j,k) = deltat*veloc(3,iglob)
-                 enddo
-              enddo
-           enddo
-        endif
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_5points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_5points(5,j)
-            C2_m1_m2_5points(i,j) = hprime_xx(i,1)*B2_m1_m2_5points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_5points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_5points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_5points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_5points(5,j)
-            C3_m1_m2_5points(i,j) = hprime_xx(i,1)*B3_m1_m2_5points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_5points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_5points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_5points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_5points(5,j)
-          enddo
-        enddo
-
-        if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
-           ! temporary variables used for fixing attenuation in a consistent way
-           do j=1,m2
-              do i=1,m1
-                 C1_m1_m2_5points_att(i,j) = C1_m1_m2_5points(i,j) + & 
-                      hprime_xx(i,1)*B1_m1_m2_5points_att(1,j) + &
-                      hprime_xx(i,2)*B1_m1_m2_5points_att(2,j) + &
-                      hprime_xx(i,3)*B1_m1_m2_5points_att(3,j) + &
-                      hprime_xx(i,4)*B1_m1_m2_5points_att(4,j) + &
-                      hprime_xx(i,5)*B1_m1_m2_5points_att(5,j)
-
-                 C2_m1_m2_5points_att(i,j) = C2_m1_m2_5points(i,j) + &
-                      hprime_xx(i,1)*B2_m1_m2_5points_att(1,j) + &
-                      hprime_xx(i,2)*B2_m1_m2_5points_att(2,j) + &
-                      hprime_xx(i,3)*B2_m1_m2_5points_att(3,j) + &
-                      hprime_xx(i,4)*B2_m1_m2_5points_att(4,j) + &
-                      hprime_xx(i,5)*B2_m1_m2_5points_att(5,j)
-
-                 C3_m1_m2_5points_att(i,j) = C3_m1_m2_5points(i,j) + &
-                      hprime_xx(i,1)*B3_m1_m2_5points_att(1,j) + &
-                      hprime_xx(i,2)*B3_m1_m2_5points_att(2,j) + &
-                      hprime_xx(i,3)*B3_m1_m2_5points_att(3,j) + &
-                      hprime_xx(i,4)*B3_m1_m2_5points_att(4,j) + &
-                      hprime_xx(i,5)*B3_m1_m2_5points_att(5,j)
-              enddo
-           enddo
-        endif
-
-        !   call mxm_m1_m1_5points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyx_loc(i,5,k)*hprime_xxT(5,j)
-              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyy_loc(i,5,k)*hprime_xxT(5,j)
-              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyz_loc(i,5,k)*hprime_xxT(5,j)
-            enddo
-          enddo
-        enddo
-
-        if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
-           ! temporary variables used for fixing attenuation in a consistent way
-           do j=1,m1
-              do i=1,m1
-                 ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-                 do k = 1,NGLLX
-                    tempx2_att(i,j,k) = tempx2(i,j,k) + &
-                         dummyx_loc_att(i,1,k)*hprime_xxT(1,j) + &
-                         dummyx_loc_att(i,2,k)*hprime_xxT(2,j) + &
-                         dummyx_loc_att(i,3,k)*hprime_xxT(3,j) + &
-                         dummyx_loc_att(i,4,k)*hprime_xxT(4,j) + &
-                         dummyx_loc_att(i,5,k)*hprime_xxT(5,j)
-
-                    tempy2_att(i,j,k) = tempy2(i,j,k) + &
-                         dummyy_loc_att(i,1,k)*hprime_xxT(1,j) + &
-                         dummyy_loc_att(i,2,k)*hprime_xxT(2,j) + &
-                         dummyy_loc_att(i,3,k)*hprime_xxT(3,j) + &
-                         dummyy_loc_att(i,4,k)*hprime_xxT(4,j) + &
-                         dummyy_loc_att(i,5,k)*hprime_xxT(5,j)
-
-                    tempz2_att(i,j,k) = tempz2(i,j,k) + &
-                         dummyz_loc_att(i,1,k)*hprime_xxT(1,j) + &
-                         dummyz_loc_att(i,2,k)*hprime_xxT(2,j) + &
-                         dummyz_loc_att(i,3,k)*hprime_xxT(3,j) + &
-                         dummyz_loc_att(i,4,k)*hprime_xxT(4,j) + &
-                         dummyz_loc_att(i,5,k)*hprime_xxT(5,j)
-                 enddo
-              enddo
-           enddo
-        endif
-
-        ! call mxm_m2_m1_5points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-            C2_mxm_m2_m1_5points(i,j) = A2_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-            C3_mxm_m2_m1_5points(i,j) = A3_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-          enddo
-        enddo
-
-        if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
-           ! temporary variables used for fixing attenuation in a consistent way
-           do j=1,m1
-              do i=1,m2
-                 C1_mxm_m2_m1_5points_att(i,j) = C1_mxm_m2_m1_5points(i,j) + &
-                      A1_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
-                      A1_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
-                      A1_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
-                      A1_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
-                      A1_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
-
-                 C2_mxm_m2_m1_5points_att(i,j) = C2_mxm_m2_m1_5points(i,j) + &
-                      A2_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
-                      A2_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
-                      A2_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
-                      A2_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
-                      A2_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
-
-                 C3_mxm_m2_m1_5points_att(i,j) = C3_mxm_m2_m1_5points(i,j) + &
-                      A3_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
-                      A3_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
-                      A3_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
-                      A3_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
-                      A3_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
-              enddo
-           enddo
-        endif
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! save strain on the Moho boundary
-              if (SAVE_MOHO_MESH ) then
-                if (is_moho_top(ispec)) then
-                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                else if (is_moho_bot(ispec)) then
-                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                endif
-              endif
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              if ( ATTENUATION .and. COMPUTE_AND_STORE_STRAIN ) then
-                 ! temporary variables used for fixing attenuation in a consistent way
-                 duxdxl_att = xixl*tempx1_att(i,j,k) + etaxl*tempx2_att(i,j,k) + gammaxl*tempx3_att(i,j,k)
-                 duxdyl_att = xiyl*tempx1_att(i,j,k) + etayl*tempx2_att(i,j,k) + gammayl*tempx3_att(i,j,k)
-                 duxdzl_att = xizl*tempx1_att(i,j,k) + etazl*tempx2_att(i,j,k) + gammazl*tempx3_att(i,j,k)
-
-                 duydxl_att = xixl*tempy1_att(i,j,k) + etaxl*tempy2_att(i,j,k) + gammaxl*tempy3_att(i,j,k)
-                 duydyl_att = xiyl*tempy1_att(i,j,k) + etayl*tempy2_att(i,j,k) + gammayl*tempy3_att(i,j,k)
-                 duydzl_att = xizl*tempy1_att(i,j,k) + etazl*tempy2_att(i,j,k) + gammazl*tempy3_att(i,j,k)
-
-                 duzdxl_att = xixl*tempz1_att(i,j,k) + etaxl*tempz2_att(i,j,k) + gammaxl*tempz3_att(i,j,k)
-                 duzdyl_att = xiyl*tempz1_att(i,j,k) + etayl*tempz2_att(i,j,k) + gammayl*tempz3_att(i,j,k)
-                 duzdzl_att = xizl*tempz1_att(i,j,k) + etazl*tempz2_att(i,j,k) + gammazl*tempz3_att(i,j,k)
-
-                 ! precompute some sums to save CPU time
-                 duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att
-                 duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att
-                 duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att
-
-                 ! compute deviatoric strain
-                 templ = ONE_THIRD * (duxdxl_att + duydyl_att + duzdzl_att)
-                 if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                 epsilondev_xx_loc(i,j,k) = duxdxl_att - templ
-                 epsilondev_yy_loc(i,j,k) = duydyl_att - templ
-                 epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl_att
-                 epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl_att
-                 epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl_att
-              else
-                 ! computes deviatoric strain attenuation and/or for kernel calculations
-                 if (COMPUTE_AND_STORE_STRAIN) then
-                    templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                    if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                    epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                    epsilondev_yy_loc(i,j,k) = duydyl - templ
-                    epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                    epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                    epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-                 endif
-              endif
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-
-              ! attenuation
-              if(ATTENUATION) then
-                ! use unrelaxed parameters if attenuation
-                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-              endif
-
-  ! full anisotropic case, stress calculations
-              if(ANISOTROPY) then
-                c11 = c11store(i,j,k,ispec)
-                c12 = c12store(i,j,k,ispec)
-                c13 = c13store(i,j,k,ispec)
-                c14 = c14store(i,j,k,ispec)
-                c15 = c15store(i,j,k,ispec)
-                c16 = c16store(i,j,k,ispec)
-                c22 = c22store(i,j,k,ispec)
-                c23 = c23store(i,j,k,ispec)
-                c24 = c24store(i,j,k,ispec)
-                c25 = c25store(i,j,k,ispec)
-                c26 = c26store(i,j,k,ispec)
-                c33 = c33store(i,j,k,ispec)
-                c34 = c34store(i,j,k,ispec)
-                c35 = c35store(i,j,k,ispec)
-                c36 = c36store(i,j,k,ispec)
-                c44 = c44store(i,j,k,ispec)
-                c45 = c45store(i,j,k,ispec)
-                c46 = c46store(i,j,k,ispec)
-                c55 = c55store(i,j,k,ispec)
-                c56 = c56store(i,j,k,ispec)
-                c66 = c66store(i,j,k,ispec)
-
-                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-              else
-
-  ! isotropic case
-                lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                lambdal = lambdalplus2mul - 2.*mul
-
-                ! compute stress sigma
-                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                sigma_xy = mul*duxdyl_plus_duydxl
-                sigma_xz = mul*duzdxl_plus_duxdzl
-                sigma_yz = mul*duzdyl_plus_duydzl
-
-              endif ! ANISOTROPY
-
-              ! subtract memory variables if attenuation
-              if(ATTENUATION) then
-! way 1
-!                do i_sls = 1,N_SLS
-!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-!                  sigma_xx = sigma_xx - R_xx_val
-!                  sigma_yy = sigma_yy - R_yy_val
-!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-!                enddo
-
-! way 2
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-              if(imodulo_N_SLS >= 1) then
-                do i_sls = 1,imodulo_N_SLS
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                enddo
-              endif
-
-              if(N_SLS >= imodulo_N_SLS+1) then
-                do i_sls = imodulo_N_SLS+1,N_SLS,3
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                  sigma_xx = sigma_xx - R_xx_val2
-                  sigma_yy = sigma_yy - R_yy_val2
-                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                  sigma_xx = sigma_xx - R_xx_val3
-                  sigma_yy = sigma_yy - R_yy_val3
-                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                enddo
-              endif
-
-
-              endif
-
-            ! define symmetric components of sigma
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
-
-            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_5points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-        do j=1,m2
-          do i=1,m1
-            E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_5points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C1_m1_m2_5points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C1_m1_m2_5points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C1_m1_m2_5points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C1_m1_m2_5points(5,j)
-            E2_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_5points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C2_m1_m2_5points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C2_m1_m2_5points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C2_m1_m2_5points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C2_m1_m2_5points(5,j)
-            E3_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_5points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C3_m1_m2_5points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C3_m1_m2_5points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C3_m1_m2_5points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C3_m1_m2_5points(5,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_5points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-        do i=1,m1
-          do j=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempx2(i,5,k)*hprimewgll_xx(5,j)
-              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempy2(i,5,k)*hprimewgll_xx(5,j)
-              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempz2(i,5,k)*hprimewgll_xx(5,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_5points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-        do j=1,m1
-          do i=1,m2
-            E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                      C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                      C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                      C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                      C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-            E2_mxm_m2_m1_5points(i,j) = C2_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                      C2_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                      C2_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                      C2_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                      C2_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-            E3_mxm_m2_m1_5points(i,j) = C3_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                      C3_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                      C3_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                      C3_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                      C3_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-
-              fac1 = wgllwgll_yz(j,k)
-              fac2 = wgllwgll_xz(i,k)
-              fac3 = wgllwgll_xy(i,j)
-
-              ! sum contributions from each element to the global mesh using indirect addressing
-              iglob = ibool(i,j,k,ispec)
-              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
-                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
-              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
-                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
-              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
-                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
-
-              !  update memory variables based upon the Runge-Kutta scheme
-              if(ATTENUATION) then
-
-                 ! use Runge-Kutta scheme to march in time
-                 do i_sls = 1,N_SLS
-
-                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                    alphaval_loc = alphaval(i_sls)
-                    betaval_loc = betaval(i_sls)
-                    gammaval_loc = gammaval(i_sls)
-
-                    ! term in xx
-                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yy
-                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in zz not computed since zero trace
-                    ! term in xy
-                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in xz
-                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yz
-                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-
-                 enddo   ! end of loop on memory variables
-
-              endif  !  end attenuation
-
-            enddo
-          enddo
-        enddo
-
-        ! save deviatoric strain for Runge-Kutta scheme
-        if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-        endif
-
-  enddo  ! spectral element loop
-
-  end subroutine compute_forces_elastic_Dev_5p
-

Deleted: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_calling_routine.F90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_calling_routine.F90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -1,554 +0,0 @@
-!=====================================================================
-!
-!               S p e c f e m 3 D  V e r s i o n  2 . 1
-!               ---------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and CNRS / INRIA / University of Pau
-! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
-!                             July 2012
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-! elastic solver
-
-subroutine compute_forces_elastic()
-
-  use specfem_par
-  use specfem_par_acoustic
-  use specfem_par_elastic
-  use specfem_par_poroelastic
-  use fault_solver_dynamic, only : bc_dynflt_set3d_all,SIMULATION_TYPE_DYN
-  use fault_solver_kinematic, only : bc_kinflt_set_all,SIMULATION_TYPE_KIN
-
-  implicit none
-
-  integer:: iphase
-  logical:: phase_is_inner
-
-! distinguishes two runs: for points on MPI interfaces, and points within the partitions
-  do iphase=1,2
-
-    !first for points on MPI interfaces
-    if( iphase == 1 ) then
-      phase_is_inner = .false.
-    else
-      phase_is_inner = .true.
-    endif
-
-
-! elastic term
-    if( .NOT. GPU_MODE ) then
-      if(USE_DEVILLE_PRODUCTS) then
-        ! uses Deville (2002) optimizations
-        call compute_forces_elastic_Dev_sim1(iphase)
-
-        ! adjoint simulations: backward/reconstructed wavefield
-        if( SIMULATION_TYPE == 3 ) &
-          call compute_forces_elastic_Dev_sim3(iphase)
-
-      else
-        ! no optimizations used
-        call compute_forces_elastic_noDev( iphase, NSPEC_AB,NGLOB_AB,displ,veloc,accel, &
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        hprime_xx,hprime_yy,hprime_zz, &
-                        hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                        kappastore,mustore,jacobian,ibool, &
-                        ATTENUATION,deltat,PML_CONDITIONS, &
-                        nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax,NSPEC2D_BOTTOM,NSPEC2D_TOP, &
-                        ibelm_xmin,ibelm_xmax,ibelm_ymin,ibelm_ymax,ibelm_bottom,ibelm_top, &
-                        one_minus_sum_beta,factor_common, &
-                        alphaval,betaval,gammaval,&
-                        NSPEC_ATTENUATION_AB, &
-                        R_xx,R_yy,R_xy,R_xz,R_yz, &
-                        epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                        epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                        ANISOTROPY,NSPEC_ANISO, &
-                        c11store,c12store,c13store,c14store,c15store,c16store, &
-                        c22store,c23store,c24store,c25store,c26store,c33store, &
-                        c34store,c35store,c36store,c44store,c45store,c46store, &
-                        c55store,c56store,c66store, &
-                        SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                        NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                        is_moho_top,is_moho_bot, &
-                        dsdx_top,dsdx_bot, &
-                        ispec2D_moho_top,ispec2D_moho_bot, &
-                        num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic, &
-                        phase_ispec_inner_elastic,ispec_is_elastic )
-
-        ! adjoint simulations: backward/reconstructed wavefield
-        if( SIMULATION_TYPE == 3 ) &
-          call compute_forces_elastic_noDev( iphase, NSPEC_AB,NGLOB_AB, &
-                        b_displ,b_veloc,b_accel, &
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        hprime_xx,hprime_yy,hprime_zz, &
-                        hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                        kappastore,mustore,jacobian,ibool, &
-                        ATTENUATION,deltat,PML_CONDITIONS, &
-                        nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax,NSPEC2D_BOTTOM,NSPEC2D_TOP, &
-                        ibelm_xmin,ibelm_xmax,ibelm_ymin,ibelm_ymax,ibelm_bottom,ibelm_top, &
-                        one_minus_sum_beta,factor_common, &
-                        b_alphaval,b_betaval,b_gammaval, &
-                        NSPEC_ATTENUATION_AB, &
-                        b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
-                        b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
-                        b_epsilondev_xz,b_epsilondev_yz,b_epsilon_trace_over_3, &
-                        ANISOTROPY,NSPEC_ANISO, &
-                        c11store,c12store,c13store,c14store,c15store,c16store, &
-                        c22store,c23store,c24store,c25store,c26store,c33store, &
-                        c34store,c35store,c36store,c44store,c45store,c46store, &
-                        c55store,c56store,c66store, &
-                        SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                        NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                        is_moho_top,is_moho_bot, &
-                        b_dsdx_top,b_dsdx_bot, &
-                        ispec2D_moho_top,ispec2D_moho_bot, &
-                        num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic, &
-                        phase_ispec_inner_elastic  )
-
-      endif
-
-    else
-      ! on GPU
-      ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
-      call compute_forces_elastic_cuda(Mesh_pointer, iphase, deltat, &
-                                      nspec_outer_elastic, &
-                                      nspec_inner_elastic, &
-                                      COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
-
-      if(phase_is_inner .eqv. .true.) then
-         ! while Inner elements compute "Kernel_2", we wait for MPI to
-         ! finish and transfer the boundary terms to the device
-         ! asynchronously
-
-         !daniel: todo - this avoids calling the fortran vector send from CUDA routine
-         ! wait for asynchronous copy to finish
-         call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_vector_ext_mesh)
-         ! sends mpi buffers
-         call assemble_MPI_vector_send_cuda(NPROC, &
-                  buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
-                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                  nibool_interfaces_ext_mesh,&
-                  my_neighbours_ext_mesh, &
-                  request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
-
-         ! transfers mpi buffers onto GPU
-         call transfer_boundary_to_device(NPROC,Mesh_pointer,buffer_recv_vector_ext_mesh, &
-                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                  request_recv_vector_ext_mesh)
-      endif ! inner elements
-
-   endif ! GPU_MODE
-
-
-! adds elastic absorbing boundary term to acceleration (Stacey conditions)
-    if( ABSORBING_CONDITIONS ) then
-       call compute_stacey_elastic(NSPEC_AB,NGLOB_AB,accel, &
-                        ibool,ispec_is_inner,phase_is_inner, &
-                        abs_boundary_normal,abs_boundary_jacobian2Dw, &
-                        abs_boundary_ijk,abs_boundary_ispec, &
-                        num_abs_boundary_faces, &
-                        veloc,rho_vp,rho_vs, &
-                        ispec_is_elastic,SIMULATION_TYPE,SAVE_FORWARD, &
-                        NSTEP,it,NGLOB_ADJOINT,b_accel, &
-                        b_num_abs_boundary_faces,b_reclen_field,b_absorb_field,&
-                        GPU_MODE,Mesh_pointer)
-    endif
-
-
-! acoustic coupling
-    if( ACOUSTIC_SIMULATION ) then
-      if( num_coupling_ac_el_faces > 0 ) then
-        if( .NOT. GPU_MODE ) then
-          if( SIMULATION_TYPE == 1 ) then
-            ! forward definition: pressure=-potential_dot_dot
-            call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, &
-                        ibool,accel,potential_dot_dot_acoustic, &
-                        num_coupling_ac_el_faces, &
-                        coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                        coupling_ac_el_normal, &
-                        coupling_ac_el_jacobian2Dw, &
-                        ispec_is_inner,phase_is_inner)
-          else
-            ! handles adjoint runs coupling between adjoint potential and adjoint elastic wavefield
-            ! adoint definition: pressure^\dagger=potential^\dagger
-            call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, &
-                              ibool,accel,-potential_acoustic_adj_coupling, &
-                              num_coupling_ac_el_faces, &
-                              coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                              coupling_ac_el_normal, &
-                              coupling_ac_el_jacobian2Dw, &
-                              ispec_is_inner,phase_is_inner)
-          endif
-
-        ! adjoint simulations
-        if( SIMULATION_TYPE == 3 ) &
-          call compute_coupling_elastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
-                        ibool,b_accel,b_potential_dot_dot_acoustic, &
-                        num_coupling_ac_el_faces, &
-                        coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                        coupling_ac_el_normal, &
-                        coupling_ac_el_jacobian2Dw, &
-                        ispec_is_inner,phase_is_inner)
-
-        else
-          ! on GPU
-          call compute_coupling_el_ac_cuda(Mesh_pointer,phase_is_inner, &
-                                           num_coupling_ac_el_faces)
-        endif ! GPU_MODE
-      endif ! num_coupling_ac_el_faces
-    endif
-
-
-! poroelastic coupling
-    if( POROELASTIC_SIMULATION ) then
-      call compute_coupling_elastic_po(NSPEC_AB,NGLOB_AB,ibool,&
-                        displs_poroelastic,displw_poroelastic,&
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        hprime_xx,hprime_yy,hprime_zz,&
-                        kappaarraystore,rhoarraystore,mustore, &
-                        phistore,tortstore,jacobian,&
-                        displ,accel,kappastore, &
-                        ANISOTROPY,NSPEC_ANISO, &
-                        c11store,c12store,c13store,c14store,c15store,c16store,&
-                        c22store,c23store,c24store,c25store,c26store,c33store,&
-                        c34store,c35store,c36store,c44store,c45store,c46store,&
-                        c55store,c56store,c66store, &
-                        SIMULATION_TYPE,NGLOB_ADJOINT,NSPEC_ADJOINT, &
-                        num_coupling_el_po_faces, &
-                        coupling_el_po_ispec,coupling_po_el_ispec, &
-                        coupling_el_po_ijk,coupling_po_el_ijk, &
-                        coupling_el_po_normal, &
-                        coupling_el_po_jacobian2Dw, &
-                        ispec_is_inner,phase_is_inner)
-    endif
-
-! adds source term (single-force/moment-tensor solution)
-    call compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, &
-                        ibool,ispec_is_inner,phase_is_inner, &
-                        NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,&
-                        hdur,hdur_gaussian,tshift_src,dt,t0,sourcearrays, &
-                        ispec_is_elastic,SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, &
-                        nrec,islice_selected_rec,ispec_selected_rec, &
-                        nadj_rec_local,adj_sourcearrays,b_accel, &
-                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY, &
-                        GPU_MODE, Mesh_pointer )
-
-    ! assemble all the contributions between slices using MPI
-    if( phase_is_inner .eqv. .false. ) then
-       ! sends accel values to corresponding MPI interface neighbors
-       if(.NOT. GPU_MODE) then
-          call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,accel, &
-               buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
-               num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-               nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
-               my_neighbours_ext_mesh, &
-               request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
-       else ! GPU_MODE==1
-          ! transfers boundary region to host asynchronously. The
-          ! MPI-send is done from within compute_forces_elastic_cuda,
-          ! once the inner element kernels are launched, and the
-          ! memcpy has finished. see compute_forces_elastic_cuda:1655
-          call transfer_boundary_from_device_a(Mesh_pointer,nspec_outer_elastic)
-       endif ! GPU_MODE
-
-       ! adjoint simulations
-       if( SIMULATION_TYPE == 3 ) then
-          if(.NOT. GPU_MODE) then
-             call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_accel, &
-                  b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
-                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                  nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
-                  my_neighbours_ext_mesh, &
-                  b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
-          else ! GPU_MODE == 1
-             call transfer_boun_accel_from_device(NGLOB_AB*NDIM, Mesh_pointer, b_accel,&
-                       b_buffer_send_vector_ext_mesh,&
-                       num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh,&
-                       nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh,3) ! <-- 3 == adjoint b_accel
-             call assemble_MPI_vector_send_cuda(NPROC, &
-                  b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
-                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                  nibool_interfaces_ext_mesh,&
-                  my_neighbours_ext_mesh, &
-                  b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
-          endif ! GPU
-       endif !adjoint
-
-    else
-      ! waits for send/receive requests to be completed and assembles values
-      if(.NOT. GPU_MODE) then
-         call assemble_MPI_vector_ext_mesh_w_ordered(NPROC,NGLOB_AB,accel, &
-                            buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
-                            max_nibool_interfaces_ext_mesh, &
-                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
-                            my_neighbours_ext_mesh,myrank)
-
-      else ! GPU_MODE == 1
-         call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,accel, Mesh_pointer,&
-                            buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
-                            max_nibool_interfaces_ext_mesh, &
-                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh,1)
-      endif
-      ! adjoint simulations
-      if( SIMULATION_TYPE == 3 ) then
-         if(.NOT. GPU_MODE) then
-            call assemble_MPI_vector_ext_mesh_w_ordered(NPROC,NGLOB_ADJOINT,b_accel, &
-                             b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
-                             max_nibool_interfaces_ext_mesh, &
-                             nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                             b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh, &
-                             my_neighbours_ext_mesh,myrank)
-
-         else ! GPU_MODE == 1
-            call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,b_accel, Mesh_pointer,&
-                              b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
-                              max_nibool_interfaces_ext_mesh, &
-                              nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                              b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh,3)
-         endif
-      endif !adjoint
-
-    endif
-
-    !! DK DK May 2009: removed this because now each slice of a CUBIT + SCOTCH mesh
-    !! DK DK May 2009: has a different number of spectral elements and therefore
-    !! DK DK May 2009: only the general non-blocking MPI routines assemble_MPI_vector_ext_mesh_s
-    !! DK DK May 2009: and assemble_MPI_vector_ext_mesh_w above can be used.
-    !! DK DK May 2009: For adjoint runs below (SIMULATION_TYPE == 3) they should be used as well.
-
- enddo
-
-!Percy , Fault boundary term B*tau is added to the assembled forces 
-!        which at this point are stored in the array 'accel'
-  if (SIMULATION_TYPE_DYN) call bc_dynflt_set3d_all(accel,veloc,displ)
-  
-  if (SIMULATION_TYPE_KIN) call bc_kinflt_set_all(accel,veloc,displ)
-
- ! multiplies with inverse of mass matrix (note: rmass has been inverted already)
- if(.NOT. GPU_MODE) then
-    accel(1,:) = accel(1,:)*rmassx(:)
-    accel(2,:) = accel(2,:)*rmassy(:)
-    accel(3,:) = accel(3,:)*rmassz(:)
-    ! adjoint simulations
-    if (SIMULATION_TYPE == 3) then
-       b_accel(1,:) = b_accel(1,:)*rmassx(:)
-       b_accel(2,:) = b_accel(2,:)*rmassy(:)
-       b_accel(3,:) = b_accel(3,:)*rmassz(:)
-    endif !adjoint
- else ! GPU_MODE == 1
-    call kernel_3_a_cuda(Mesh_pointer, NGLOB_AB, deltatover2,b_deltatover2,OCEANS)
- endif
-
-! updates acceleration with ocean load term
-  if(OCEANS) then
-    if( .NOT. GPU_MODE ) then
-      call compute_coupling_ocean(NSPEC_AB,NGLOB_AB, &
-                                  ibool,rmassx,rmassy,rmassz, &
-                                  rmass_ocean_load,accel, &
-                                  free_surface_normal,free_surface_ijk,free_surface_ispec, &
-                                  num_free_surface_faces,SIMULATION_TYPE, &
-                                  NGLOB_ADJOINT,b_accel)
-    else
-      ! on GPU
-      call compute_coupling_ocean_cuda(Mesh_pointer)
-    endif
-  endif
-
-! updates velocities
-! Newmark finite-difference time scheme with elastic domains:
-! (see e.g. Hughes, 1987; Chaljub et al., 2003)
-!
-! u(t+delta_t) = u(t) + delta_t  v(t) + 1/2  delta_t**2 a(t)
-! v(t+delta_t) = v(t) + 1/2 delta_t a(t) + 1/2 delta_t a(t+delta_t)
-! a(t+delta_t) = 1/M_elastic ( -K_elastic u(t+delta) + B_elastic chi_dot_dot(t+delta_t) + f( t+delta_t) )
-!
-! where
-!   u, v, a are displacement,velocity & acceleration
-!   M is mass matrix, K stiffness matrix and B boundary term for acoustic/elastic domains
-!   f denotes a source term (acoustic/elastic)
-!   chi_dot_dot is acoustic (fluid) potential ( dotted twice with respect to time)
-!
-! corrector:
-!   updates the velocity term which requires a(t+delta)
-! GPU_MODE: this is handled in 'kernel_3' at the same time as accel*rmass
-  if(.NOT. GPU_MODE) then
-     veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
-     ! adjoint simulations
-     if (SIMULATION_TYPE == 3) b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
-  else ! GPU_MODE == 1
-    if( OCEANS ) call kernel_3_b_cuda(Mesh_pointer, NGLOB_AB, deltatover2,b_deltatover2)
-  endif
-
-
-end subroutine compute_forces_elastic
-
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-! distributes routines according to chosen NGLLX in constants.h
-
-!daniel: note -- i put it here rather than in compute_forces_elastic_Dev.f90 because compiler complains that:
-! " The storage extent of the dummy argument exceeds that of the actual argument. "
-
-subroutine compute_forces_elastic_Dev_sim1(iphase)
-
-! forward simulations
-
-  use specfem_par
-  use specfem_par_acoustic
-  use specfem_par_elastic
-  use specfem_par_poroelastic
-
-  implicit none
-
-  integer,intent(in) :: iphase
-
-  select case(NGLLX)
-
-  case (5)
-
-!----------------------------------------------------------------------------------------------
-
-! OpenMP routine flag for testing & benchmarking forward runs only
-! configure additional flag, e.g.: FLAGS_NO_CHECK="-O3 -DOPENMP_MODE -openmp"
-
-!----------------------------------------------------------------------------------------------
-#ifdef OPENMP_MODE
-!! DK DK Jan 2013: beware, that OpenMP version is not maintained / supported and thus probably does not work
-    call compute_forces_elastic_Dev_openmp(iphase, NSPEC_AB,NGLOB_AB,displ,veloc,accel, &
-           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-           hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-           kappastore,mustore,jacobian,ibool, &
-           ATTENUATION,deltat, &
-           one_minus_sum_beta,factor_common, &
-           alphaval,betaval,gammaval, &
-           NSPEC_ATTENUATION_AB, &
-           R_xx,R_yy,R_xy,R_xz,R_yz, &
-           epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-           epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-           ANISOTROPY,NSPEC_ANISO, &
-           c11store,c12store,c13store,c14store,c15store,c16store,&
-           c22store,c23store,c24store,c25store,c26store,c33store,&
-           c34store,c35store,c36store,c44store,c45store,c46store,&
-           c55store,c56store,c66store, &
-           SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-           NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
-           is_moho_top,is_moho_bot, &
-           dsdx_top,dsdx_bot, &
-           ispec2D_moho_top,ispec2D_moho_bot, &
-           num_phase_ispec_elastic,&
-           phase_ispec_inner_elastic,&
-           num_colors_outer_elastic,num_colors_inner_elastic)
-#else
-    call compute_forces_elastic_Dev_5p(iphase, NSPEC_AB,NGLOB_AB,displ,veloc,accel, &
-             xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-             hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-             wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-             kappastore,mustore,jacobian,ibool, &
-             ATTENUATION,deltat, &
-             one_minus_sum_beta,factor_common, &
-             alphaval,betaval,gammaval, &
-             NSPEC_ATTENUATION_AB, &
-             R_xx,R_yy,R_xy,R_xz,R_yz, &
-             epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-             epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-             ANISOTROPY,NSPEC_ANISO, &
-             c11store,c12store,c13store,c14store,c15store,c16store,&
-             c22store,c23store,c24store,c25store,c26store,c33store,&
-             c34store,c35store,c36store,c44store,c45store,c46store,&
-             c55store,c56store,c66store, &
-             SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-             NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
-             is_moho_top,is_moho_bot, &
-             dsdx_top,dsdx_bot, &
-             ispec2D_moho_top,ispec2D_moho_bot, &
-             num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-             phase_ispec_inner_elastic )
-#endif
-
-  case default
-
-    stop 'error no Deville routine available for chosen NGLLX'
-
-  end select
-
-end subroutine compute_forces_elastic_Dev_sim1
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-
-subroutine compute_forces_elastic_Dev_sim3(iphase)
-
-! uses backward/reconstructed displacement and acceleration arrays
-
-  use specfem_par
-  use specfem_par_acoustic
-  use specfem_par_elastic
-  use specfem_par_poroelastic
-
-  implicit none
-
-  integer,intent(in) :: iphase
-
-  select case(NGLLX)
-
-  case (5)
-    call compute_forces_elastic_Dev_5p(iphase, NSPEC_AB,NGLOB_AB, &
-                  b_displ,b_veloc,b_accel, &
-                  xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                  hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-                  wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                  kappastore,mustore,jacobian,ibool, &
-                  ATTENUATION,deltat, &
-                  one_minus_sum_beta,factor_common, &
-                  b_alphaval,b_betaval,b_gammaval, &
-                  NSPEC_ATTENUATION_AB, &
-                  b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
-                  b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
-                  b_epsilondev_xz,b_epsilondev_yz,b_epsilon_trace_over_3, &
-                  ANISOTROPY,NSPEC_ANISO, &
-                  c11store,c12store,c13store,c14store,c15store,c16store,&
-                  c22store,c23store,c24store,c25store,c26store,c33store,&
-                  c34store,c35store,c36store,c44store,c45store,c46store,&
-                  c55store,c56store,c66store, &
-                  SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY,&
-                  NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
-                  is_moho_top,is_moho_bot, &
-                  b_dsdx_top,b_dsdx_bot, &
-                  ispec2D_moho_top,ispec2D_moho_bot, &
-                  num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                  phase_ispec_inner_elastic )
-
-  case default
-
-    stop 'error no Deville routine available for chosen NGLLX'
-
-  end select
-
-
-end subroutine compute_forces_elastic_Dev_sim3
-

Deleted: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_noDev.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_noDev.f90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_noDev.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -1,825 +0,0 @@
-!=====================================================================
-!
-!               S p e c f e m 3 D  V e r s i o n  2 . 1
-!               ---------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and CNRS / INRIA / University of Pau
-! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
-!                             July 2012
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-subroutine compute_forces_elastic_noDev( iphase, &
-                        NSPEC_AB,NGLOB_AB,displ,veloc,accel, &
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        hprime_xx,hprime_yy,hprime_zz, &
-                        hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,&
-                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                        kappastore,mustore,jacobian,ibool, &
-                        ATTENUATION,deltat,PML_CONDITIONS, &
-                        nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax,NSPEC2D_BOTTOM,NSPEC2D_TOP, &
-                        ibelm_xmin,ibelm_xmax,ibelm_ymin,ibelm_ymax,ibelm_bottom,ibelm_top, &
-                        one_minus_sum_beta,factor_common, &
-                        alphaval,betaval,gammaval, &
-                        NSPEC_ATTENUATION_AB, &
-                        R_xx,R_yy,R_xy,R_xz,R_yz, &
-                        epsilondev_xx,epsilondev_yy,epsilondev_xy,&
-                        epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                        ANISOTROPY,NSPEC_ANISO, &
-                        c11store,c12store,c13store,c14store,c15store,c16store, &
-                        c22store,c23store,c24store,c25store,c26store,c33store, &
-                        c34store,c35store,c36store,c44store,c45store,c46store, &
-                        c55store,c56store,c66store, &
-                        SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                        NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                        is_moho_top,is_moho_bot, &
-                        dsdx_top,dsdx_bot, &
-                        ispec2D_moho_top,ispec2D_moho_bot, &
-                        num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic, &
-                        phase_ispec_inner_elastic)
-
-  use constants, only: NGLLX,NGLLY,NGLLZ,NDIM,N_SLS,SAVE_MOHO_MESH,ONE_THIRD,FOUR_THIRDS
-  use pml_par
-  use fault_solver_dynamic, only : Kelvin_Voigt_eta
-
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement, velocity and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,veloc,accel
-
-! time step
-  real(kind=CUSTOM_REAL) :: deltat
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
-  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! communication overlap
-!  logical, dimension(NSPEC_AB) :: ispec_is_inner
-!  logical :: phase_is_inner
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-       R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-! New dloc = displ + Kelvin Voigt damping*veloc
- real(kind=CUSTOM_REAL), dimension(3,NGLLX,NGLLY,NGLLZ) :: dloc
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-       dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! C-PML absorbing boundary conditions
-  logical :: PML_CONDITIONS
-  integer :: nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax,NSPEC2D_BOTTOM,NSPEC2D_TOP
-  integer, dimension(nspec2D_xmin) :: ibelm_xmin
-  integer, dimension(nspec2D_xmax) :: ibelm_xmax
-  integer, dimension(nspec2D_ymin) :: ibelm_ymin
-  integer, dimension(nspec2D_ymax) :: ibelm_ymax
-  integer, dimension(NSPEC2D_BOTTOM) :: ibelm_bottom
-  integer, dimension(NSPEC2D_TOP) :: ibelm_top
-
-! local parameters
-  integer :: i_SLS
-  integer :: ispec,ispec2D,iglob,ispec_p,num_elements
-  integer :: i,j,k,l
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
-       tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) :: duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) :: duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) :: duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) :: sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) :: hp1,hp2,hp3
-  real(kind=CUSTOM_REAL) :: fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) :: tempx1l,tempx2l,tempx3l
-  real(kind=CUSTOM_REAL) :: tempy1l,tempy2l,tempy3l
-  real(kind=CUSTOM_REAL) :: tempz1l,tempz2l,tempz3l
-
-  real(kind=CUSTOM_REAL) :: lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) :: kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) :: c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) :: R_xx_val,R_yy_val
-  real(kind=CUSTOM_REAL) :: factor_loc,alphaval_loc,betaval_loc,gammaval_loc,Sn,Snp1
-  real(kind=CUSTOM_REAL) :: templ
-
-  real(kind=CUSTOM_REAL) :: tempx1l_new,tempx2l_new,tempx3l_new
-  real(kind=CUSTOM_REAL) :: tempy1l_new,tempy2l_new,tempy3l_new
-  real(kind=CUSTOM_REAL) :: tempz1l_new,tempz2l_new,tempz3l_new
-
-  real(kind=CUSTOM_REAL) :: duxdxl_new,duxdyl_new,duxdzl_new,duydxl_new
-  real(kind=CUSTOM_REAL) :: duydyl_new,duydzl_new,duzdxl_new,duzdyl_new,duzdzl_new;
-  real(kind=CUSTOM_REAL) :: duxdyl_plus_duydxl_new,duzdxl_plus_duxdzl_new,duzdyl_plus_duydzl_new;
-
-  real(kind=CUSTOM_REAL) :: eta
-
-  ! local C-PML absorbing boundary conditions parameters
-  integer :: ispec_CPML
-
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-     ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-     ! adjoint simulations: moho kernel
-     ! note: call this only once
-     if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
-        if (is_moho_top(ispec)) then
-           ispec2D_moho_top = ispec2D_moho_top + 1
-        else if (is_moho_bot(ispec)) then
-           ispec2D_moho_bot = ispec2D_moho_bot + 1
-        endif
-     endif
-
-     ! Kelvin Voigt damping: artificial viscosity around dynamic faults
-     if (allocated(Kelvin_Voigt_eta)) then
-        eta = Kelvin_Voigt_eta(ispec)   
-        do k=1,NGLLZ
-           do j=1,NGLLY
-              do i=1,NGLLX
-                 iglob = ibool(i,j,k,ispec)
-                 dloc(:,i,j,k) = displ(:,iglob) + eta*veloc(:,iglob)
-              enddo
-           enddo
-        enddo
-
-     else
-        do k=1,NGLLZ
-           do j=1,NGLLY
-              do i=1,NGLLX
-                 iglob = ibool(i,j,k,ispec)
-                 dloc(:,i,j,k) = displ(:,iglob) 
-              enddo
-           enddo
-        enddo
-     endif
-
-     do k=1,NGLLZ
-        do j=1,NGLLY
-          do i=1,NGLLX
-
-          tempx1l = 0._CUSTOM_REAL
-          tempx2l = 0._CUSTOM_REAL
-          tempx3l = 0._CUSTOM_REAL
-
-          tempy1l = 0._CUSTOM_REAL
-          tempy2l = 0._CUSTOM_REAL
-          tempy3l = 0._CUSTOM_REAL
-
-          tempz1l = 0._CUSTOM_REAL
-          tempz2l = 0._CUSTOM_REAL
-          tempz3l = 0._CUSTOM_REAL
-
-          do l=1,NGLLX
-            hp1 = hprime_xx(i,l)
-            tempx1l = tempx1l + dloc(1,l,j,k)*hp1
-            tempy1l = tempy1l + dloc(2,l,j,k)*hp1
-            tempz1l = tempz1l + dloc(3,l,j,k)*hp1
-
-            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
-            hp2 = hprime_yy(j,l)
-            tempx2l = tempx2l + dloc(1,i,l,k)*hp2
-            tempy2l = tempy2l + dloc(2,i,l,k)*hp2
-            tempz2l = tempz2l + dloc(3,i,l,k)*hp2
-
-            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
-            hp3 = hprime_zz(k,l)
-            tempx3l = tempx3l + dloc(1,i,j,l)*hp3
-            tempy3l = tempy3l + dloc(2,i,j,l)*hp3
-            tempz3l = tempz3l + dloc(3,i,j,l)*hp3
-          enddo
-
-          if( (ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) .or. &
-               (PML_CONDITIONS .and. CPML_mask_ibool(ispec)) ) then
-             tempx1l_new = tempx1l
-             tempx2l_new = tempx2l
-             tempx3l_new = tempx3l
-
-             tempy1l_new = tempy1l
-             tempy2l_new = tempy2l
-             tempy3l_new = tempy3l
-
-             tempz1l_new = tempz1l
-             tempz2l_new = tempz2l
-             tempz3l_new = tempz3l
-
-             ! use first order Taylor expansion of displacement for local storage of stresses 
-             ! at this current time step, to fix attenuation in a consistent way
-             do l=1,NGLLX
-                hp1 = hprime_xx(i,l)
-                iglob = ibool(l,j,k,ispec)
-                tempx1l_new = tempx1l_new + deltat*veloc(1,iglob)*hp1
-                tempy1l_new = tempy1l_new + deltat*veloc(2,iglob)*hp1
-                tempz1l_new = tempz1l_new + deltat*veloc(3,iglob)*hp1
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-                hp2 = hprime_yy(j,l)
-                iglob = ibool(i,l,k,ispec)
-                tempx2l_new = tempx2l_new + deltat*veloc(1,iglob)*hp2
-                tempy2l_new = tempy2l_new + deltat*veloc(2,iglob)*hp2
-                tempz2l_new = tempz2l_new + deltat*veloc(3,iglob)*hp2
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-                hp3 = hprime_zz(k,l)
-                iglob = ibool(i,j,l,ispec)
-                tempx3l_new = tempx3l_new + deltat*veloc(1,iglob)*hp3
-                tempy3l_new = tempy3l_new + deltat*veloc(2,iglob)*hp3
-                tempz3l_new = tempz3l_new + deltat*veloc(3,iglob)*hp3
-             enddo
-          endif
-
-          ! get derivatives of ux, uy and uz with respect to x, y and z
-          xixl = xix(i,j,k,ispec)
-          xiyl = xiy(i,j,k,ispec)
-          xizl = xiz(i,j,k,ispec)
-          etaxl = etax(i,j,k,ispec)
-          etayl = etay(i,j,k,ispec)
-          etazl = etaz(i,j,k,ispec)
-          gammaxl = gammax(i,j,k,ispec)
-          gammayl = gammay(i,j,k,ispec)
-          gammazl = gammaz(i,j,k,ispec)
-          jacobianl = jacobian(i,j,k,ispec)
-
-          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
-
-          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
-          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
-          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
-
-          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
-          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
-          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
-
-          ! stores derivatives of ux, uy and uz with respect to x, y and z
-          if( PML_CONDITIONS .and. CPML_mask_ibool(ispec) ) then
-             ispec_CPML = spec_to_CPML(ispec)
-
-             PML_dux_dxl(i,j,k,ispec_CPML) = duxdxl
-             PML_dux_dyl(i,j,k,ispec_CPML) = duxdyl
-             PML_dux_dzl(i,j,k,ispec_CPML) = duxdzl
-
-             PML_duy_dxl(i,j,k,ispec_CPML) = duydxl
-             PML_duy_dyl(i,j,k,ispec_CPML) = duydyl
-             PML_duy_dzl(i,j,k,ispec_CPML) = duydzl
-
-             PML_duz_dxl(i,j,k,ispec_CPML) = duzdxl
-             PML_duz_dyl(i,j,k,ispec_CPML) = duzdyl
-             PML_duz_dzl(i,j,k,ispec_CPML) = duzdzl
-          endif
-
-          ! adjoint simulations: save strain on the Moho boundary
-          if (SAVE_MOHO_MESH ) then
-            if (is_moho_top(ispec)) then
-              dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-              dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-              dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-              dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-              dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-              dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-              dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-              dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-              dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-            else if (is_moho_bot(ispec)) then
-              dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-              dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-              dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-              dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-              dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-              dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-              dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-              dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-              dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-            endif
-          endif
-
-          ! precompute some sums to save CPU time
-          duxdxl_plus_duydyl = duxdxl + duydyl
-          duxdxl_plus_duzdzl = duxdxl + duzdzl
-          duydyl_plus_duzdzl = duydyl + duzdzl
-          duxdyl_plus_duydxl = duxdyl + duydxl
-          duzdxl_plus_duxdzl = duzdxl + duxdzl
-          duzdyl_plus_duydzl = duzdyl + duydzl
-
-          if( (ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) .or. &
-               (PML_CONDITIONS .and. CPML_mask_ibool(ispec)) ) then
-             ! temporary variables used for fixing attenuation in a consistent way
-             duxdxl_new = xixl*tempx1l_new + etaxl*tempx2l_new + gammaxl*tempx3l_new
-             duxdyl_new = xiyl*tempx1l_new + etayl*tempx2l_new + gammayl*tempx3l_new
-             duxdzl_new = xizl*tempx1l_new + etazl*tempx2l_new + gammazl*tempx3l_new
-
-             duydxl_new = xixl*tempy1l_new + etaxl*tempy2l_new + gammaxl*tempy3l_new
-             duydyl_new = xiyl*tempy1l_new + etayl*tempy2l_new + gammayl*tempy3l_new
-             duydzl_new = xizl*tempy1l_new + etazl*tempy2l_new + gammazl*tempy3l_new
-
-             duzdxl_new = xixl*tempz1l_new + etaxl*tempz2l_new + gammaxl*tempz3l_new
-             duzdyl_new = xiyl*tempz1l_new + etayl*tempz2l_new + gammayl*tempz3l_new
-             duzdzl_new = xizl*tempz1l_new + etazl*tempz2l_new + gammazl*tempz3l_new
-
-             if( ATTENUATION .and. COMPUTE_AND_STORE_STRAIN ) then
-                ! precompute some sums to save CPU time
-                duxdyl_plus_duydxl_new = duxdyl_new + duydxl_new
-                duzdxl_plus_duxdzl_new = duzdxl_new + duxdzl_new
-                duzdyl_plus_duydzl_new = duzdyl_new + duydzl_new
-
-                ! compute deviatoric strain
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = ONE_THIRD * (duxdxl_new + duydyl_new + duzdzl_new)
-                epsilondev_xx_loc(i,j,k) = duxdxl_new - epsilon_trace_over_3(i,j,k,ispec)
-                epsilondev_yy_loc(i,j,k) = duydyl_new - epsilon_trace_over_3(i,j,k,ispec)
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl_new
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl_new
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl_new
-             endif
-
-             if( PML_CONDITIONS .and. CPML_mask_ibool(ispec) ) then
-                PML_dux_dxl_new(i,j,k,ispec_CPML) = duxdxl_new
-                PML_dux_dyl_new(i,j,k,ispec_CPML) = duxdyl_new
-                PML_dux_dzl_new(i,j,k,ispec_CPML) = duxdzl_new
-
-                PML_duy_dxl_new(i,j,k,ispec_CPML) = duydxl_new
-                PML_duy_dyl_new(i,j,k,ispec_CPML) = duydyl_new
-                PML_duy_dzl_new(i,j,k,ispec_CPML) = duydzl_new
-
-                PML_duz_dxl_new(i,j,k,ispec_CPML) = duzdxl_new
-                PML_duz_dyl_new(i,j,k,ispec_CPML) = duzdyl_new
-                PML_duz_dzl_new(i,j,k,ispec_CPML) = duzdzl_new 
-             endif
-
-          elseif( .not.(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) ) then
-
-             ! computes deviatoric strain attenuation and/or for kernel calculations
-             if (COMPUTE_AND_STORE_STRAIN) then
-                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                epsilondev_yy_loc(i,j,k) = duydyl - templ
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-             endif
-          endif
-
-          kappal = kappastore(i,j,k,ispec)
-          mul = mustore(i,j,k,ispec)
-
-          if(ATTENUATION) then
-            ! use unrelaxed parameters if attenuation
-            mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-          endif
-
-          ! full anisotropic case, stress calculations
-          if(ANISOTROPY) then
-            c11 = c11store(i,j,k,ispec)
-            c12 = c12store(i,j,k,ispec)
-            c13 = c13store(i,j,k,ispec)
-            c14 = c14store(i,j,k,ispec)
-            c15 = c15store(i,j,k,ispec)
-            c16 = c16store(i,j,k,ispec)
-            c22 = c22store(i,j,k,ispec)
-            c23 = c23store(i,j,k,ispec)
-            c24 = c24store(i,j,k,ispec)
-            c25 = c25store(i,j,k,ispec)
-            c26 = c26store(i,j,k,ispec)
-            c33 = c33store(i,j,k,ispec)
-            c34 = c34store(i,j,k,ispec)
-            c35 = c35store(i,j,k,ispec)
-            c36 = c36store(i,j,k,ispec)
-            c44 = c44store(i,j,k,ispec)
-            c45 = c45store(i,j,k,ispec)
-            c46 = c46store(i,j,k,ispec)
-            c55 = c55store(i,j,k,ispec)
-            c56 = c56store(i,j,k,ispec)
-            c66 = c66store(i,j,k,ispec)
-
-            sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                      c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-            sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                      c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-            sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                      c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-            sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                      c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-            sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                      c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-            sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                      c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-          else
-
-            ! isotropic case
-            lambdalplus2mul = kappal + FOUR_THIRDS * mul
-            lambdal = lambdalplus2mul - 2.*mul
-
-            ! compute stress sigma
-            sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-            sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-            sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-            sigma_xy = mul*duxdyl_plus_duydxl
-            sigma_xz = mul*duzdxl_plus_duxdzl
-            sigma_yz = mul*duzdyl_plus_duydzl
-
-          endif ! ANISOTROPY
-
-          ! subtract memory variables if attenuation
-          if(ATTENUATION) then
-             do i_sls = 1,N_SLS
-                R_xx_val = R_xx(i,j,k,ispec,i_sls)
-                R_yy_val = R_yy(i,j,k,ispec,i_sls)
-                sigma_xx = sigma_xx - R_xx_val
-                sigma_yy = sigma_yy - R_yy_val
-                sigma_zz = sigma_zz + R_xx_val + R_yy_val
-                sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-             enddo
-          endif
-
-          if( .not. PML_CONDITIONS ) then 
-             ! define symmetric components of sigma
-             sigma_yx = sigma_xy
-             sigma_zx = sigma_xz
-             sigma_zy = sigma_yz
-
-             ! form dot product with test vector, non-symmetric form
-             tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-             tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-             tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-             tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-             tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-             tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-             tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-             tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-             tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-          endif
-
-        enddo
-      enddo
-    enddo
-
-    if( PML_CONDITIONS .and. CPML_mask_ibool(ispec) ) then
-       ! sets C-PML elastic memory variables to compute stress sigma and form dot product with test vector
-       call pml_set_memory_variables(ispec,ispec_CPML,deltat,jacobianl,tempx1,tempy1,tempz1,tempx2,tempy2,tempz2, &
-                                    tempx3,tempy3,tempz3,sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz, &
-                                    sigma_yx,sigma_zx,sigma_zy,lambdal,mul,lambdalplus2mul,xixl,xiyl,xizl, &
-                                    etaxl,etayl,etazl,gammaxl,gammayl,gammazl)
-
-       ! calculates contribution from each C-PML element to update acceleration
-       call pml_set_accel_contribution(ispec,ispec_CPML,deltat,jacobianl,accel_elastic_CPML)
-    endif
-
-    ! second double-loop over GLL to compute all the terms
-    do k=1,NGLLZ
-       do j=1,NGLLY
-          do i=1,NGLLX
-
-          tempx1l = 0._CUSTOM_REAL
-          tempy1l = 0._CUSTOM_REAL
-          tempz1l = 0._CUSTOM_REAL
-
-          tempx2l = 0._CUSTOM_REAL
-          tempy2l = 0._CUSTOM_REAL
-          tempz2l = 0._CUSTOM_REAL
-
-          tempx3l = 0._CUSTOM_REAL
-          tempy3l = 0._CUSTOM_REAL
-          tempz3l = 0._CUSTOM_REAL
-
-          do l=1,NGLLX
-            fac1 = hprimewgll_xx(l,i)
-            tempx1l = tempx1l + tempx1(l,j,k)*fac1
-            tempy1l = tempy1l + tempy1(l,j,k)*fac1
-            tempz1l = tempz1l + tempz1(l,j,k)*fac1
-
-            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
-            fac2 = hprimewgll_yy(l,j)
-            tempx2l = tempx2l + tempx2(i,l,k)*fac2
-            tempy2l = tempy2l + tempy2(i,l,k)*fac2
-            tempz2l = tempz2l + tempz2(i,l,k)*fac2
-
-            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
-            fac3 = hprimewgll_zz(l,k)
-            tempx3l = tempx3l + tempx3(i,j,l)*fac3
-            tempy3l = tempy3l + tempy3(i,j,l)*fac3
-            tempz3l = tempz3l + tempz3(i,j,l)*fac3
-          enddo
-
-          fac1 = wgllwgll_yz(j,k)
-          fac2 = wgllwgll_xz(i,k)
-          fac3 = wgllwgll_xy(i,j)
-
-          ! sum contributions from each element to the global mesh
-          iglob = ibool(i,j,k,ispec)
-
-          accel(1,iglob) = accel(1,iglob) - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
-          accel(2,iglob) = accel(2,iglob) - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
-          accel(3,iglob) = accel(3,iglob) - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
-
-          ! updates acceleration with contribution from each C-PML element
-          if( PML_CONDITIONS .and. CPML_mask_ibool(ispec) ) then
-             accel(1,iglob) = accel(1,iglob) - accel_elastic_CPML(1,i,j,k,ispec_CPML)
-             accel(2,iglob) = accel(2,iglob) - accel_elastic_CPML(2,i,j,k,ispec_CPML)
-             accel(3,iglob) = accel(3,iglob) - accel_elastic_CPML(3,i,j,k,ispec_CPML)
-          endif
-
-          !  update memory variables based upon the Runge-Kutta scheme
-          if(ATTENUATION) then
-
-             ! use Runge-Kutta scheme to march in time
-             do i_sls = 1,N_SLS
-
-                factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                alphaval_loc = alphaval(i_sls)
-                betaval_loc = betaval(i_sls)
-                gammaval_loc = gammaval(i_sls)
-
-                ! term in xx
-                Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                  betaval_loc * Sn + gammaval_loc * Snp1
-
-                ! term in yy
-                Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                  betaval_loc * Sn + gammaval_loc * Snp1
-
-                ! term in zz not computed since zero trace
-
-                ! term in xy
-                Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                  betaval_loc * Sn + gammaval_loc * Snp1
-
-                ! term in xz
-                Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                  betaval_loc * Sn + gammaval_loc * Snp1
-
-                ! term in yz
-                Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                  betaval_loc * Sn + gammaval_loc * Snp1
-
-             enddo   ! end of loop on memory variables
-
-          endif  !  end attenuation
-
-        enddo
-      enddo
-    enddo
-
-    ! save deviatoric strain for Runge-Kutta scheme
-    if ( COMPUTE_AND_STORE_STRAIN ) then
-      epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-      epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-      epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-      epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-      epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-    endif
-
-  enddo  ! spectral element loop
-  
-  ! C-PML boundary 
-  if( PML_CONDITIONS ) then
-     ! xmin
-     do ispec2D=1,nspec2D_xmin
-        ispec = ibelm_xmin(ispec2D)
-
-        i = 1
-
-        do k=1,NGLLZ
-           do j=1,NGLLY
-              iglob = ibool(i,j,k,ispec)
-
-              accel(1,iglob) = 0._CUSTOM_REAL
-              accel(2,iglob) = 0._CUSTOM_REAL
-              accel(3,iglob) = 0._CUSTOM_REAL
-
-              veloc(1,iglob) = 0._CUSTOM_REAL
-              veloc(2,iglob) = 0._CUSTOM_REAL
-              veloc(3,iglob) = 0._CUSTOM_REAL
-
-              displ(1,iglob) = 0._CUSTOM_REAL
-              displ(2,iglob) = 0._CUSTOM_REAL
-              displ(3,iglob) = 0._CUSTOM_REAL
-           enddo
-        enddo
-     enddo
-
-     ! xmax
-     do ispec2D=1,nspec2D_xmax
-        ispec = ibelm_xmax(ispec2D)
-
-        i = NGLLX
-
-        do k=1,NGLLZ
-           do j=1,NGLLY
-              iglob = ibool(i,j,k,ispec)
-
-              accel(1,iglob) = 0._CUSTOM_REAL
-              accel(2,iglob) = 0._CUSTOM_REAL
-              accel(3,iglob) = 0._CUSTOM_REAL
-
-              veloc(1,iglob) = 0._CUSTOM_REAL
-              veloc(2,iglob) = 0._CUSTOM_REAL
-              veloc(3,iglob) = 0._CUSTOM_REAL
-
-              displ(1,iglob) = 0._CUSTOM_REAL
-              displ(2,iglob) = 0._CUSTOM_REAL
-              displ(3,iglob) = 0._CUSTOM_REAL
-           enddo
-        enddo
-     enddo
-
-     ! ymin
-     do ispec2D=1,nspec2D_ymin
-        ispec = ibelm_ymin(ispec2D)
-
-        j = 1
-
-        do k=1,NGLLZ
-           do i=1,NGLLX
-              iglob = ibool(i,j,k,ispec)
-
-              accel(1,iglob) = 0._CUSTOM_REAL
-              accel(2,iglob) = 0._CUSTOM_REAL
-              accel(3,iglob) = 0._CUSTOM_REAL
-
-              veloc(1,iglob) = 0._CUSTOM_REAL
-              veloc(2,iglob) = 0._CUSTOM_REAL
-              veloc(3,iglob) = 0._CUSTOM_REAL
-
-              displ(1,iglob) = 0._CUSTOM_REAL
-              displ(2,iglob) = 0._CUSTOM_REAL
-              displ(3,iglob) = 0._CUSTOM_REAL
-           enddo
-        enddo
-     enddo
-
-     ! ymax
-     do ispec2D=1,nspec2D_ymax
-        ispec = ibelm_ymax(ispec2D)
-
-        j = NGLLY
-
-        do k=1,NGLLZ
-           do i=1,NGLLX
-              iglob = ibool(i,j,k,ispec)
-
-              accel(1,iglob) = 0._CUSTOM_REAL
-              accel(2,iglob) = 0._CUSTOM_REAL
-              accel(3,iglob) = 0._CUSTOM_REAL
-
-              veloc(1,iglob) = 0._CUSTOM_REAL
-              veloc(2,iglob) = 0._CUSTOM_REAL
-              veloc(3,iglob) = 0._CUSTOM_REAL
-
-              displ(1,iglob) = 0._CUSTOM_REAL
-              displ(2,iglob) = 0._CUSTOM_REAL
-              displ(3,iglob) = 0._CUSTOM_REAL
-           enddo
-        enddo
-     enddo
-
-     ! bottom (zmin)
-     do ispec2D=1,NSPEC2D_BOTTOM
-        ispec = ibelm_bottom(ispec2D)
-
-        k = 1
-
-        do j=1,NGLLY
-           do i=1,NGLLX
-              iglob = ibool(i,j,k,ispec)
-
-              accel(1,iglob) = 0._CUSTOM_REAL
-              accel(2,iglob) = 0._CUSTOM_REAL
-              accel(3,iglob) = 0._CUSTOM_REAL
-
-              veloc(1,iglob) = 0._CUSTOM_REAL
-              veloc(2,iglob) = 0._CUSTOM_REAL
-              veloc(3,iglob) = 0._CUSTOM_REAL
-
-              displ(1,iglob) = 0._CUSTOM_REAL
-              displ(2,iglob) = 0._CUSTOM_REAL
-              displ(3,iglob) = 0._CUSTOM_REAL
-           enddo
-        enddo
-     enddo
-
-     ! top (zmax)
-     do ispec2D=1,NSPEC2D_BOTTOM
-        ispec = ibelm_top(ispec2D)
-
-        k = NGLLZ
-
-        do j=1,NGLLY
-           do i=1,NGLLX
-              iglob = ibool(i,j,k,ispec)
-
-              accel(1,iglob) = 0._CUSTOM_REAL
-              accel(2,iglob) = 0._CUSTOM_REAL
-              accel(3,iglob) = 0._CUSTOM_REAL
-
-              veloc(1,iglob) = 0._CUSTOM_REAL
-              veloc(2,iglob) = 0._CUSTOM_REAL
-              veloc(3,iglob) = 0._CUSTOM_REAL
-
-              displ(1,iglob) = 0._CUSTOM_REAL
-              displ(2,iglob) = 0._CUSTOM_REAL
-              displ(3,iglob) = 0._CUSTOM_REAL
-           enddo
-        enddo
-     enddo
-
-  endif ! if( PML_CONDITIONS )
-
-end subroutine compute_forces_elastic_noDev
-

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic.f90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -190,7 +190,7 @@
       ! adjoint simulations
 ! chris:'adjoint elastic-poroelastic simulation not implemented yet'
 !      if( SIMULATION_TYPE == 3 ) &
-!        call compute_coupling_elastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
+!        call compute_coupling_viscoelastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
 !                        ibool,b_accel,b_potential_dot_dot_acoustic, &
 !                        num_coupling_ac_el_faces, &
 !                        coupling_ac_el_ispec,coupling_ac_el_ijk, &

Copied: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_Dev.f90 (from rev 21238, seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_Dev.f90)
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_Dev.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_Dev.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -0,0 +1,844 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 1
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                             July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+! Deville routine for NGLL == 5 (default)
+
+  subroutine compute_forces_viscoelastic_Dev_5p( iphase ,NSPEC_AB,NGLOB_AB, &
+                                    displ,veloc,accel, &
+                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                    hprime_xx,hprime_xxT, &
+                                    hprimewgll_xx,hprimewgll_xxT, &
+                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                    kappastore,mustore,jacobian,ibool, &
+                                    ATTENUATION,deltat, &
+                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                                    NSPEC_ATTENUATION_AB, &
+                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                                    ANISOTROPY,NSPEC_ANISO, &
+                                    c11store,c12store,c13store,c14store,c15store,c16store,&
+                                    c22store,c23store,c24store,c25store,c26store,c33store,&
+                                    c34store,c35store,c36store,c44store,c45store,c46store,&
+                                    c55store,c56store,c66store, &
+                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                                    is_moho_top,is_moho_bot, &
+                                    dsdx_top,dsdx_bot, &
+                                    ispec2D_moho_top,ispec2D_moho_bot, &
+                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                                    phase_ispec_inner_elastic)
+
+
+! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+                      N_SLS,SAVE_MOHO_MESH, &
+                      ONE_THIRD,FOUR_THIRDS,m1,m2
+  use fault_solver_dynamic, only : Kelvin_Voigt_eta
+
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement, velocity and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,veloc,accel
+
+! time step
+  real(kind=CUSTOM_REAL) :: deltat
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,5) :: hprime_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(5,NGLLX) :: hprime_xxT,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+      R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+    dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(5,5,5) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+  real(kind=CUSTOM_REAL), dimension(5,5,5) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  real(kind=CUSTOM_REAL) duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att
+  real(kind=CUSTOM_REAL) duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(5,25) :: B1_m1_m2_5points,B2_m1_m2_5points,B3_m1_m2_5points
+  real(kind=CUSTOM_REAL), dimension(5,25) :: C1_m1_m2_5points,C2_m1_m2_5points,C3_m1_m2_5points
+  real(kind=CUSTOM_REAL), dimension(5,25) :: E1_m1_m2_5points,E2_m1_m2_5points,E3_m1_m2_5points
+
+  equivalence(dummyx_loc,B1_m1_m2_5points)
+  equivalence(dummyy_loc,B2_m1_m2_5points)
+  equivalence(dummyz_loc,B3_m1_m2_5points)
+  equivalence(tempx1,C1_m1_m2_5points)
+  equivalence(tempy1,C2_m1_m2_5points)
+  equivalence(tempz1,C3_m1_m2_5points)
+  equivalence(newtempx1,E1_m1_m2_5points)
+  equivalence(newtempy1,E2_m1_m2_5points)
+  equivalence(newtempz1,E3_m1_m2_5points)
+
+  real(kind=CUSTOM_REAL), dimension(5,5,5) :: &
+    tempx1_att,tempx2_att,tempx3_att,tempy1_att,tempy2_att,tempy3_att,tempz1_att,tempz2_att,tempz3_att
+  real(kind=CUSTOM_REAL), dimension(5,5,5) :: dummyx_loc_att,dummyy_loc_att,dummyz_loc_att
+  real(kind=CUSTOM_REAL), dimension(5,25) :: B1_m1_m2_5points_att,B2_m1_m2_5points_att,B3_m1_m2_5points_att
+  real(kind=CUSTOM_REAL), dimension(5,25) :: C1_m1_m2_5points_att,C2_m1_m2_5points_att,C3_m1_m2_5points_att
+
+  equivalence(dummyx_loc_att,B1_m1_m2_5points_att)
+  equivalence(dummyy_loc_att,B2_m1_m2_5points_att)
+  equivalence(dummyz_loc_att,B3_m1_m2_5points_att)
+  equivalence(tempx1_att,C1_m1_m2_5points_att)
+  equivalence(tempy1_att,C2_m1_m2_5points_att)
+  equivalence(tempz1_att,C3_m1_m2_5points_att)
+
+  real(kind=CUSTOM_REAL), dimension(25,5) :: &
+    A1_mxm_m2_m1_5points,A2_mxm_m2_m1_5points,A3_mxm_m2_m1_5points
+  real(kind=CUSTOM_REAL), dimension(25,5) :: &
+    C1_mxm_m2_m1_5points,C2_mxm_m2_m1_5points,C3_mxm_m2_m1_5points
+  real(kind=CUSTOM_REAL), dimension(25,5) :: &
+    E1_mxm_m2_m1_5points,E2_mxm_m2_m1_5points,E3_mxm_m2_m1_5points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_5points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_5points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_5points)
+  equivalence(tempx3,C1_mxm_m2_m1_5points)
+  equivalence(tempy3,C2_mxm_m2_m1_5points)
+  equivalence(tempz3,C3_mxm_m2_m1_5points)
+  equivalence(newtempx3,E1_mxm_m2_m1_5points)
+  equivalence(newtempy3,E2_mxm_m2_m1_5points)
+  equivalence(newtempz3,E3_mxm_m2_m1_5points)
+
+  real(kind=CUSTOM_REAL), dimension(25,5) :: &
+    A1_mxm_m2_m1_5points_att,A2_mxm_m2_m1_5points_att,A3_mxm_m2_m1_5points_att
+  real(kind=CUSTOM_REAL), dimension(25,5) :: &
+    C1_mxm_m2_m1_5points_att,C2_mxm_m2_m1_5points_att,C3_mxm_m2_m1_5points_att
+
+  equivalence(dummyx_loc_att,A1_mxm_m2_m1_5points_att)
+  equivalence(dummyy_loc_att,A2_mxm_m2_m1_5points_att)
+  equivalence(dummyz_loc_att,A3_mxm_m2_m1_5points_att)
+  equivalence(tempx3_att,C1_mxm_m2_m1_5points_att)
+  equivalence(tempy3_att,C2_mxm_m2_m1_5points_att)
+  equivalence(tempz3_att,C3_mxm_m2_m1_5points_att)
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+
+  real(kind=CUSTOM_REAL) :: eta
+
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! adjoint simulations: moho kernel
+        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+            ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+            ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+        endif ! adjoint
+
+       ! Kelvin Voigt damping: artificial viscosity around dynamic faults
+
+        ! stores displacment values in local array
+        if (allocated(Kelvin_Voigt_eta)) then
+          eta = Kelvin_Voigt_eta(ispec)   
+          do k=1,NGLLZ
+            do j=1,NGLLY
+              do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob) + eta*veloc(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob) + eta*veloc(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob) + eta*veloc(3,iglob)
+              enddo
+            enddo
+          enddo
+
+        else
+          do k=1,NGLLZ
+            do j=1,NGLLY
+              do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob)
+              enddo
+            enddo
+          enddo
+        endif
+
+        ! use first order Taylor expansion of displacement for local storage of stresses 
+        ! at this current time step, to fix attenuation in a consistent way
+        if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
+           do k=1,NGLLZ
+              do j=1,NGLLY
+                 do i=1,NGLLX
+                    iglob = ibool(i,j,k,ispec)
+                    dummyx_loc_att(i,j,k) = deltat*veloc(1,iglob)
+                    dummyy_loc_att(i,j,k) = deltat*veloc(2,iglob)
+                    dummyz_loc_att(i,j,k) = deltat*veloc(3,iglob)
+                 enddo
+              enddo
+           enddo
+        endif
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_5points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+          do i=1,m1
+            C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
+                                  hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
+                                  hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
+                                  hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
+                                  hprime_xx(i,5)*B1_m1_m2_5points(5,j)
+            C2_m1_m2_5points(i,j) = hprime_xx(i,1)*B2_m1_m2_5points(1,j) + &
+                                  hprime_xx(i,2)*B2_m1_m2_5points(2,j) + &
+                                  hprime_xx(i,3)*B2_m1_m2_5points(3,j) + &
+                                  hprime_xx(i,4)*B2_m1_m2_5points(4,j) + &
+                                  hprime_xx(i,5)*B2_m1_m2_5points(5,j)
+            C3_m1_m2_5points(i,j) = hprime_xx(i,1)*B3_m1_m2_5points(1,j) + &
+                                  hprime_xx(i,2)*B3_m1_m2_5points(2,j) + &
+                                  hprime_xx(i,3)*B3_m1_m2_5points(3,j) + &
+                                  hprime_xx(i,4)*B3_m1_m2_5points(4,j) + &
+                                  hprime_xx(i,5)*B3_m1_m2_5points(5,j)
+          enddo
+        enddo
+
+        if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
+           ! temporary variables used for fixing attenuation in a consistent way
+           do j=1,m2
+              do i=1,m1
+                 C1_m1_m2_5points_att(i,j) = C1_m1_m2_5points(i,j) + & 
+                      hprime_xx(i,1)*B1_m1_m2_5points_att(1,j) + &
+                      hprime_xx(i,2)*B1_m1_m2_5points_att(2,j) + &
+                      hprime_xx(i,3)*B1_m1_m2_5points_att(3,j) + &
+                      hprime_xx(i,4)*B1_m1_m2_5points_att(4,j) + &
+                      hprime_xx(i,5)*B1_m1_m2_5points_att(5,j)
+
+                 C2_m1_m2_5points_att(i,j) = C2_m1_m2_5points(i,j) + &
+                      hprime_xx(i,1)*B2_m1_m2_5points_att(1,j) + &
+                      hprime_xx(i,2)*B2_m1_m2_5points_att(2,j) + &
+                      hprime_xx(i,3)*B2_m1_m2_5points_att(3,j) + &
+                      hprime_xx(i,4)*B2_m1_m2_5points_att(4,j) + &
+                      hprime_xx(i,5)*B2_m1_m2_5points_att(5,j)
+
+                 C3_m1_m2_5points_att(i,j) = C3_m1_m2_5points(i,j) + &
+                      hprime_xx(i,1)*B3_m1_m2_5points_att(1,j) + &
+                      hprime_xx(i,2)*B3_m1_m2_5points_att(2,j) + &
+                      hprime_xx(i,3)*B3_m1_m2_5points_att(3,j) + &
+                      hprime_xx(i,4)*B3_m1_m2_5points_att(4,j) + &
+                      hprime_xx(i,5)*B3_m1_m2_5points_att(5,j)
+              enddo
+           enddo
+        endif
+
+        !   call mxm_m1_m1_5points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+        do j=1,m1
+          do i=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyx_loc(i,5,k)*hprime_xxT(5,j)
+              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyy_loc(i,5,k)*hprime_xxT(5,j)
+              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyz_loc(i,5,k)*hprime_xxT(5,j)
+            enddo
+          enddo
+        enddo
+
+        if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
+           ! temporary variables used for fixing attenuation in a consistent way
+           do j=1,m1
+              do i=1,m1
+                 ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+                 do k = 1,NGLLX
+                    tempx2_att(i,j,k) = tempx2(i,j,k) + &
+                         dummyx_loc_att(i,1,k)*hprime_xxT(1,j) + &
+                         dummyx_loc_att(i,2,k)*hprime_xxT(2,j) + &
+                         dummyx_loc_att(i,3,k)*hprime_xxT(3,j) + &
+                         dummyx_loc_att(i,4,k)*hprime_xxT(4,j) + &
+                         dummyx_loc_att(i,5,k)*hprime_xxT(5,j)
+
+                    tempy2_att(i,j,k) = tempy2(i,j,k) + &
+                         dummyy_loc_att(i,1,k)*hprime_xxT(1,j) + &
+                         dummyy_loc_att(i,2,k)*hprime_xxT(2,j) + &
+                         dummyy_loc_att(i,3,k)*hprime_xxT(3,j) + &
+                         dummyy_loc_att(i,4,k)*hprime_xxT(4,j) + &
+                         dummyy_loc_att(i,5,k)*hprime_xxT(5,j)
+
+                    tempz2_att(i,j,k) = tempz2(i,j,k) + &
+                         dummyz_loc_att(i,1,k)*hprime_xxT(1,j) + &
+                         dummyz_loc_att(i,2,k)*hprime_xxT(2,j) + &
+                         dummyz_loc_att(i,3,k)*hprime_xxT(3,j) + &
+                         dummyz_loc_att(i,4,k)*hprime_xxT(4,j) + &
+                         dummyz_loc_att(i,5,k)*hprime_xxT(5,j)
+                 enddo
+              enddo
+           enddo
+        endif
+
+        ! call mxm_m2_m1_5points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+        do j=1,m1
+          do i=1,m2
+            C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+                                      A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+                                      A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+                                      A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+                                      A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
+            C2_mxm_m2_m1_5points(i,j) = A2_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+                                      A2_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+                                      A2_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+                                      A2_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
+            C3_mxm_m2_m1_5points(i,j) = A3_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+                                      A3_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+                                      A3_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+                                      A3_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+                                      A3_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
+          enddo
+        enddo
+
+        if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
+           ! temporary variables used for fixing attenuation in a consistent way
+           do j=1,m1
+              do i=1,m2
+                 C1_mxm_m2_m1_5points_att(i,j) = C1_mxm_m2_m1_5points(i,j) + &
+                      A1_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
+                      A1_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
+                      A1_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
+                      A1_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
+                      A1_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
+
+                 C2_mxm_m2_m1_5points_att(i,j) = C2_mxm_m2_m1_5points(i,j) + &
+                      A2_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
+                      A2_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
+                      A2_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
+                      A2_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
+                      A2_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
+
+                 C3_mxm_m2_m1_5points_att(i,j) = C3_mxm_m2_m1_5points(i,j) + &
+                      A3_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
+                      A3_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
+                      A3_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
+                      A3_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
+                      A3_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
+              enddo
+           enddo
+        endif
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! save strain on the Moho boundary
+              if (SAVE_MOHO_MESH ) then
+                if (is_moho_top(ispec)) then
+                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                else if (is_moho_bot(ispec)) then
+                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                endif
+              endif
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              if ( ATTENUATION .and. COMPUTE_AND_STORE_STRAIN ) then
+                 ! temporary variables used for fixing attenuation in a consistent way
+                 duxdxl_att = xixl*tempx1_att(i,j,k) + etaxl*tempx2_att(i,j,k) + gammaxl*tempx3_att(i,j,k)
+                 duxdyl_att = xiyl*tempx1_att(i,j,k) + etayl*tempx2_att(i,j,k) + gammayl*tempx3_att(i,j,k)
+                 duxdzl_att = xizl*tempx1_att(i,j,k) + etazl*tempx2_att(i,j,k) + gammazl*tempx3_att(i,j,k)
+
+                 duydxl_att = xixl*tempy1_att(i,j,k) + etaxl*tempy2_att(i,j,k) + gammaxl*tempy3_att(i,j,k)
+                 duydyl_att = xiyl*tempy1_att(i,j,k) + etayl*tempy2_att(i,j,k) + gammayl*tempy3_att(i,j,k)
+                 duydzl_att = xizl*tempy1_att(i,j,k) + etazl*tempy2_att(i,j,k) + gammazl*tempy3_att(i,j,k)
+
+                 duzdxl_att = xixl*tempz1_att(i,j,k) + etaxl*tempz2_att(i,j,k) + gammaxl*tempz3_att(i,j,k)
+                 duzdyl_att = xiyl*tempz1_att(i,j,k) + etayl*tempz2_att(i,j,k) + gammayl*tempz3_att(i,j,k)
+                 duzdzl_att = xizl*tempz1_att(i,j,k) + etazl*tempz2_att(i,j,k) + gammazl*tempz3_att(i,j,k)
+
+                 ! precompute some sums to save CPU time
+                 duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att
+                 duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att
+                 duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att
+
+                 ! compute deviatoric strain
+                 templ = ONE_THIRD * (duxdxl_att + duydyl_att + duzdzl_att)
+                 if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                 epsilondev_xx_loc(i,j,k) = duxdxl_att - templ
+                 epsilondev_yy_loc(i,j,k) = duydyl_att - templ
+                 epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl_att
+                 epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl_att
+                 epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl_att
+              else
+                 ! computes deviatoric strain attenuation and/or for kernel calculations
+                 if (COMPUTE_AND_STORE_STRAIN) then
+                    templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                    if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                    epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                    epsilondev_yy_loc(i,j,k) = duydyl - templ
+                    epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                    epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                    epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+                 endif
+              endif
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+
+              ! attenuation
+              if(ATTENUATION) then
+                ! use unrelaxed parameters if attenuation
+                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+              endif
+
+  ! full anisotropic case, stress calculations
+              if(ANISOTROPY) then
+                c11 = c11store(i,j,k,ispec)
+                c12 = c12store(i,j,k,ispec)
+                c13 = c13store(i,j,k,ispec)
+                c14 = c14store(i,j,k,ispec)
+                c15 = c15store(i,j,k,ispec)
+                c16 = c16store(i,j,k,ispec)
+                c22 = c22store(i,j,k,ispec)
+                c23 = c23store(i,j,k,ispec)
+                c24 = c24store(i,j,k,ispec)
+                c25 = c25store(i,j,k,ispec)
+                c26 = c26store(i,j,k,ispec)
+                c33 = c33store(i,j,k,ispec)
+                c34 = c34store(i,j,k,ispec)
+                c35 = c35store(i,j,k,ispec)
+                c36 = c36store(i,j,k,ispec)
+                c44 = c44store(i,j,k,ispec)
+                c45 = c45store(i,j,k,ispec)
+                c46 = c46store(i,j,k,ispec)
+                c55 = c55store(i,j,k,ispec)
+                c56 = c56store(i,j,k,ispec)
+                c66 = c66store(i,j,k,ispec)
+
+                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+              else
+
+  ! isotropic case
+                lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                lambdal = lambdalplus2mul - 2.*mul
+
+                ! compute stress sigma
+                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                sigma_xy = mul*duxdyl_plus_duydxl
+                sigma_xz = mul*duzdxl_plus_duxdzl
+                sigma_yz = mul*duzdyl_plus_duydzl
+
+              endif ! ANISOTROPY
+
+              ! subtract memory variables if attenuation
+              if(ATTENUATION) then
+! way 1
+!                do i_sls = 1,N_SLS
+!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+!                  sigma_xx = sigma_xx - R_xx_val
+!                  sigma_yy = sigma_yy - R_yy_val
+!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+!                enddo
+
+! way 2
+! note: this should help compilers to pipeline the code and make better use of the cache;
+!          depending on compilers, it can further decrease the computation time by ~ 30%.
+!          by default, N_SLS = 3, therefore we take steps of 3
+              if(imodulo_N_SLS >= 1) then
+                do i_sls = 1,imodulo_N_SLS
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                enddo
+              endif
+
+              if(N_SLS >= imodulo_N_SLS+1) then
+                do i_sls = imodulo_N_SLS+1,N_SLS,3
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                  sigma_xx = sigma_xx - R_xx_val2
+                  sigma_yy = sigma_yy - R_yy_val2
+                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                  sigma_xx = sigma_xx - R_xx_val3
+                  sigma_yy = sigma_yy - R_yy_val3
+                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                enddo
+              endif
+
+
+              endif
+
+            ! define symmetric components of sigma
+            sigma_yx = sigma_xy
+            sigma_zx = sigma_xz
+            sigma_zy = sigma_yz
+
+            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
+            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_5points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+        do j=1,m2
+          do i=1,m1
+            E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_5points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C1_m1_m2_5points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C1_m1_m2_5points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C1_m1_m2_5points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C1_m1_m2_5points(5,j)
+            E2_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_5points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C2_m1_m2_5points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C2_m1_m2_5points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C2_m1_m2_5points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C2_m1_m2_5points(5,j)
+            E3_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_5points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C3_m1_m2_5points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C3_m1_m2_5points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C3_m1_m2_5points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C3_m1_m2_5points(5,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_5points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+        do i=1,m1
+          do j=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempx2(i,5,k)*hprimewgll_xx(5,j)
+              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempy2(i,5,k)*hprimewgll_xx(5,j)
+              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempz2(i,5,k)*hprimewgll_xx(5,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_5points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+        do j=1,m1
+          do i=1,m2
+            E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+                                      C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+                                      C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+                                      C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+                                      C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
+            E2_mxm_m2_m1_5points(i,j) = C2_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+                                      C2_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+                                      C2_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+                                      C2_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+                                      C2_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
+            E3_mxm_m2_m1_5points(i,j) = C3_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+                                      C3_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+                                      C3_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+                                      C3_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+                                      C3_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+
+              fac1 = wgllwgll_yz(j,k)
+              fac2 = wgllwgll_xz(i,k)
+              fac3 = wgllwgll_xy(i,j)
+
+              ! sum contributions from each element to the global mesh using indirect addressing
+              iglob = ibool(i,j,k,ispec)
+              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+
+              !  update memory variables based upon the Runge-Kutta scheme
+              if(ATTENUATION) then
+
+                 ! use Runge-Kutta scheme to march in time
+                 do i_sls = 1,N_SLS
+
+                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                    alphaval_loc = alphaval(i_sls)
+                    betaval_loc = betaval(i_sls)
+                    gammaval_loc = gammaval(i_sls)
+
+                    ! term in xx
+                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yy
+                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in zz not computed since zero trace
+                    ! term in xy
+                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in xz
+                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yz
+                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+
+                 enddo   ! end of loop on memory variables
+
+              endif  !  end attenuation
+
+            enddo
+          enddo
+        enddo
+
+        ! save deviatoric strain for Runge-Kutta scheme
+        if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+        endif
+
+  enddo  ! spectral element loop
+
+  end subroutine compute_forces_viscoelastic_Dev_5p
+

Copied: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90 (from rev 21239, seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_calling_routine.F90)
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -0,0 +1,554 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 1
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                             July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+! elastic solver
+
+subroutine compute_forces_viscoelastic()
+
+  use specfem_par
+  use specfem_par_acoustic
+  use specfem_par_elastic
+  use specfem_par_poroelastic
+  use fault_solver_dynamic, only : bc_dynflt_set3d_all,SIMULATION_TYPE_DYN
+  use fault_solver_kinematic, only : bc_kinflt_set_all,SIMULATION_TYPE_KIN
+
+  implicit none
+
+  integer:: iphase
+  logical:: phase_is_inner
+
+! distinguishes two runs: for points on MPI interfaces, and points within the partitions
+  do iphase=1,2
+
+    !first for points on MPI interfaces
+    if( iphase == 1 ) then
+      phase_is_inner = .false.
+    else
+      phase_is_inner = .true.
+    endif
+
+
+! elastic term
+    if( .NOT. GPU_MODE ) then
+      if(USE_DEVILLE_PRODUCTS) then
+        ! uses Deville (2002) optimizations
+        call compute_forces_viscoelastic_Dev_sim1(iphase)
+
+        ! adjoint simulations: backward/reconstructed wavefield
+        if( SIMULATION_TYPE == 3 ) &
+          call compute_forces_viscoelastic_Dev_sim3(iphase)
+
+      else
+        ! no optimizations used
+        call compute_forces_viscoelastic_noDev( iphase, NSPEC_AB,NGLOB_AB,displ,veloc,accel, &
+                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                        hprime_xx,hprime_yy,hprime_zz, &
+                        hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                        kappastore,mustore,jacobian,ibool, &
+                        ATTENUATION,deltat,PML_CONDITIONS, &
+                        nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax,NSPEC2D_BOTTOM,NSPEC2D_TOP, &
+                        ibelm_xmin,ibelm_xmax,ibelm_ymin,ibelm_ymax,ibelm_bottom,ibelm_top, &
+                        one_minus_sum_beta,factor_common, &
+                        alphaval,betaval,gammaval,&
+                        NSPEC_ATTENUATION_AB, &
+                        R_xx,R_yy,R_xy,R_xz,R_yz, &
+                        epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                        epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                        ANISOTROPY,NSPEC_ANISO, &
+                        c11store,c12store,c13store,c14store,c15store,c16store, &
+                        c22store,c23store,c24store,c25store,c26store,c33store, &
+                        c34store,c35store,c36store,c44store,c45store,c46store, &
+                        c55store,c56store,c66store, &
+                        SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                        NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                        is_moho_top,is_moho_bot, &
+                        dsdx_top,dsdx_bot, &
+                        ispec2D_moho_top,ispec2D_moho_bot, &
+                        num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic, &
+                        phase_ispec_inner_elastic,ispec_is_elastic )
+
+        ! adjoint simulations: backward/reconstructed wavefield
+        if( SIMULATION_TYPE == 3 ) &
+          call compute_forces_viscoelastic_noDev( iphase, NSPEC_AB,NGLOB_AB, &
+                        b_displ,b_veloc,b_accel, &
+                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                        hprime_xx,hprime_yy,hprime_zz, &
+                        hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                        kappastore,mustore,jacobian,ibool, &
+                        ATTENUATION,deltat,PML_CONDITIONS, &
+                        nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax,NSPEC2D_BOTTOM,NSPEC2D_TOP, &
+                        ibelm_xmin,ibelm_xmax,ibelm_ymin,ibelm_ymax,ibelm_bottom,ibelm_top, &
+                        one_minus_sum_beta,factor_common, &
+                        b_alphaval,b_betaval,b_gammaval, &
+                        NSPEC_ATTENUATION_AB, &
+                        b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
+                        b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
+                        b_epsilondev_xz,b_epsilondev_yz,b_epsilon_trace_over_3, &
+                        ANISOTROPY,NSPEC_ANISO, &
+                        c11store,c12store,c13store,c14store,c15store,c16store, &
+                        c22store,c23store,c24store,c25store,c26store,c33store, &
+                        c34store,c35store,c36store,c44store,c45store,c46store, &
+                        c55store,c56store,c66store, &
+                        SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                        NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                        is_moho_top,is_moho_bot, &
+                        b_dsdx_top,b_dsdx_bot, &
+                        ispec2D_moho_top,ispec2D_moho_bot, &
+                        num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic, &
+                        phase_ispec_inner_elastic  )
+
+      endif
+
+    else
+      ! on GPU
+      ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
+      call compute_forces_viscoelastic_cuda(Mesh_pointer, iphase, deltat, &
+                                      nspec_outer_elastic, &
+                                      nspec_inner_elastic, &
+                                      COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
+
+      if(phase_is_inner .eqv. .true.) then
+         ! while Inner elements compute "Kernel_2", we wait for MPI to
+         ! finish and transfer the boundary terms to the device
+         ! asynchronously
+
+         !daniel: todo - this avoids calling the fortran vector send from CUDA routine
+         ! wait for asynchronous copy to finish
+         call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_vector_ext_mesh)
+         ! sends mpi buffers
+         call assemble_MPI_vector_send_cuda(NPROC, &
+                  buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
+                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                  nibool_interfaces_ext_mesh,&
+                  my_neighbours_ext_mesh, &
+                  request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+
+         ! transfers mpi buffers onto GPU
+         call transfer_boundary_to_device(NPROC,Mesh_pointer,buffer_recv_vector_ext_mesh, &
+                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                  request_recv_vector_ext_mesh)
+      endif ! inner elements
+
+   endif ! GPU_MODE
+
+
+! adds elastic absorbing boundary term to acceleration (Stacey conditions)
+    if( ABSORBING_CONDITIONS ) then
+       call compute_stacey_viscoelastic(NSPEC_AB,NGLOB_AB,accel, &
+                        ibool,ispec_is_inner,phase_is_inner, &
+                        abs_boundary_normal,abs_boundary_jacobian2Dw, &
+                        abs_boundary_ijk,abs_boundary_ispec, &
+                        num_abs_boundary_faces, &
+                        veloc,rho_vp,rho_vs, &
+                        ispec_is_elastic,SIMULATION_TYPE,SAVE_FORWARD, &
+                        NSTEP,it,NGLOB_ADJOINT,b_accel, &
+                        b_num_abs_boundary_faces,b_reclen_field,b_absorb_field,&
+                        GPU_MODE,Mesh_pointer)
+    endif
+
+
+! acoustic coupling
+    if( ACOUSTIC_SIMULATION ) then
+      if( num_coupling_ac_el_faces > 0 ) then
+        if( .NOT. GPU_MODE ) then
+          if( SIMULATION_TYPE == 1 ) then
+            ! forward definition: pressure=-potential_dot_dot
+            call compute_coupling_viscoelastic_ac(NSPEC_AB,NGLOB_AB, &
+                        ibool,accel,potential_dot_dot_acoustic, &
+                        num_coupling_ac_el_faces, &
+                        coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                        coupling_ac_el_normal, &
+                        coupling_ac_el_jacobian2Dw, &
+                        ispec_is_inner,phase_is_inner)
+          else
+            ! handles adjoint runs coupling between adjoint potential and adjoint elastic wavefield
+            ! adoint definition: pressure^\dagger=potential^\dagger
+            call compute_coupling_viscoelastic_ac(NSPEC_AB,NGLOB_AB, &
+                              ibool,accel,-potential_acoustic_adj_coupling, &
+                              num_coupling_ac_el_faces, &
+                              coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                              coupling_ac_el_normal, &
+                              coupling_ac_el_jacobian2Dw, &
+                              ispec_is_inner,phase_is_inner)
+          endif
+
+        ! adjoint simulations
+        if( SIMULATION_TYPE == 3 ) &
+          call compute_coupling_viscoelastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
+                        ibool,b_accel,b_potential_dot_dot_acoustic, &
+                        num_coupling_ac_el_faces, &
+                        coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                        coupling_ac_el_normal, &
+                        coupling_ac_el_jacobian2Dw, &
+                        ispec_is_inner,phase_is_inner)
+
+        else
+          ! on GPU
+          call compute_coupling_el_ac_cuda(Mesh_pointer,phase_is_inner, &
+                                           num_coupling_ac_el_faces)
+        endif ! GPU_MODE
+      endif ! num_coupling_ac_el_faces
+    endif
+
+
+! poroelastic coupling
+    if( POROELASTIC_SIMULATION ) then
+      call compute_coupling_viscoelastic_po(NSPEC_AB,NGLOB_AB,ibool,&
+                        displs_poroelastic,displw_poroelastic,&
+                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                        hprime_xx,hprime_yy,hprime_zz,&
+                        kappaarraystore,rhoarraystore,mustore, &
+                        phistore,tortstore,jacobian,&
+                        displ,accel,kappastore, &
+                        ANISOTROPY,NSPEC_ANISO, &
+                        c11store,c12store,c13store,c14store,c15store,c16store,&
+                        c22store,c23store,c24store,c25store,c26store,c33store,&
+                        c34store,c35store,c36store,c44store,c45store,c46store,&
+                        c55store,c56store,c66store, &
+                        SIMULATION_TYPE,NGLOB_ADJOINT,NSPEC_ADJOINT, &
+                        num_coupling_el_po_faces, &
+                        coupling_el_po_ispec,coupling_po_el_ispec, &
+                        coupling_el_po_ijk,coupling_po_el_ijk, &
+                        coupling_el_po_normal, &
+                        coupling_el_po_jacobian2Dw, &
+                        ispec_is_inner,phase_is_inner)
+    endif
+
+! adds source term (single-force/moment-tensor solution)
+    call compute_add_sources_viscoelastic( NSPEC_AB,NGLOB_AB,accel, &
+                        ibool,ispec_is_inner,phase_is_inner, &
+                        NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,&
+                        hdur,hdur_gaussian,tshift_src,dt,t0,sourcearrays, &
+                        ispec_is_elastic,SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, &
+                        nrec,islice_selected_rec,ispec_selected_rec, &
+                        nadj_rec_local,adj_sourcearrays,b_accel, &
+                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY, &
+                        GPU_MODE, Mesh_pointer )
+
+    ! assemble all the contributions between slices using MPI
+    if( phase_is_inner .eqv. .false. ) then
+       ! sends accel values to corresponding MPI interface neighbors
+       if(.NOT. GPU_MODE) then
+          call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,accel, &
+               buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
+               num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+               nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
+               my_neighbours_ext_mesh, &
+               request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+       else ! GPU_MODE==1
+          ! transfers boundary region to host asynchronously. The
+          ! MPI-send is done from within compute_forces_viscoelastic_cuda,
+          ! once the inner element kernels are launched, and the
+          ! memcpy has finished. see compute_forces_viscoelastic_cuda:1655
+          call transfer_boundary_from_device_a(Mesh_pointer,nspec_outer_elastic)
+       endif ! GPU_MODE
+
+       ! adjoint simulations
+       if( SIMULATION_TYPE == 3 ) then
+          if(.NOT. GPU_MODE) then
+             call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_accel, &
+                  b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
+                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                  nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
+                  my_neighbours_ext_mesh, &
+                  b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
+          else ! GPU_MODE == 1
+             call transfer_boun_accel_from_device(NGLOB_AB*NDIM, Mesh_pointer, b_accel,&
+                       b_buffer_send_vector_ext_mesh,&
+                       num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh,&
+                       nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh,3) ! <-- 3 == adjoint b_accel
+             call assemble_MPI_vector_send_cuda(NPROC, &
+                  b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
+                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                  nibool_interfaces_ext_mesh,&
+                  my_neighbours_ext_mesh, &
+                  b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
+          endif ! GPU
+       endif !adjoint
+
+    else
+      ! waits for send/receive requests to be completed and assembles values
+      if(.NOT. GPU_MODE) then
+         call assemble_MPI_vector_ext_mesh_w_ordered(NPROC,NGLOB_AB,accel, &
+                            buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
+                            max_nibool_interfaces_ext_mesh, &
+                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
+                            my_neighbours_ext_mesh,myrank)
+
+      else ! GPU_MODE == 1
+         call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,accel, Mesh_pointer,&
+                            buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
+                            max_nibool_interfaces_ext_mesh, &
+                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh,1)
+      endif
+      ! adjoint simulations
+      if( SIMULATION_TYPE == 3 ) then
+         if(.NOT. GPU_MODE) then
+            call assemble_MPI_vector_ext_mesh_w_ordered(NPROC,NGLOB_ADJOINT,b_accel, &
+                             b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
+                             max_nibool_interfaces_ext_mesh, &
+                             nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                             b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh, &
+                             my_neighbours_ext_mesh,myrank)
+
+         else ! GPU_MODE == 1
+            call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,b_accel, Mesh_pointer,&
+                              b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
+                              max_nibool_interfaces_ext_mesh, &
+                              nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                              b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh,3)
+         endif
+      endif !adjoint
+
+    endif
+
+    !! DK DK May 2009: removed this because now each slice of a CUBIT + SCOTCH mesh
+    !! DK DK May 2009: has a different number of spectral elements and therefore
+    !! DK DK May 2009: only the general non-blocking MPI routines assemble_MPI_vector_ext_mesh_s
+    !! DK DK May 2009: and assemble_MPI_vector_ext_mesh_w above can be used.
+    !! DK DK May 2009: For adjoint runs below (SIMULATION_TYPE == 3) they should be used as well.
+
+ enddo
+
+!Percy , Fault boundary term B*tau is added to the assembled forces 
+!        which at this point are stored in the array 'accel'
+  if (SIMULATION_TYPE_DYN) call bc_dynflt_set3d_all(accel,veloc,displ)
+  
+  if (SIMULATION_TYPE_KIN) call bc_kinflt_set_all(accel,veloc,displ)
+
+ ! multiplies with inverse of mass matrix (note: rmass has been inverted already)
+ if(.NOT. GPU_MODE) then
+    accel(1,:) = accel(1,:)*rmassx(:)
+    accel(2,:) = accel(2,:)*rmassy(:)
+    accel(3,:) = accel(3,:)*rmassz(:)
+    ! adjoint simulations
+    if (SIMULATION_TYPE == 3) then
+       b_accel(1,:) = b_accel(1,:)*rmassx(:)
+       b_accel(2,:) = b_accel(2,:)*rmassy(:)
+       b_accel(3,:) = b_accel(3,:)*rmassz(:)
+    endif !adjoint
+ else ! GPU_MODE == 1
+    call kernel_3_a_cuda(Mesh_pointer, NGLOB_AB, deltatover2,b_deltatover2,OCEANS)
+ endif
+
+! updates acceleration with ocean load term
+  if(OCEANS) then
+    if( .NOT. GPU_MODE ) then
+      call compute_coupling_ocean(NSPEC_AB,NGLOB_AB, &
+                                  ibool,rmassx,rmassy,rmassz, &
+                                  rmass_ocean_load,accel, &
+                                  free_surface_normal,free_surface_ijk,free_surface_ispec, &
+                                  num_free_surface_faces,SIMULATION_TYPE, &
+                                  NGLOB_ADJOINT,b_accel)
+    else
+      ! on GPU
+      call compute_coupling_ocean_cuda(Mesh_pointer)
+    endif
+  endif
+
+! updates velocities
+! Newmark finite-difference time scheme with elastic domains:
+! (see e.g. Hughes, 1987; Chaljub et al., 2003)
+!
+! u(t+delta_t) = u(t) + delta_t  v(t) + 1/2  delta_t**2 a(t)
+! v(t+delta_t) = v(t) + 1/2 delta_t a(t) + 1/2 delta_t a(t+delta_t)
+! a(t+delta_t) = 1/M_elastic ( -K_elastic u(t+delta) + B_elastic chi_dot_dot(t+delta_t) + f( t+delta_t) )
+!
+! where
+!   u, v, a are displacement,velocity & acceleration
+!   M is mass matrix, K stiffness matrix and B boundary term for acoustic/elastic domains
+!   f denotes a source term (acoustic/elastic)
+!   chi_dot_dot is acoustic (fluid) potential ( dotted twice with respect to time)
+!
+! corrector:
+!   updates the velocity term which requires a(t+delta)
+! GPU_MODE: this is handled in 'kernel_3' at the same time as accel*rmass
+  if(.NOT. GPU_MODE) then
+     veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
+     ! adjoint simulations
+     if (SIMULATION_TYPE == 3) b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
+  else ! GPU_MODE == 1
+    if( OCEANS ) call kernel_3_b_cuda(Mesh_pointer, NGLOB_AB, deltatover2,b_deltatover2)
+  endif
+
+
+end subroutine compute_forces_viscoelastic
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! distributes routines according to chosen NGLLX in constants.h
+
+!daniel: note -- i put it here rather than in compute_forces_viscoelastic_Dev.f90 because compiler complains that:
+! " The storage extent of the dummy argument exceeds that of the actual argument. "
+
+subroutine compute_forces_viscoelastic_Dev_sim1(iphase)
+
+! forward simulations
+
+  use specfem_par
+  use specfem_par_acoustic
+  use specfem_par_elastic
+  use specfem_par_poroelastic
+
+  implicit none
+
+  integer,intent(in) :: iphase
+
+  select case(NGLLX)
+
+  case (5)
+
+!----------------------------------------------------------------------------------------------
+
+! OpenMP routine flag for testing & benchmarking forward runs only
+! configure additional flag, e.g.: FLAGS_NO_CHECK="-O3 -DOPENMP_MODE -openmp"
+
+!----------------------------------------------------------------------------------------------
+#ifdef OPENMP_MODE
+!! DK DK Jan 2013: beware, that OpenMP version is not maintained / supported and thus probably does not work
+    call compute_forces_viscoelastic_Dev_openmp(iphase, NSPEC_AB,NGLOB_AB,displ,veloc,accel, &
+           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+           hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
+           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+           kappastore,mustore,jacobian,ibool, &
+           ATTENUATION,deltat, &
+           one_minus_sum_beta,factor_common, &
+           alphaval,betaval,gammaval, &
+           NSPEC_ATTENUATION_AB, &
+           R_xx,R_yy,R_xy,R_xz,R_yz, &
+           epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+           epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+           ANISOTROPY,NSPEC_ANISO, &
+           c11store,c12store,c13store,c14store,c15store,c16store,&
+           c22store,c23store,c24store,c25store,c26store,c33store,&
+           c34store,c35store,c36store,c44store,c45store,c46store,&
+           c55store,c56store,c66store, &
+           SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+           NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
+           is_moho_top,is_moho_bot, &
+           dsdx_top,dsdx_bot, &
+           ispec2D_moho_top,ispec2D_moho_bot, &
+           num_phase_ispec_elastic,&
+           phase_ispec_inner_elastic,&
+           num_colors_outer_elastic,num_colors_inner_elastic)
+#else
+    call compute_forces_viscoelastic_Dev_5p(iphase, NSPEC_AB,NGLOB_AB,displ,veloc,accel, &
+             xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+             hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
+             wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+             kappastore,mustore,jacobian,ibool, &
+             ATTENUATION,deltat, &
+             one_minus_sum_beta,factor_common, &
+             alphaval,betaval,gammaval, &
+             NSPEC_ATTENUATION_AB, &
+             R_xx,R_yy,R_xy,R_xz,R_yz, &
+             epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+             epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+             ANISOTROPY,NSPEC_ANISO, &
+             c11store,c12store,c13store,c14store,c15store,c16store,&
+             c22store,c23store,c24store,c25store,c26store,c33store,&
+             c34store,c35store,c36store,c44store,c45store,c46store,&
+             c55store,c56store,c66store, &
+             SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+             NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
+             is_moho_top,is_moho_bot, &
+             dsdx_top,dsdx_bot, &
+             ispec2D_moho_top,ispec2D_moho_bot, &
+             num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+             phase_ispec_inner_elastic )
+#endif
+
+  case default
+
+    stop 'error no Deville routine available for chosen NGLLX'
+
+  end select
+
+end subroutine compute_forces_viscoelastic_Dev_sim1
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+
+subroutine compute_forces_viscoelastic_Dev_sim3(iphase)
+
+! uses backward/reconstructed displacement and acceleration arrays
+
+  use specfem_par
+  use specfem_par_acoustic
+  use specfem_par_elastic
+  use specfem_par_poroelastic
+
+  implicit none
+
+  integer,intent(in) :: iphase
+
+  select case(NGLLX)
+
+  case (5)
+    call compute_forces_viscoelastic_Dev_5p(iphase, NSPEC_AB,NGLOB_AB, &
+                  b_displ,b_veloc,b_accel, &
+                  xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                  hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
+                  wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                  kappastore,mustore,jacobian,ibool, &
+                  ATTENUATION,deltat, &
+                  one_minus_sum_beta,factor_common, &
+                  b_alphaval,b_betaval,b_gammaval, &
+                  NSPEC_ATTENUATION_AB, &
+                  b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
+                  b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
+                  b_epsilondev_xz,b_epsilondev_yz,b_epsilon_trace_over_3, &
+                  ANISOTROPY,NSPEC_ANISO, &
+                  c11store,c12store,c13store,c14store,c15store,c16store,&
+                  c22store,c23store,c24store,c25store,c26store,c33store,&
+                  c34store,c35store,c36store,c44store,c45store,c46store,&
+                  c55store,c56store,c66store, &
+                  SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY,&
+                  NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
+                  is_moho_top,is_moho_bot, &
+                  b_dsdx_top,b_dsdx_bot, &
+                  ispec2D_moho_top,ispec2D_moho_bot, &
+                  num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                  phase_ispec_inner_elastic )
+
+  case default
+
+    stop 'error no Deville routine available for chosen NGLLX'
+
+  end select
+
+
+end subroutine compute_forces_viscoelastic_Dev_sim3
+

Copied: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_noDev.f90 (from rev 21238, seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic_noDev.f90)
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_noDev.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_noDev.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -0,0 +1,825 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 1
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                             July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+subroutine compute_forces_viscoelastic_noDev( iphase, &
+                        NSPEC_AB,NGLOB_AB,displ,veloc,accel, &
+                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                        hprime_xx,hprime_yy,hprime_zz, &
+                        hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,&
+                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                        kappastore,mustore,jacobian,ibool, &
+                        ATTENUATION,deltat,PML_CONDITIONS, &
+                        nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax,NSPEC2D_BOTTOM,NSPEC2D_TOP, &
+                        ibelm_xmin,ibelm_xmax,ibelm_ymin,ibelm_ymax,ibelm_bottom,ibelm_top, &
+                        one_minus_sum_beta,factor_common, &
+                        alphaval,betaval,gammaval, &
+                        NSPEC_ATTENUATION_AB, &
+                        R_xx,R_yy,R_xy,R_xz,R_yz, &
+                        epsilondev_xx,epsilondev_yy,epsilondev_xy,&
+                        epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                        ANISOTROPY,NSPEC_ANISO, &
+                        c11store,c12store,c13store,c14store,c15store,c16store, &
+                        c22store,c23store,c24store,c25store,c26store,c33store, &
+                        c34store,c35store,c36store,c44store,c45store,c46store, &
+                        c55store,c56store,c66store, &
+                        SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                        NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                        is_moho_top,is_moho_bot, &
+                        dsdx_top,dsdx_bot, &
+                        ispec2D_moho_top,ispec2D_moho_bot, &
+                        num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic, &
+                        phase_ispec_inner_elastic)
+
+  use constants, only: NGLLX,NGLLY,NGLLZ,NDIM,N_SLS,SAVE_MOHO_MESH,ONE_THIRD,FOUR_THIRDS
+  use pml_par
+  use fault_solver_dynamic, only : Kelvin_Voigt_eta
+
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement, velocity and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,veloc,accel
+
+! time step
+  real(kind=CUSTOM_REAL) :: deltat
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
+  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! communication overlap
+!  logical, dimension(NSPEC_AB) :: ispec_is_inner
+!  logical :: phase_is_inner
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+       R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+! New dloc = displ + Kelvin Voigt damping*veloc
+ real(kind=CUSTOM_REAL), dimension(3,NGLLX,NGLLY,NGLLZ) :: dloc
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+       dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! C-PML absorbing boundary conditions
+  logical :: PML_CONDITIONS
+  integer :: nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax,NSPEC2D_BOTTOM,NSPEC2D_TOP
+  integer, dimension(nspec2D_xmin) :: ibelm_xmin
+  integer, dimension(nspec2D_xmax) :: ibelm_xmax
+  integer, dimension(nspec2D_ymin) :: ibelm_ymin
+  integer, dimension(nspec2D_ymax) :: ibelm_ymax
+  integer, dimension(NSPEC2D_BOTTOM) :: ibelm_bottom
+  integer, dimension(NSPEC2D_TOP) :: ibelm_top
+
+! local parameters
+  integer :: i_SLS
+  integer :: ispec,ispec2D,iglob,ispec_p,num_elements
+  integer :: i,j,k,l
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
+       tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) :: duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) :: duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) :: duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) :: sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) :: hp1,hp2,hp3
+  real(kind=CUSTOM_REAL) :: fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) :: tempx1l,tempx2l,tempx3l
+  real(kind=CUSTOM_REAL) :: tempy1l,tempy2l,tempy3l
+  real(kind=CUSTOM_REAL) :: tempz1l,tempz2l,tempz3l
+
+  real(kind=CUSTOM_REAL) :: lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) :: kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) :: c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) :: R_xx_val,R_yy_val
+  real(kind=CUSTOM_REAL) :: factor_loc,alphaval_loc,betaval_loc,gammaval_loc,Sn,Snp1
+  real(kind=CUSTOM_REAL) :: templ
+
+  real(kind=CUSTOM_REAL) :: tempx1l_new,tempx2l_new,tempx3l_new
+  real(kind=CUSTOM_REAL) :: tempy1l_new,tempy2l_new,tempy3l_new
+  real(kind=CUSTOM_REAL) :: tempz1l_new,tempz2l_new,tempz3l_new
+
+  real(kind=CUSTOM_REAL) :: duxdxl_new,duxdyl_new,duxdzl_new,duydxl_new
+  real(kind=CUSTOM_REAL) :: duydyl_new,duydzl_new,duzdxl_new,duzdyl_new,duzdzl_new;
+  real(kind=CUSTOM_REAL) :: duxdyl_plus_duydxl_new,duzdxl_plus_duxdzl_new,duzdyl_plus_duydzl_new;
+
+  real(kind=CUSTOM_REAL) :: eta
+
+  ! local C-PML absorbing boundary conditions parameters
+  integer :: ispec_CPML
+
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+     ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+     ! adjoint simulations: moho kernel
+     ! note: call this only once
+     if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
+        if (is_moho_top(ispec)) then
+           ispec2D_moho_top = ispec2D_moho_top + 1
+        else if (is_moho_bot(ispec)) then
+           ispec2D_moho_bot = ispec2D_moho_bot + 1
+        endif
+     endif
+
+     ! Kelvin Voigt damping: artificial viscosity around dynamic faults
+     if (allocated(Kelvin_Voigt_eta)) then
+        eta = Kelvin_Voigt_eta(ispec)   
+        do k=1,NGLLZ
+           do j=1,NGLLY
+              do i=1,NGLLX
+                 iglob = ibool(i,j,k,ispec)
+                 dloc(:,i,j,k) = displ(:,iglob) + eta*veloc(:,iglob)
+              enddo
+           enddo
+        enddo
+
+     else
+        do k=1,NGLLZ
+           do j=1,NGLLY
+              do i=1,NGLLX
+                 iglob = ibool(i,j,k,ispec)
+                 dloc(:,i,j,k) = displ(:,iglob) 
+              enddo
+           enddo
+        enddo
+     endif
+
+     do k=1,NGLLZ
+        do j=1,NGLLY
+          do i=1,NGLLX
+
+          tempx1l = 0._CUSTOM_REAL
+          tempx2l = 0._CUSTOM_REAL
+          tempx3l = 0._CUSTOM_REAL
+
+          tempy1l = 0._CUSTOM_REAL
+          tempy2l = 0._CUSTOM_REAL
+          tempy3l = 0._CUSTOM_REAL
+
+          tempz1l = 0._CUSTOM_REAL
+          tempz2l = 0._CUSTOM_REAL
+          tempz3l = 0._CUSTOM_REAL
+
+          do l=1,NGLLX
+            hp1 = hprime_xx(i,l)
+            tempx1l = tempx1l + dloc(1,l,j,k)*hp1
+            tempy1l = tempy1l + dloc(2,l,j,k)*hp1
+            tempz1l = tempz1l + dloc(3,l,j,k)*hp1
+
+            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+            hp2 = hprime_yy(j,l)
+            tempx2l = tempx2l + dloc(1,i,l,k)*hp2
+            tempy2l = tempy2l + dloc(2,i,l,k)*hp2
+            tempz2l = tempz2l + dloc(3,i,l,k)*hp2
+
+            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+            hp3 = hprime_zz(k,l)
+            tempx3l = tempx3l + dloc(1,i,j,l)*hp3
+            tempy3l = tempy3l + dloc(2,i,j,l)*hp3
+            tempz3l = tempz3l + dloc(3,i,j,l)*hp3
+          enddo
+
+          if( (ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) .or. &
+               (PML_CONDITIONS .and. CPML_mask_ibool(ispec)) ) then
+             tempx1l_new = tempx1l
+             tempx2l_new = tempx2l
+             tempx3l_new = tempx3l
+
+             tempy1l_new = tempy1l
+             tempy2l_new = tempy2l
+             tempy3l_new = tempy3l
+
+             tempz1l_new = tempz1l
+             tempz2l_new = tempz2l
+             tempz3l_new = tempz3l
+
+             ! use first order Taylor expansion of displacement for local storage of stresses 
+             ! at this current time step, to fix attenuation in a consistent way
+             do l=1,NGLLX
+                hp1 = hprime_xx(i,l)
+                iglob = ibool(l,j,k,ispec)
+                tempx1l_new = tempx1l_new + deltat*veloc(1,iglob)*hp1
+                tempy1l_new = tempy1l_new + deltat*veloc(2,iglob)*hp1
+                tempz1l_new = tempz1l_new + deltat*veloc(3,iglob)*hp1
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+                hp2 = hprime_yy(j,l)
+                iglob = ibool(i,l,k,ispec)
+                tempx2l_new = tempx2l_new + deltat*veloc(1,iglob)*hp2
+                tempy2l_new = tempy2l_new + deltat*veloc(2,iglob)*hp2
+                tempz2l_new = tempz2l_new + deltat*veloc(3,iglob)*hp2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+                hp3 = hprime_zz(k,l)
+                iglob = ibool(i,j,l,ispec)
+                tempx3l_new = tempx3l_new + deltat*veloc(1,iglob)*hp3
+                tempy3l_new = tempy3l_new + deltat*veloc(2,iglob)*hp3
+                tempz3l_new = tempz3l_new + deltat*veloc(3,iglob)*hp3
+             enddo
+          endif
+
+          ! get derivatives of ux, uy and uz with respect to x, y and z
+          xixl = xix(i,j,k,ispec)
+          xiyl = xiy(i,j,k,ispec)
+          xizl = xiz(i,j,k,ispec)
+          etaxl = etax(i,j,k,ispec)
+          etayl = etay(i,j,k,ispec)
+          etazl = etaz(i,j,k,ispec)
+          gammaxl = gammax(i,j,k,ispec)
+          gammayl = gammay(i,j,k,ispec)
+          gammazl = gammaz(i,j,k,ispec)
+          jacobianl = jacobian(i,j,k,ispec)
+
+          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+
+          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
+          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
+          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
+
+          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
+          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
+          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
+
+          ! stores derivatives of ux, uy and uz with respect to x, y and z
+          if( PML_CONDITIONS .and. CPML_mask_ibool(ispec) ) then
+             ispec_CPML = spec_to_CPML(ispec)
+
+             PML_dux_dxl(i,j,k,ispec_CPML) = duxdxl
+             PML_dux_dyl(i,j,k,ispec_CPML) = duxdyl
+             PML_dux_dzl(i,j,k,ispec_CPML) = duxdzl
+
+             PML_duy_dxl(i,j,k,ispec_CPML) = duydxl
+             PML_duy_dyl(i,j,k,ispec_CPML) = duydyl
+             PML_duy_dzl(i,j,k,ispec_CPML) = duydzl
+
+             PML_duz_dxl(i,j,k,ispec_CPML) = duzdxl
+             PML_duz_dyl(i,j,k,ispec_CPML) = duzdyl
+             PML_duz_dzl(i,j,k,ispec_CPML) = duzdzl
+          endif
+
+          ! adjoint simulations: save strain on the Moho boundary
+          if (SAVE_MOHO_MESH ) then
+            if (is_moho_top(ispec)) then
+              dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+              dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+              dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+              dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+              dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+              dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+              dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+              dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+              dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+            else if (is_moho_bot(ispec)) then
+              dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+              dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+              dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+              dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+              dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+              dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+              dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+              dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+              dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+            endif
+          endif
+
+          ! precompute some sums to save CPU time
+          duxdxl_plus_duydyl = duxdxl + duydyl
+          duxdxl_plus_duzdzl = duxdxl + duzdzl
+          duydyl_plus_duzdzl = duydyl + duzdzl
+          duxdyl_plus_duydxl = duxdyl + duydxl
+          duzdxl_plus_duxdzl = duzdxl + duxdzl
+          duzdyl_plus_duydzl = duzdyl + duydzl
+
+          if( (ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) .or. &
+               (PML_CONDITIONS .and. CPML_mask_ibool(ispec)) ) then
+             ! temporary variables used for fixing attenuation in a consistent way
+             duxdxl_new = xixl*tempx1l_new + etaxl*tempx2l_new + gammaxl*tempx3l_new
+             duxdyl_new = xiyl*tempx1l_new + etayl*tempx2l_new + gammayl*tempx3l_new
+             duxdzl_new = xizl*tempx1l_new + etazl*tempx2l_new + gammazl*tempx3l_new
+
+             duydxl_new = xixl*tempy1l_new + etaxl*tempy2l_new + gammaxl*tempy3l_new
+             duydyl_new = xiyl*tempy1l_new + etayl*tempy2l_new + gammayl*tempy3l_new
+             duydzl_new = xizl*tempy1l_new + etazl*tempy2l_new + gammazl*tempy3l_new
+
+             duzdxl_new = xixl*tempz1l_new + etaxl*tempz2l_new + gammaxl*tempz3l_new
+             duzdyl_new = xiyl*tempz1l_new + etayl*tempz2l_new + gammayl*tempz3l_new
+             duzdzl_new = xizl*tempz1l_new + etazl*tempz2l_new + gammazl*tempz3l_new
+
+             if( ATTENUATION .and. COMPUTE_AND_STORE_STRAIN ) then
+                ! precompute some sums to save CPU time
+                duxdyl_plus_duydxl_new = duxdyl_new + duydxl_new
+                duzdxl_plus_duxdzl_new = duzdxl_new + duxdzl_new
+                duzdyl_plus_duydzl_new = duzdyl_new + duydzl_new
+
+                ! compute deviatoric strain
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = ONE_THIRD * (duxdxl_new + duydyl_new + duzdzl_new)
+                epsilondev_xx_loc(i,j,k) = duxdxl_new - epsilon_trace_over_3(i,j,k,ispec)
+                epsilondev_yy_loc(i,j,k) = duydyl_new - epsilon_trace_over_3(i,j,k,ispec)
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl_new
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl_new
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl_new
+             endif
+
+             if( PML_CONDITIONS .and. CPML_mask_ibool(ispec) ) then
+                PML_dux_dxl_new(i,j,k,ispec_CPML) = duxdxl_new
+                PML_dux_dyl_new(i,j,k,ispec_CPML) = duxdyl_new
+                PML_dux_dzl_new(i,j,k,ispec_CPML) = duxdzl_new
+
+                PML_duy_dxl_new(i,j,k,ispec_CPML) = duydxl_new
+                PML_duy_dyl_new(i,j,k,ispec_CPML) = duydyl_new
+                PML_duy_dzl_new(i,j,k,ispec_CPML) = duydzl_new
+
+                PML_duz_dxl_new(i,j,k,ispec_CPML) = duzdxl_new
+                PML_duz_dyl_new(i,j,k,ispec_CPML) = duzdyl_new
+                PML_duz_dzl_new(i,j,k,ispec_CPML) = duzdzl_new 
+             endif
+
+          elseif( .not.(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) ) then
+
+             ! computes deviatoric strain attenuation and/or for kernel calculations
+             if (COMPUTE_AND_STORE_STRAIN) then
+                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                epsilondev_yy_loc(i,j,k) = duydyl - templ
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+             endif
+          endif
+
+          kappal = kappastore(i,j,k,ispec)
+          mul = mustore(i,j,k,ispec)
+
+          if(ATTENUATION) then
+            ! use unrelaxed parameters if attenuation
+            mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+          endif
+
+          ! full anisotropic case, stress calculations
+          if(ANISOTROPY) then
+            c11 = c11store(i,j,k,ispec)
+            c12 = c12store(i,j,k,ispec)
+            c13 = c13store(i,j,k,ispec)
+            c14 = c14store(i,j,k,ispec)
+            c15 = c15store(i,j,k,ispec)
+            c16 = c16store(i,j,k,ispec)
+            c22 = c22store(i,j,k,ispec)
+            c23 = c23store(i,j,k,ispec)
+            c24 = c24store(i,j,k,ispec)
+            c25 = c25store(i,j,k,ispec)
+            c26 = c26store(i,j,k,ispec)
+            c33 = c33store(i,j,k,ispec)
+            c34 = c34store(i,j,k,ispec)
+            c35 = c35store(i,j,k,ispec)
+            c36 = c36store(i,j,k,ispec)
+            c44 = c44store(i,j,k,ispec)
+            c45 = c45store(i,j,k,ispec)
+            c46 = c46store(i,j,k,ispec)
+            c55 = c55store(i,j,k,ispec)
+            c56 = c56store(i,j,k,ispec)
+            c66 = c66store(i,j,k,ispec)
+
+            sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                      c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+            sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                      c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+            sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                      c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+            sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                      c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+            sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                      c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+            sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                      c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+          else
+
+            ! isotropic case
+            lambdalplus2mul = kappal + FOUR_THIRDS * mul
+            lambdal = lambdalplus2mul - 2.*mul
+
+            ! compute stress sigma
+            sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+            sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+            sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+            sigma_xy = mul*duxdyl_plus_duydxl
+            sigma_xz = mul*duzdxl_plus_duxdzl
+            sigma_yz = mul*duzdyl_plus_duydzl
+
+          endif ! ANISOTROPY
+
+          ! subtract memory variables if attenuation
+          if(ATTENUATION) then
+             do i_sls = 1,N_SLS
+                R_xx_val = R_xx(i,j,k,ispec,i_sls)
+                R_yy_val = R_yy(i,j,k,ispec,i_sls)
+                sigma_xx = sigma_xx - R_xx_val
+                sigma_yy = sigma_yy - R_yy_val
+                sigma_zz = sigma_zz + R_xx_val + R_yy_val
+                sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+             enddo
+          endif
+
+          if( .not. PML_CONDITIONS ) then 
+             ! define symmetric components of sigma
+             sigma_yx = sigma_xy
+             sigma_zx = sigma_xz
+             sigma_zy = sigma_yz
+
+             ! form dot product with test vector, non-symmetric form
+             tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+             tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+             tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+             tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+             tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+             tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+             tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+             tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+             tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+          endif
+
+        enddo
+      enddo
+    enddo
+
+    if( PML_CONDITIONS .and. CPML_mask_ibool(ispec) ) then
+       ! sets C-PML elastic memory variables to compute stress sigma and form dot product with test vector
+       call pml_set_memory_variables(ispec,ispec_CPML,deltat,jacobianl,tempx1,tempy1,tempz1,tempx2,tempy2,tempz2, &
+                                    tempx3,tempy3,tempz3,sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz, &
+                                    sigma_yx,sigma_zx,sigma_zy,lambdal,mul,lambdalplus2mul,xixl,xiyl,xizl, &
+                                    etaxl,etayl,etazl,gammaxl,gammayl,gammazl)
+
+       ! calculates contribution from each C-PML element to update acceleration
+       call pml_set_accel_contribution(ispec,ispec_CPML,deltat,jacobianl,accel_elastic_CPML)
+    endif
+
+    ! second double-loop over GLL to compute all the terms
+    do k=1,NGLLZ
+       do j=1,NGLLY
+          do i=1,NGLLX
+
+          tempx1l = 0._CUSTOM_REAL
+          tempy1l = 0._CUSTOM_REAL
+          tempz1l = 0._CUSTOM_REAL
+
+          tempx2l = 0._CUSTOM_REAL
+          tempy2l = 0._CUSTOM_REAL
+          tempz2l = 0._CUSTOM_REAL
+
+          tempx3l = 0._CUSTOM_REAL
+          tempy3l = 0._CUSTOM_REAL
+          tempz3l = 0._CUSTOM_REAL
+
+          do l=1,NGLLX
+            fac1 = hprimewgll_xx(l,i)
+            tempx1l = tempx1l + tempx1(l,j,k)*fac1
+            tempy1l = tempy1l + tempy1(l,j,k)*fac1
+            tempz1l = tempz1l + tempz1(l,j,k)*fac1
+
+            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+            fac2 = hprimewgll_yy(l,j)
+            tempx2l = tempx2l + tempx2(i,l,k)*fac2
+            tempy2l = tempy2l + tempy2(i,l,k)*fac2
+            tempz2l = tempz2l + tempz2(i,l,k)*fac2
+
+            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+            fac3 = hprimewgll_zz(l,k)
+            tempx3l = tempx3l + tempx3(i,j,l)*fac3
+            tempy3l = tempy3l + tempy3(i,j,l)*fac3
+            tempz3l = tempz3l + tempz3(i,j,l)*fac3
+          enddo
+
+          fac1 = wgllwgll_yz(j,k)
+          fac2 = wgllwgll_xz(i,k)
+          fac3 = wgllwgll_xy(i,j)
+
+          ! sum contributions from each element to the global mesh
+          iglob = ibool(i,j,k,ispec)
+
+          accel(1,iglob) = accel(1,iglob) - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
+          accel(2,iglob) = accel(2,iglob) - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
+          accel(3,iglob) = accel(3,iglob) - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
+
+          ! updates acceleration with contribution from each C-PML element
+          if( PML_CONDITIONS .and. CPML_mask_ibool(ispec) ) then
+             accel(1,iglob) = accel(1,iglob) - accel_elastic_CPML(1,i,j,k,ispec_CPML)
+             accel(2,iglob) = accel(2,iglob) - accel_elastic_CPML(2,i,j,k,ispec_CPML)
+             accel(3,iglob) = accel(3,iglob) - accel_elastic_CPML(3,i,j,k,ispec_CPML)
+          endif
+
+          !  update memory variables based upon the Runge-Kutta scheme
+          if(ATTENUATION) then
+
+             ! use Runge-Kutta scheme to march in time
+             do i_sls = 1,N_SLS
+
+                factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                alphaval_loc = alphaval(i_sls)
+                betaval_loc = betaval(i_sls)
+                gammaval_loc = gammaval(i_sls)
+
+                ! term in xx
+                Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                  betaval_loc * Sn + gammaval_loc * Snp1
+
+                ! term in yy
+                Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                  betaval_loc * Sn + gammaval_loc * Snp1
+
+                ! term in zz not computed since zero trace
+
+                ! term in xy
+                Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                  betaval_loc * Sn + gammaval_loc * Snp1
+
+                ! term in xz
+                Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                  betaval_loc * Sn + gammaval_loc * Snp1
+
+                ! term in yz
+                Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                  betaval_loc * Sn + gammaval_loc * Snp1
+
+             enddo   ! end of loop on memory variables
+
+          endif  !  end attenuation
+
+        enddo
+      enddo
+    enddo
+
+    ! save deviatoric strain for Runge-Kutta scheme
+    if ( COMPUTE_AND_STORE_STRAIN ) then
+      epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+      epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+      epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+      epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+      epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+    endif
+
+  enddo  ! spectral element loop
+  
+  ! C-PML boundary 
+  if( PML_CONDITIONS ) then
+     ! xmin
+     do ispec2D=1,nspec2D_xmin
+        ispec = ibelm_xmin(ispec2D)
+
+        i = 1
+
+        do k=1,NGLLZ
+           do j=1,NGLLY
+              iglob = ibool(i,j,k,ispec)
+
+              accel(1,iglob) = 0._CUSTOM_REAL
+              accel(2,iglob) = 0._CUSTOM_REAL
+              accel(3,iglob) = 0._CUSTOM_REAL
+
+              veloc(1,iglob) = 0._CUSTOM_REAL
+              veloc(2,iglob) = 0._CUSTOM_REAL
+              veloc(3,iglob) = 0._CUSTOM_REAL
+
+              displ(1,iglob) = 0._CUSTOM_REAL
+              displ(2,iglob) = 0._CUSTOM_REAL
+              displ(3,iglob) = 0._CUSTOM_REAL
+           enddo
+        enddo
+     enddo
+
+     ! xmax
+     do ispec2D=1,nspec2D_xmax
+        ispec = ibelm_xmax(ispec2D)
+
+        i = NGLLX
+
+        do k=1,NGLLZ
+           do j=1,NGLLY
+              iglob = ibool(i,j,k,ispec)
+
+              accel(1,iglob) = 0._CUSTOM_REAL
+              accel(2,iglob) = 0._CUSTOM_REAL
+              accel(3,iglob) = 0._CUSTOM_REAL
+
+              veloc(1,iglob) = 0._CUSTOM_REAL
+              veloc(2,iglob) = 0._CUSTOM_REAL
+              veloc(3,iglob) = 0._CUSTOM_REAL
+
+              displ(1,iglob) = 0._CUSTOM_REAL
+              displ(2,iglob) = 0._CUSTOM_REAL
+              displ(3,iglob) = 0._CUSTOM_REAL
+           enddo
+        enddo
+     enddo
+
+     ! ymin
+     do ispec2D=1,nspec2D_ymin
+        ispec = ibelm_ymin(ispec2D)
+
+        j = 1
+
+        do k=1,NGLLZ
+           do i=1,NGLLX
+              iglob = ibool(i,j,k,ispec)
+
+              accel(1,iglob) = 0._CUSTOM_REAL
+              accel(2,iglob) = 0._CUSTOM_REAL
+              accel(3,iglob) = 0._CUSTOM_REAL
+
+              veloc(1,iglob) = 0._CUSTOM_REAL
+              veloc(2,iglob) = 0._CUSTOM_REAL
+              veloc(3,iglob) = 0._CUSTOM_REAL
+
+              displ(1,iglob) = 0._CUSTOM_REAL
+              displ(2,iglob) = 0._CUSTOM_REAL
+              displ(3,iglob) = 0._CUSTOM_REAL
+           enddo
+        enddo
+     enddo
+
+     ! ymax
+     do ispec2D=1,nspec2D_ymax
+        ispec = ibelm_ymax(ispec2D)
+
+        j = NGLLY
+
+        do k=1,NGLLZ
+           do i=1,NGLLX
+              iglob = ibool(i,j,k,ispec)
+
+              accel(1,iglob) = 0._CUSTOM_REAL
+              accel(2,iglob) = 0._CUSTOM_REAL
+              accel(3,iglob) = 0._CUSTOM_REAL
+
+              veloc(1,iglob) = 0._CUSTOM_REAL
+              veloc(2,iglob) = 0._CUSTOM_REAL
+              veloc(3,iglob) = 0._CUSTOM_REAL
+
+              displ(1,iglob) = 0._CUSTOM_REAL
+              displ(2,iglob) = 0._CUSTOM_REAL
+              displ(3,iglob) = 0._CUSTOM_REAL
+           enddo
+        enddo
+     enddo
+
+     ! bottom (zmin)
+     do ispec2D=1,NSPEC2D_BOTTOM
+        ispec = ibelm_bottom(ispec2D)
+
+        k = 1
+
+        do j=1,NGLLY
+           do i=1,NGLLX
+              iglob = ibool(i,j,k,ispec)
+
+              accel(1,iglob) = 0._CUSTOM_REAL
+              accel(2,iglob) = 0._CUSTOM_REAL
+              accel(3,iglob) = 0._CUSTOM_REAL
+
+              veloc(1,iglob) = 0._CUSTOM_REAL
+              veloc(2,iglob) = 0._CUSTOM_REAL
+              veloc(3,iglob) = 0._CUSTOM_REAL
+
+              displ(1,iglob) = 0._CUSTOM_REAL
+              displ(2,iglob) = 0._CUSTOM_REAL
+              displ(3,iglob) = 0._CUSTOM_REAL
+           enddo
+        enddo
+     enddo
+
+     ! top (zmax)
+     do ispec2D=1,NSPEC2D_BOTTOM
+        ispec = ibelm_top(ispec2D)
+
+        k = NGLLZ
+
+        do j=1,NGLLY
+           do i=1,NGLLX
+              iglob = ibool(i,j,k,ispec)
+
+              accel(1,iglob) = 0._CUSTOM_REAL
+              accel(2,iglob) = 0._CUSTOM_REAL
+              accel(3,iglob) = 0._CUSTOM_REAL
+
+              veloc(1,iglob) = 0._CUSTOM_REAL
+              veloc(2,iglob) = 0._CUSTOM_REAL
+              veloc(3,iglob) = 0._CUSTOM_REAL
+
+              displ(1,iglob) = 0._CUSTOM_REAL
+              displ(2,iglob) = 0._CUSTOM_REAL
+              displ(3,iglob) = 0._CUSTOM_REAL
+           enddo
+        enddo
+     enddo
+
+  endif ! if( PML_CONDITIONS )
+
+end subroutine compute_forces_viscoelastic_noDev
+

Deleted: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_elastic.f90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_elastic.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -1,184 +0,0 @@
-!=====================================================================
-!
-!               S p e c f e m 3 D  V e r s i o n  2 . 1
-!               ---------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and CNRS / INRIA / University of Pau
-! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
-!                             July 2012
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-! for elastic solver
-
-! absorbing boundary term for elastic media (Stacey conditions)
-
-  subroutine compute_stacey_elastic(NSPEC_AB,NGLOB_AB,accel, &
-                        ibool,ispec_is_inner,phase_is_inner, &
-                        abs_boundary_normal,abs_boundary_jacobian2Dw, &
-                        abs_boundary_ijk,abs_boundary_ispec, &
-                        num_abs_boundary_faces, &
-                        veloc,rho_vp,rho_vs, &
-                        ispec_is_elastic,SIMULATION_TYPE,SAVE_FORWARD, &
-                        NSTEP,it,NGLOB_ADJOINT,b_accel, &
-                        b_num_abs_boundary_faces,b_reclen_field,b_absorb_field, &
-                        GPU_MODE,Mesh_pointer)
-
-  implicit none
-
-  include "constants.h"
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: accel
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-
-! communication overlap
-  logical, dimension(NSPEC_AB) :: ispec_is_inner
-  logical :: phase_is_inner
-
-! Stacey conditions
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: veloc
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: rho_vp,rho_vs
-
-  logical, dimension(NSPEC_AB) :: ispec_is_elastic
-
-! absorbing boundary surface
-  integer :: num_abs_boundary_faces
-  real(kind=CUSTOM_REAL) :: abs_boundary_normal(NDIM,NGLLSQUARE,num_abs_boundary_faces)
-  real(kind=CUSTOM_REAL) :: abs_boundary_jacobian2Dw(NGLLSQUARE,num_abs_boundary_faces)
-  integer :: abs_boundary_ijk(3,NGLLSQUARE,num_abs_boundary_faces)
-  integer :: abs_boundary_ispec(num_abs_boundary_faces)
-
-! adjoint simulations
-  integer:: SIMULATION_TYPE
-  integer:: NSTEP,it,NGLOB_ADJOINT
-  integer:: b_num_abs_boundary_faces,b_reclen_field
-  real(kind=CUSTOM_REAL),dimension(NDIM,NGLLSQUARE,b_num_abs_boundary_faces):: b_absorb_field
-
-  real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel
-  logical:: SAVE_FORWARD
-
-  ! GPU_MODE variables
-  integer(kind=8) :: Mesh_pointer
-  logical :: GPU_MODE
-
-! local parameters
-  real(kind=CUSTOM_REAL) vx,vy,vz,nx,ny,nz,tx,ty,tz,vn,jacobianw
-  integer :: ispec,iglob,i,j,k,iface,igll
-  !integer:: reclen1,reclen2
-
-  ! checks if anything to do
-  if( num_abs_boundary_faces == 0 ) return
-
-! adjoint simulations:
-  if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
-    ! reads in absorbing boundary array when first phase is running
-    if( phase_is_inner .eqv. .false. ) then
-      ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
-      ! uses fortran routine
-      !read(IOABS,rec=NSTEP-it+1) reclen1,b_absorb_field,reclen2
-      !if (reclen1 /= b_reclen_field .or. reclen1 /= reclen2) &
-      !  call exit_mpi(0,'Error reading absorbing contribution b_absorb_field')
-      ! uses c routine for faster reading
-      call read_abs(0,b_absorb_field,b_reclen_field,NSTEP-it+1)
-    endif
-  endif !adjoint
-
-
-  if(.NOT. GPU_MODE) then
-
-     ! absorbs absorbing-boundary surface using Stacey condition (Clayton & Enquist)
-     do iface=1,num_abs_boundary_faces
-
-        ispec = abs_boundary_ispec(iface)
-
-        if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
-
-           if( ispec_is_elastic(ispec) ) then
-
-              ! reference gll points on boundary face
-              do igll = 1,NGLLSQUARE
-
-                 ! gets local indices for GLL point
-                 i = abs_boundary_ijk(1,igll,iface)
-                 j = abs_boundary_ijk(2,igll,iface)
-                 k = abs_boundary_ijk(3,igll,iface)
-
-                 ! gets velocity
-                 iglob=ibool(i,j,k,ispec)
-                 vx=veloc(1,iglob)
-                 vy=veloc(2,iglob)
-                 vz=veloc(3,iglob)
-
-                 ! gets associated normal
-                 nx = abs_boundary_normal(1,igll,iface)
-                 ny = abs_boundary_normal(2,igll,iface)
-                 nz = abs_boundary_normal(3,igll,iface)
-
-                 ! velocity component in normal direction (normal points out of element)
-                 vn = vx*nx + vy*ny + vz*nz
-
-                 ! stacey term: velocity vector component * vp * rho in normal direction + vs * rho component tangential to it
-                 tx = rho_vp(i,j,k,ispec)*vn*nx + rho_vs(i,j,k,ispec)*(vx-vn*nx)
-                 ty = rho_vp(i,j,k,ispec)*vn*ny + rho_vs(i,j,k,ispec)*(vy-vn*ny)
-                 tz = rho_vp(i,j,k,ispec)*vn*nz + rho_vs(i,j,k,ispec)*(vz-vn*nz)
-
-                 ! gets associated, weighted jacobian
-                 jacobianw = abs_boundary_jacobian2Dw(igll,iface)
-
-                 ! adds stacey term (weak form)
-                 accel(1,iglob) = accel(1,iglob) - tx*jacobianw
-                 accel(2,iglob) = accel(2,iglob) - ty*jacobianw
-                 accel(3,iglob) = accel(3,iglob) - tz*jacobianw
-
-                 ! adjoint simulations
-                 if (SIMULATION_TYPE == 3) then
-                    b_accel(:,iglob) = b_accel(:,iglob) - b_absorb_field(:,igll,iface)
-                 else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-                    b_absorb_field(1,igll,iface) = tx*jacobianw
-                    b_absorb_field(2,igll,iface) = ty*jacobianw
-                    b_absorb_field(3,igll,iface) = tz*jacobianw
-                 endif !adjoint
-
-              enddo
-           endif ! ispec_is_elastic
-        endif ! ispec_is_inner
-     enddo
-
-  else
-    ! GPU_MODE == .true.
-    if( num_abs_boundary_faces > 0 ) &
-      call compute_stacey_elastic_cuda(Mesh_pointer,phase_is_inner, &
-                                       SAVE_FORWARD,b_absorb_field)
-  endif
-
-  ! adjoint simulations: stores absorbed wavefield part
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
-    ! writes out absorbing boundary value only when second phase is running
-    if( phase_is_inner .eqv. .true. ) then
-      ! uses fortran routine
-      !write(IOABS,rec=it) b_reclen_field,b_absorb_field,b_reclen_field
-      ! uses c routine
-      call write_abs(0,b_absorb_field,b_reclen_field,it)
-    endif
-  endif
-
-  end subroutine compute_stacey_elastic
-

Copied: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90 (from rev 21238, seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_elastic.f90)
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -0,0 +1,184 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 1
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                             July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+! for elastic solver
+
+! absorbing boundary term for elastic media (Stacey conditions)
+
+  subroutine compute_stacey_viscoelastic(NSPEC_AB,NGLOB_AB,accel, &
+                        ibool,ispec_is_inner,phase_is_inner, &
+                        abs_boundary_normal,abs_boundary_jacobian2Dw, &
+                        abs_boundary_ijk,abs_boundary_ispec, &
+                        num_abs_boundary_faces, &
+                        veloc,rho_vp,rho_vs, &
+                        ispec_is_elastic,SIMULATION_TYPE,SAVE_FORWARD, &
+                        NSTEP,it,NGLOB_ADJOINT,b_accel, &
+                        b_num_abs_boundary_faces,b_reclen_field,b_absorb_field, &
+                        GPU_MODE,Mesh_pointer)
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: accel
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+
+! communication overlap
+  logical, dimension(NSPEC_AB) :: ispec_is_inner
+  logical :: phase_is_inner
+
+! Stacey conditions
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: veloc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: rho_vp,rho_vs
+
+  logical, dimension(NSPEC_AB) :: ispec_is_elastic
+
+! absorbing boundary surface
+  integer :: num_abs_boundary_faces
+  real(kind=CUSTOM_REAL) :: abs_boundary_normal(NDIM,NGLLSQUARE,num_abs_boundary_faces)
+  real(kind=CUSTOM_REAL) :: abs_boundary_jacobian2Dw(NGLLSQUARE,num_abs_boundary_faces)
+  integer :: abs_boundary_ijk(3,NGLLSQUARE,num_abs_boundary_faces)
+  integer :: abs_boundary_ispec(num_abs_boundary_faces)
+
+! adjoint simulations
+  integer:: SIMULATION_TYPE
+  integer:: NSTEP,it,NGLOB_ADJOINT
+  integer:: b_num_abs_boundary_faces,b_reclen_field
+  real(kind=CUSTOM_REAL),dimension(NDIM,NGLLSQUARE,b_num_abs_boundary_faces):: b_absorb_field
+
+  real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel
+  logical:: SAVE_FORWARD
+
+  ! GPU_MODE variables
+  integer(kind=8) :: Mesh_pointer
+  logical :: GPU_MODE
+
+! local parameters
+  real(kind=CUSTOM_REAL) vx,vy,vz,nx,ny,nz,tx,ty,tz,vn,jacobianw
+  integer :: ispec,iglob,i,j,k,iface,igll
+  !integer:: reclen1,reclen2
+
+  ! checks if anything to do
+  if( num_abs_boundary_faces == 0 ) return
+
+! adjoint simulations:
+  if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
+    ! reads in absorbing boundary array when first phase is running
+    if( phase_is_inner .eqv. .false. ) then
+      ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
+      ! uses fortran routine
+      !read(IOABS,rec=NSTEP-it+1) reclen1,b_absorb_field,reclen2
+      !if (reclen1 /= b_reclen_field .or. reclen1 /= reclen2) &
+      !  call exit_mpi(0,'Error reading absorbing contribution b_absorb_field')
+      ! uses c routine for faster reading
+      call read_abs(0,b_absorb_field,b_reclen_field,NSTEP-it+1)
+    endif
+  endif !adjoint
+
+
+  if(.NOT. GPU_MODE) then
+
+     ! absorbs absorbing-boundary surface using Stacey condition (Clayton & Enquist)
+     do iface=1,num_abs_boundary_faces
+
+        ispec = abs_boundary_ispec(iface)
+
+        if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+
+           if( ispec_is_elastic(ispec) ) then
+
+              ! reference gll points on boundary face
+              do igll = 1,NGLLSQUARE
+
+                 ! gets local indices for GLL point
+                 i = abs_boundary_ijk(1,igll,iface)
+                 j = abs_boundary_ijk(2,igll,iface)
+                 k = abs_boundary_ijk(3,igll,iface)
+
+                 ! gets velocity
+                 iglob=ibool(i,j,k,ispec)
+                 vx=veloc(1,iglob)
+                 vy=veloc(2,iglob)
+                 vz=veloc(3,iglob)
+
+                 ! gets associated normal
+                 nx = abs_boundary_normal(1,igll,iface)
+                 ny = abs_boundary_normal(2,igll,iface)
+                 nz = abs_boundary_normal(3,igll,iface)
+
+                 ! velocity component in normal direction (normal points out of element)
+                 vn = vx*nx + vy*ny + vz*nz
+
+                 ! stacey term: velocity vector component * vp * rho in normal direction + vs * rho component tangential to it
+                 tx = rho_vp(i,j,k,ispec)*vn*nx + rho_vs(i,j,k,ispec)*(vx-vn*nx)
+                 ty = rho_vp(i,j,k,ispec)*vn*ny + rho_vs(i,j,k,ispec)*(vy-vn*ny)
+                 tz = rho_vp(i,j,k,ispec)*vn*nz + rho_vs(i,j,k,ispec)*(vz-vn*nz)
+
+                 ! gets associated, weighted jacobian
+                 jacobianw = abs_boundary_jacobian2Dw(igll,iface)
+
+                 ! adds stacey term (weak form)
+                 accel(1,iglob) = accel(1,iglob) - tx*jacobianw
+                 accel(2,iglob) = accel(2,iglob) - ty*jacobianw
+                 accel(3,iglob) = accel(3,iglob) - tz*jacobianw
+
+                 ! adjoint simulations
+                 if (SIMULATION_TYPE == 3) then
+                    b_accel(:,iglob) = b_accel(:,iglob) - b_absorb_field(:,igll,iface)
+                 else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+                    b_absorb_field(1,igll,iface) = tx*jacobianw
+                    b_absorb_field(2,igll,iface) = ty*jacobianw
+                    b_absorb_field(3,igll,iface) = tz*jacobianw
+                 endif !adjoint
+
+              enddo
+           endif ! ispec_is_elastic
+        endif ! ispec_is_inner
+     enddo
+
+  else
+    ! GPU_MODE == .true.
+    if( num_abs_boundary_faces > 0 ) &
+      call compute_stacey_viscoelastic_cuda(Mesh_pointer,phase_is_inner, &
+                                       SAVE_FORWARD,b_absorb_field)
+  endif
+
+  ! adjoint simulations: stores absorbed wavefield part
+  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
+    ! writes out absorbing boundary value only when second phase is running
+    if( phase_is_inner .eqv. .true. ) then
+      ! uses fortran routine
+      !write(IOABS,rec=it) b_reclen_field,b_absorb_field,b_reclen_field
+      ! uses c routine
+      call write_abs(0,b_absorb_field,b_reclen_field,it)
+    endif
+  endif
+
+  end subroutine compute_stacey_viscoelastic
+

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -80,7 +80,7 @@
 
     ! elastic solver
     ! (needs to be done first, before poroelastic one)
-    if( ELASTIC_SIMULATION ) call compute_forces_elastic()
+    if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic()
 
     ! poroelastic solver
     if( POROELASTIC_SIMULATION ) call compute_forces_poroelastic()

Deleted: seismo/3D/SPECFEM3D/trunk/src/specfem3D/older_compute_forces_elastic_Dev_openmp.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/older_compute_forces_elastic_Dev_openmp.f90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/older_compute_forces_elastic_Dev_openmp.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -1,984 +0,0 @@
-!=====================================================================
-!
-!               S p e c f e m 3 D  V e r s i o n  2 . 1
-!               ---------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and CNRS / INRIA / University of Pau
-! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
-!                            October 2011
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-! OpenMP Threaded variant by John Levesque, Max Rietmann and Olaf Schenk
-
-!! DK DK Jan 2013: beware, that OpenMP version is not maintained / supported and thus probably does not work
-
-  subroutine compute_forces_elastic_Dev_openmp(iphase ,NSPEC_AB,NGLOB_AB, &
-                             displ,veloc,accel, &
-                             xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                             hprime_xx,hprime_xxT, &
-                             hprimewgll_xx,hprimewgll_xxT, &
-                             wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                             kappastore,mustore,jacobian,ibool, &
-                             ATTENUATION,deltat, &
-                             one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                             NSPEC_ATTENUATION_AB, &
-                             R_xx,R_yy,R_xy,R_xz,R_yz, &
-                             epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                             epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                             ANISOTROPY,NSPEC_ANISO, &
-                             c11store,c12store,c13store,c14store,c15store,c16store,&
-                             c22store,c23store,c24store,c25store,c26store,c33store,&
-                             c34store,c35store,c36store,c44store,c45store,c46store,&
-                             c55store,c56store,c66store, &
-                             SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                             NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                             is_moho_top,is_moho_bot, &
-                             dsdx_top,dsdx_bot, &
-                             ispec2D_moho_top,ispec2D_moho_bot, &
-                             num_phase_ispec_elastic,&
-                             phase_ispec_inner_elastic,&
-                             num_colors_outer_elastic,num_colors_inner_elastic)
-
-
-
-  ! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-       N_SLS,SAVE_MOHO_MESH, &
-       ONE_THIRD,FOUR_THIRDS,m1,m2
-
-  ! Trying to pass these variables as subroutine arguments ran into
-  ! problems, so we reference them from their module, making them
-  ! accessible from this subroutine
-  use specfem_par_elastic, only:dummyx_loc,dummyy_loc,dummyz_loc,newtempx1,newtempx2,newtempx3, &
-       newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3, &
-       tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3,num_elem_colors_elastic, &
-       dummyx_loc_att,dummyy_loc_att,dummyz_loc_att,tempx1_att,tempx2_att,tempx3_att, &
-       tempy1_att,tempy2_att,tempy3_att,tempz1_att,tempz2_att,tempz3_att
-
-  use fault_solver_dynamic, only : Kelvin_Voigt_eta
-
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-  ! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-
-  ! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-       xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-       kappastore,mustore,jacobian
-
-  ! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-  ! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-       R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-  ! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-       c11store,c12store,c13store,c14store,c15store,c16store, &
-       c22store,c23store,c24store,c25store,c26store,c33store, &
-       c34store,c35store,c36store,c44store,c45store,c46store, &
-       c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-  ! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-       dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  real(kind=CUSTOM_REAL) duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att
-  real(kind=CUSTOM_REAL) duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
-
-  integer OMP_get_thread_num
-  integer OMP_GET_MAX_THREADS
-
-  ! timing
-  !double precision omp_get_wtime
-  !double precision start_time
-  !double precision end_time
-  !double precision accumulate_time_start
-  !double precision accumulate_time_stop
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-       c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-  integer thread_id
-  integer NUM_THREADS
-  !integer omp_get_num_threads ! function
-
-  ! coloring additions
-  ! integer, dimension(:), allocatable :: num_elem_colors_elastic
-  integer istart, estart, number_of_colors
-  integer num_colors_outer_elastic, num_colors_inner_elastic
-  integer icolor
-
-  real(kind=CUSTOM_REAL) :: eta
-
-  ! write(*,*) "num_elem_colors_elastic(1) = ",num_elem_colors_elastic(1)
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! NUM_THREADS = 1
-  NUM_THREADS = OMP_GET_MAX_THREADS()
-
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-     number_of_colors = num_colors_outer_elastic
-     istart = 1
-  else
-     number_of_colors = num_colors_inner_elastic + num_colors_outer_elastic
-     istart = num_colors_outer_elastic+1
-     ! istart = num_colors_outer_elastic
-  endif
-
-  ! "start" timer
-  ! start_time = omp_get_wtime()
-
-  ! The mesh coloring algorithm provides disjoint sets of elements that
-  ! do not share degrees of freedom which is required for the assembly
-  ! step at the "accel(iglob) += update" step. The coloring is
-  ! implemented, such that the element and node indices are ordered by
-  ! color. This requires then only to iterate through the elements in
-  ! order, stopping to synchronize threads after all the elements in a
-  ! color are finished.
-  estart = 1
-  do icolor = istart, number_of_colors
-
-    !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(&
-    !$OMP R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3,&
-    !$OMP factor_loc,alphaval_loc,betaval_loc,gammaval_loc,&
-    !$OMP Sn,Snp1,&
-    !$OMP templ,&
-    !$OMP xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl,&
-    !$OMP duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl,&
-    !$OMP duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,&
-    !$OMP duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,&
-    !$OMP sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,&
-    !$OMP fac1,fac2,fac3,&
-    !$OMP lambdal,mul,lambdalplus2mul,kappal,&
-    !$OMP c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-    !$OMP c33,c34,c35,c36,c44,c45,c46,c55,c56,c66,&
-    !$OMP i_SLS,&
-    !$OMP ispec,iglob,ispec_p,&
-    !$OMP i,j,k,&
-    !$OMP thread_id)
-
-    thread_id = OMP_get_thread_num()+1
-
-    ! we retrive the subset of the total elements determined by the mesh
-    ! coloring. This number changes as we iterate through the colors
-    num_elements = num_elem_colors_elastic(icolor)
-    !$OMP DO
-    do ispec_p = estart,(estart-1)+num_elements
-
-
-       ! returns element id from stored element list
-       ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-       ! adjoint simulations: moho kernel
-       if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-             ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-             ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-       endif ! adjoint
-
-       ! Kelvin Voigt damping: artificial viscosity around dynamic faults
-
-        ! stores displacment values in local array
-        if (allocated(Kelvin_Voigt_eta)) then
-          eta = Kelvin_Voigt_eta(ispec)   
-          do k=1,NGLLZ
-            do j=1,NGLLY
-              do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k,thread_id) = displ(1,iglob) + eta*veloc(1,iglob)
-                dummyy_loc(i,j,k,thread_id) = displ(2,iglob) + eta*veloc(2,iglob)
-                dummyz_loc(i,j,k,thread_id) = displ(3,iglob) + eta*veloc(3,iglob)
-              enddo
-            enddo
-          enddo
-
-        else
-          do k=1,NGLLZ
-            do j=1,NGLLY
-              do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k,thread_id) = displ(1,iglob)
-                dummyy_loc(i,j,k,thread_id) = displ(2,iglob)
-                dummyz_loc(i,j,k,thread_id) = displ(3,iglob)
-              enddo
-            enddo
-          enddo
-        endif
-
-       ! stores velocity values in local array
-       do k=1,NGLLZ
-          do j=1,NGLLY
-             do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc_att(i,j,k,thread_id) = veloc(1,iglob)
-                dummyy_loc_att(i,j,k,thread_id) = veloc(2,iglob)
-                dummyz_loc_att(i,j,k,thread_id) = veloc(3,iglob)
-             enddo
-          enddo
-       enddo
-
-       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-       ! for incompressible fluid flow, Cambridge University Press (2002),
-       ! pages 386 and 389 and Figure 8.3.1
-       ! call mxm_m1_m2_5points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-       do j=1,m2
-          do i=1,m1
-             tempx1(i,j,1,thread_id) = &
-                  hprime_xx(i,1)*dummyx_loc(1,j,1,thread_id) + &
-                  hprime_xx(i,2)*dummyx_loc(2,j,1,thread_id) + &
-                  hprime_xx(i,3)*dummyx_loc(3,j,1,thread_id) + &
-                  hprime_xx(i,4)*dummyx_loc(4,j,1,thread_id) + &
-                  hprime_xx(i,5)*dummyx_loc(5,j,1,thread_id)
-             tempy1(i,j,1,thread_id) = &
-                  hprime_xx(i,1)*dummyy_loc(1,j,1,thread_id) + &
-                  hprime_xx(i,2)*dummyy_loc(2,j,1,thread_id) + &
-                  hprime_xx(i,3)*dummyy_loc(3,j,1,thread_id) + &
-                  hprime_xx(i,4)*dummyy_loc(4,j,1,thread_id) + &
-                  hprime_xx(i,5)*dummyy_loc(5,j,1,thread_id)
-             tempz1(i,j,1,thread_id) = &
-                  hprime_xx(i,1)*dummyz_loc(1,j,1,thread_id) + &
-                  hprime_xx(i,2)*dummyz_loc(2,j,1,thread_id) + &
-                  hprime_xx(i,3)*dummyz_loc(3,j,1,thread_id) + &
-                  hprime_xx(i,4)*dummyz_loc(4,j,1,thread_id) + &
-                  hprime_xx(i,5)*dummyz_loc(5,j,1,thread_id)
-          enddo
-       enddo
-
-       !   call mxm_m1_m1_5points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-       !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-       do j=1,m1
-          do i=1,m1
-             ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-             do k = 1,NGLLX
-                tempx2(i,j,k,thread_id) = dummyx_loc(i,1,k,thread_id)*hprime_xxT(1,j) + &
-                     dummyx_loc(i,2,k,thread_id)*hprime_xxT(2,j) + &
-                     dummyx_loc(i,3,k,thread_id)*hprime_xxT(3,j) + &
-                     dummyx_loc(i,4,k,thread_id)*hprime_xxT(4,j) + &
-                     dummyx_loc(i,5,k,thread_id)*hprime_xxT(5,j)
-                tempy2(i,j,k,thread_id) = dummyy_loc(i,1,k,thread_id)*hprime_xxT(1,j) + &
-                     dummyy_loc(i,2,k,thread_id)*hprime_xxT(2,j) + &
-                     dummyy_loc(i,3,k,thread_id)*hprime_xxT(3,j) + &
-                     dummyy_loc(i,4,k,thread_id)*hprime_xxT(4,j) + &
-                     dummyy_loc(i,5,k,thread_id)*hprime_xxT(5,j)
-                tempz2(i,j,k,thread_id) = dummyz_loc(i,1,k,thread_id)*hprime_xxT(1,j) + &
-                     dummyz_loc(i,2,k,thread_id)*hprime_xxT(2,j) + &
-                     dummyz_loc(i,3,k,thread_id)*hprime_xxT(3,j) + &
-                     dummyz_loc(i,4,k,thread_id)*hprime_xxT(4,j) + &
-                     dummyz_loc(i,5,k,thread_id)*hprime_xxT(5,j)
-             enddo
-          enddo
-       enddo
-
-       ! call mxm_m2_m1_5points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-       do j=1,m1
-          do i=1,m2
-             tempx3(i,1,j,thread_id) = &
-                  dummyx_loc(i,1,1,thread_id)*hprime_xxT(1,j) + &
-                  dummyx_loc(i,1,2,thread_id)*hprime_xxT(2,j) + &
-                  dummyx_loc(i,1,3,thread_id)*hprime_xxT(3,j) + &
-                  dummyx_loc(i,1,4,thread_id)*hprime_xxT(4,j) + &
-                  dummyx_loc(i,1,5,thread_id)*hprime_xxT(5,j)
-             tempy3(i,1,j,thread_id) = &
-                  dummyy_loc(i,1,1,thread_id)*hprime_xxT(1,j) + &
-                  dummyy_loc(i,1,2,thread_id)*hprime_xxT(2,j) + &
-                  dummyy_loc(i,1,3,thread_id)*hprime_xxT(3,j) + &
-                  dummyy_loc(i,1,4,thread_id)*hprime_xxT(4,j) + &
-                  dummyy_loc(i,1,5,thread_id)*hprime_xxT(5,j)
-             tempz3(i,1,j,thread_id) = &
-                  dummyz_loc(i,1,1,thread_id)*hprime_xxT(1,j) + &
-                  dummyz_loc(i,1,2,thread_id)*hprime_xxT(2,j) + &
-                  dummyz_loc(i,1,3,thread_id)*hprime_xxT(3,j) + &
-                  dummyz_loc(i,1,4,thread_id)*hprime_xxT(4,j) + &
-                  dummyz_loc(i,1,5,thread_id)*hprime_xxT(5,j)
-
-          enddo
-       enddo
-
-       if( ATTENUATION .and. COMPUTE_AND_STORE_STRAIN ) then
-          do j=1,m2
-             do i=1,m1
-                tempx1_att(i,j,1,thread_id) = tempx1(i,j,1,thread_id)
-                tempy1_att(i,j,1,thread_id) = tempy1(i,j,1,thread_id)
-                tempz1_att(i,j,1,thread_id) = tempz1(i,j,1,thread_id)
-             enddo
-          enddo
-
-          do j=1,m1
-             do i=1,m1
-                do k=1,NGLLX
-                   tempx2_att(i,j,k,thread_id) = tempx2(i,j,k,thread_id)
-                   tempy2_att(i,j,k,thread_id) = tempy2(i,j,k,thread_id)
-                   tempz2_att(i,j,k,thread_id) = tempz2(i,j,k,thread_id)
-                enddo
-             enddo
-          enddo
-
-          do j=1,m1
-             do i=1,m2
-                tempx3_att(i,1,j,thread_id) = tempx3(i,1,j,thread_id)
-                tempy3_att(i,1,j,thread_id) = tempy3(i,1,j,thread_id)
-                tempz3_att(i,1,j,thread_id) = tempz3(i,1,j,thread_id)
-             enddo
-          enddo
-
-          ! use first order Taylor expansion of displacement for local storage of stresses 
-          ! at this current time step, to fix attenuation in a consistent way
-          do j=1,m2
-             do i=1,m1
-                tempx1l_att(i,j,1,thread_id) = tempx1l_att(i,j,1,thread_id) + &
-                     deltat*hprime_xx(i,1)*dummyx_loc_att(1,j,1,thread_id) + &
-                     deltat*hprime_xx(i,2)*dummyx_loc_att(2,j,1,thread_id) + &
-                     deltat*hprime_xx(i,3)*dummyx_loc_att(3,j,1,thread_id) + &
-                     deltat*hprime_xx(i,4)*dummyx_loc_att(4,j,1,thread_id) + &
-                     deltat*hprime_xx(i,5)*dummyx_loc_att(5,j,1,thread_id)
-
-                tempy1l_att(i,j,1,thread_id) = tempy1l_att(i,j,1,thread_id) + &
-                     deltat*hprime_xx(i,1)*dummyy_loc_att(1,j,1,thread_id) + &
-                     deltat*hprime_xx(i,2)*dummyy_loc_att(2,j,1,thread_id) + &
-                     deltat*hprime_xx(i,3)*dummyy_loc_att(3,j,1,thread_id) + &
-                     deltat*hprime_xx(i,4)*dummyy_loc_att(4,j,1,thread_id) + &
-                     deltat*hprime_xx(i,5)*dummyy_loc_att(5,j,1,thread_id)
-
-                tempz1l_att(i,j,1,thread_id) = tempz1l_att(i,j,1,thread_id) + &
-                     deltat*hprime_xx(i,1)*dummyz_loc_att(1,j,1,thread_id) + &
-                     deltat*hprime_xx(i,2)*dummyz_loc_att(2,j,1,thread_id) + &
-                     deltat*hprime_xx(i,3)*dummyz_loc_att(3,j,1,thread_id) + &
-                     deltat*hprime_xx(i,4)*dummyz_loc_att(4,j,1,thread_id) + &
-                     deltat*hprime_xx(i,5)*dummyz_loc_att(5,j,1,thread_id)
-             enddo
-          enddo
-
-          do j=1,m1
-             do i=1,m1
-                do k=1,NGLLX
-                   tempx2l_att(i,j,k,thread_id) = tempx2l_att(i,j,k,thread_id) + &
-                        deltat*dummyx_loc_att(i,1,k,thread_id)*hprime_xxT(1,j) + &
-                        deltat*dummyx_loc_att(i,2,k,thread_id)*hprime_xxT(2,j) + &
-                        deltat*dummyx_loc_att(i,3,k,thread_id)*hprime_xxT(3,j) + &
-                        deltat*dummyx_loc_att(i,4,k,thread_id)*hprime_xxT(4,j) + &
-                        deltat*dummyx_loc_att(i,5,k,thread_id)*hprime_xxT(5,j)
-
-                   tempy2l_att(i,j,k,thread_id) = tempy2l_att(i,j,k,thread_id) + &
-                        deltat*dummyy_loc_att(i,1,k,thread_id)*hprime_xxT(1,j) + &
-                        deltat*dummyy_loc_att(i,2,k,thread_id)*hprime_xxT(2,j) + &
-                        deltat*dummyy_loc_att(i,3,k,thread_id)*hprime_xxT(3,j) + &
-                        deltat*dummyy_loc_att(i,4,k,thread_id)*hprime_xxT(4,j) + &
-                        deltat*dummyy_loc_att(i,5,k,thread_id)*hprime_xxT(5,j)
-
-                   tempz2l_att(i,j,k,thread_id) = tempz2l_att(i,j,k,thread_id) + &
-                        deltat*dummyz_loc_att(i,1,k,thread_id)*hprime_xxT(1,j) + &
-                        deltat*dummyz_loc_att(i,2,k,thread_id)*hprime_xxT(2,j) + &
-                        deltat*dummyz_loc_att(i,3,k,thread_id)*hprime_xxT(3,j) + &
-                        deltat*dummyz_loc_att(i,4,k,thread_id)*hprime_xxT(4,j) + &
-                        deltat*dummyz_loc_att(i,5,k,thread_id)*hprime_xxT(5,j)
-                enddo
-             enddo
-          enddo
-
-          do j=1,m1
-             do i=1,m2
-                tempx3l_att(i,1,j,thread_id) = tempx3l_att(i,1,j,thread_id) + &
-                     deltat*dummyx_loc_att(i,1,1,thread_id)*hprime_xxT(1,j) + &
-                     deltat*dummyx_loc_att(i,1,2,thread_id)*hprime_xxT(2,j) + &
-                     deltat*dummyx_loc_att(i,1,3,thread_id)*hprime_xxT(3,j) + &
-                     deltat*dummyx_loc_att(i,1,4,thread_id)*hprime_xxT(4,j) + &
-                     deltat*dummyx_loc_att(i,1,5,thread_id)*hprime_xxT(5,j)
-
-                tempy3l_att(i,1,j,thread_id) = tempy3l_att(i,1,j,thread_id) + &
-                     deltat*dummyy_loc_att(i,1,1,thread_id)*hprime_xxT(1,j) + &
-                     deltat*dummyy_loc_att(i,1,2,thread_id)*hprime_xxT(2,j) + &
-                     deltat*dummyy_loc_att(i,1,3,thread_id)*hprime_xxT(3,j) + &
-                     deltat*dummyy_loc_att(i,1,4,thread_id)*hprime_xxT(4,j) + &
-                     deltat*dummyy_loc_att(i,1,5,thread_id)*hprime_xxT(5,j)
-
-                tempz3l_att(i,1,j,thread_id) = tempz3l_att(i,1,j,thread_id) + &
-                     deltat*dummyz_loc_att(i,1,1,thread_id)*hprime_xxT(1,j) + &
-                     deltat*dummyz_loc_att(i,1,2,thread_id)*hprime_xxT(2,j) + &
-                     deltat*dummyz_loc_att(i,1,3,thread_id)*hprime_xxT(3,j) + &
-                     deltat*dummyz_loc_att(i,1,4,thread_id)*hprime_xxT(4,j) + &
-                     deltat*dummyz_loc_att(i,1,5,thread_id)*hprime_xxT(5,j)
-             enddo
-          endif
-       endif
-
-       do k=1,NGLLZ
-          do j=1,NGLLY
-             do i=1,NGLLX
-                ! get derivatives of ux, uy and uz with respect to x, y and z
-                xixl = xix(i,j,k,ispec)
-                xiyl = xiy(i,j,k,ispec)
-                xizl = xiz(i,j,k,ispec)
-                etaxl = etax(i,j,k,ispec)
-                etayl = etay(i,j,k,ispec)
-                etazl = etaz(i,j,k,ispec)
-                gammaxl = gammax(i,j,k,ispec)
-                gammayl = gammay(i,j,k,ispec)
-                gammazl = gammaz(i,j,k,ispec)
-                jacobianl = jacobian(i,j,k,ispec)
-
-                duxdxl = xixl*tempx1(i,j,k,thread_id) + etaxl*tempx2(i,j,k,thread_id) + gammaxl*tempx3(i,j,k,thread_id)
-                duxdyl = xiyl*tempx1(i,j,k,thread_id) + etayl*tempx2(i,j,k,thread_id) + gammayl*tempx3(i,j,k,thread_id)
-                duxdzl = xizl*tempx1(i,j,k,thread_id) + etazl*tempx2(i,j,k,thread_id) + gammazl*tempx3(i,j,k,thread_id)
-
-                duydxl = xixl*tempy1(i,j,k,thread_id) + etaxl*tempy2(i,j,k,thread_id) + gammaxl*tempy3(i,j,k,thread_id)
-                duydyl = xiyl*tempy1(i,j,k,thread_id) + etayl*tempy2(i,j,k,thread_id) + gammayl*tempy3(i,j,k,thread_id)
-                duydzl = xizl*tempy1(i,j,k,thread_id) + etazl*tempy2(i,j,k,thread_id) + gammazl*tempy3(i,j,k,thread_id)
-
-                duzdxl = xixl*tempz1(i,j,k,thread_id) + etaxl*tempz2(i,j,k,thread_id) + gammaxl*tempz3(i,j,k,thread_id)
-                duzdyl = xiyl*tempz1(i,j,k,thread_id) + etayl*tempz2(i,j,k,thread_id) + gammayl*tempz3(i,j,k,thread_id)
-                duzdzl = xizl*tempz1(i,j,k,thread_id) + etazl*tempz2(i,j,k,thread_id) + gammazl*tempz3(i,j,k,thread_id)
-
-                ! save strain on the Moho boundary
-                if (SAVE_MOHO_MESH ) then
-                   if (is_moho_top(ispec)) then
-                      dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                      dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                      dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                      dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                      dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                      dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                      dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                      dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                      dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                   else if (is_moho_bot(ispec)) then
-                      dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                      dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                      dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                      dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                      dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                      dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                      dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                      dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                      dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                   endif
-                endif
-
-                ! precompute some sums to save CPU time
-                duxdxl_plus_duydyl = duxdxl + duydyl
-                duxdxl_plus_duzdzl = duxdxl + duzdzl
-                duydyl_plus_duzdzl = duydyl + duzdzl
-                duxdyl_plus_duydxl = duxdyl + duydxl
-                duzdxl_plus_duxdzl = duzdxl + duxdzl
-                duzdyl_plus_duydzl = duzdyl + duydzl
-                
-                if( ATTENUATION .and. COMPUTE_AND_STORE_STRAIN ) then
-                   ! temporary variables used for fixing attenuation in a consistent way
-                   duxdxl_att = xixl*tempx1l_att(i,j,k,thread_id) + etaxl*tempx2l_att(i,j,k,thread_id) + &
-                        gammaxl*tempx3l_att(i,j,k,thread_id)
-                   duxdyl_att = xiyl*tempx1l_att(i,j,k,thread_id) + etayl*tempx2l_att(i,j,k,thread_id) + &
-                        gammayl*tempx3l_att(i,j,k,thread_id)
-                   duxdzl_att = xizl*tempx1l_att(i,j,k,thread_id) + etazl*tempx2l_att(i,j,k,thread_id) + &
-                        gammazl*tempx3l_att(i,j,k,thread_id)
-
-                   duydxl_att = xixl*tempy1l_att(i,j,k,thread_id) + etaxl*tempy2l_att(i,j,k,thread_id) + &
-                        gammaxl*tempy3l_att(i,j,k,thread_id)
-                   duydyl_att = xiyl*tempy1l_att(i,j,k,thread_id) + etayl*tempy2l_att(i,j,k,thread_id) + &
-                        gammayl*tempy3l_att(i,j,k,thread_id)
-                   duydzl_att = xizl*tempy1l_att(i,j,k,thread_id) + etazl*tempy2l_att(i,j,k,thread_id) + &
-                        gammazl*tempy3l_att(i,j,k,thread_id)
-
-                   duzdxl_att = xixl*tempz1l_att(i,j,k,thread_id) + etaxl*tempz2l_att(i,j,k,thread_id) + &
-                        gammaxl*tempz3l_att(i,j,k,thread_id)
-                   duzdyl_att = xiyl*tempz1l_att(i,j,k,thread_id) + etayl*tempz2l_att(i,j,k,thread_id) + &
-                        gammayl*tempz3l_att(i,j,k,thread_id)
-                   duzdzl_att = xizl*tempz1l_att(i,j,k,thread_id) + etazl*tempz2l_att(i,j,k,thread_id) + &
-                        gammazl*tempz3l_att(i,j,k,thread_id)
-
-                   ! precompute some sums to save CPU time
-                   duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att
-                   duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att
-                   duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att
-
-                   ! compute deviatoric strain
-                   if( SIMULATION_TYPE == 3 ) &
-                        epsilon_trace_over_3(i,j,k,ispec) = ONE_THIRD * (duxdxl_att + duydyl_att + duzdzl_att)
-                   epsilondev_xx_loc(i,j,k) = duxdxl_att - epsilon_trace_over_3(i,j,k,ispec)
-                   epsilondev_yy_loc(i,j,k) = duydyl_att - epsilon_trace_over_3(i,j,k,ispec)
-                   epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl_att
-                   epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl_att
-                   epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl_att
-                else
-                   ! computes deviatoric strain attenuation and/or for kernel calculations
-                   if (COMPUTE_AND_STORE_STRAIN) then
-                      templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                      if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                      epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                      epsilondev_yy_loc(i,j,k) = duydyl - templ
-                      epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                      epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                      epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-                   endif
-                endif
-
-                kappal = kappastore(i,j,k,ispec)
-                mul = mustore(i,j,k,ispec)
-
-                ! attenuation
-                if(ATTENUATION) then
-                   ! use unrelaxed parameters if attenuation
-                   mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-                endif
-
-                ! full anisotropic case, stress calculations
-                if(ANISOTROPY) then
-                   c11 = c11store(i,j,k,ispec)
-                   c12 = c12store(i,j,k,ispec)
-                   c13 = c13store(i,j,k,ispec)
-                   c14 = c14store(i,j,k,ispec)
-                   c15 = c15store(i,j,k,ispec)
-                   c16 = c16store(i,j,k,ispec)
-                   c22 = c22store(i,j,k,ispec)
-                   c23 = c23store(i,j,k,ispec)
-                   c24 = c24store(i,j,k,ispec)
-                   c25 = c25store(i,j,k,ispec)
-                   c26 = c26store(i,j,k,ispec)
-                   c33 = c33store(i,j,k,ispec)
-                   c34 = c34store(i,j,k,ispec)
-                   c35 = c35store(i,j,k,ispec)
-                   c36 = c36store(i,j,k,ispec)
-                   c44 = c44store(i,j,k,ispec)
-                   c45 = c45store(i,j,k,ispec)
-                   c46 = c46store(i,j,k,ispec)
-                   c55 = c55store(i,j,k,ispec)
-                   c56 = c56store(i,j,k,ispec)
-                   c66 = c66store(i,j,k,ispec)
-
-                   sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                        c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                   sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                        c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                   sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                        c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                   sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                        c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                   sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                        c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                   sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                        c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-                else
-
-                   ! isotropic case
-                   lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                   lambdal = lambdalplus2mul - 2.*mul
-
-                   ! compute stress sigma
-                   sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                   sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                   sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                   sigma_xy = mul*duxdyl_plus_duydxl
-                   sigma_xz = mul*duzdxl_plus_duxdzl
-                   sigma_yz = mul*duzdyl_plus_duydzl
-
-                endif ! ANISOTROPY
-
-                ! subtract memory variables if attenuation
-                if(ATTENUATION) then
-                   ! way 1
-                   !                do i_sls = 1,N_SLS
-                   !                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-                   !                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-                   !                  sigma_xx = sigma_xx - R_xx_val
-                   !                  sigma_yy = sigma_yy - R_yy_val
-                   !                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-                   !                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                   !                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                   !                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                   !                enddo
-
-                   ! way 2
-                   ! note: this should help compilers to pipeline the code and make better use of the cache;
-                   !          depending on compilers, it can further decrease the computation time by ~ 30%.
-                   !          by default, N_SLS = 3, therefore we take steps of 3
-                   if(imodulo_N_SLS >= 1) then
-                      do i_sls = 1,imodulo_N_SLS
-                         R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                         R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                         sigma_xx = sigma_xx - R_xx_val1
-                         sigma_yy = sigma_yy - R_yy_val1
-                         sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                      enddo
-                   endif
-
-                   if(N_SLS >= imodulo_N_SLS+1) then
-                      do i_sls = imodulo_N_SLS+1,N_SLS,3
-                         R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                         R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                         sigma_xx = sigma_xx - R_xx_val1
-                         sigma_yy = sigma_yy - R_yy_val1
-                         sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                         R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                         R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                         sigma_xx = sigma_xx - R_xx_val2
-                         sigma_yy = sigma_yy - R_yy_val2
-                         sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                         R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                         R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                         sigma_xx = sigma_xx - R_xx_val3
-                         sigma_yy = sigma_yy - R_yy_val3
-                         sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                      enddo
-                   endif
-
-
-                endif
-
-                ! form dot product with test vector, symmetric form
-                tempx1(i,j,k,thread_id) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl)
-                tempy1(i,j,k,thread_id) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl)
-                tempz1(i,j,k,thread_id) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
-
-                tempx2(i,j,k,thread_id) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl)
-                tempy2(i,j,k,thread_id) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl)
-                tempz2(i,j,k,thread_id) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl)
-
-                tempx3(i,j,k,thread_id) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl)
-                tempy3(i,j,k,thread_id) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl)
-                tempz3(i,j,k,thread_id) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl)
-
-             enddo
-          enddo
-       enddo
-
-       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-       ! for incompressible fluid flow, Cambridge University Press (2002),
-       ! pages 386 and 389 and Figure 8.3.1
-       ! call mxm_m1_m2_5points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-       do j=1,m2
-          do i=1,m1
-             newtempx1(i,j,1,thread_id) = &
-                  hprimewgll_xxT(i,1)*tempx1(1,j,1,thread_id) + &
-                  hprimewgll_xxT(i,2)*tempx1(2,j,1,thread_id) + &
-                  hprimewgll_xxT(i,3)*tempx1(3,j,1,thread_id) + &
-                  hprimewgll_xxT(i,4)*tempx1(4,j,1,thread_id) + &
-                  hprimewgll_xxT(i,5)*tempx1(5,j,1,thread_id)
-             newtempy1(i,j,1,thread_id) = &
-                  hprimewgll_xxT(i,1)*tempy1(1,j,1,thread_id) + &
-                  hprimewgll_xxT(i,2)*tempy1(2,j,1,thread_id) + &
-                  hprimewgll_xxT(i,3)*tempy1(3,j,1,thread_id) + &
-                  hprimewgll_xxT(i,4)*tempy1(4,j,1,thread_id) + &
-                  hprimewgll_xxT(i,5)*tempy1(5,j,1,thread_id)
-             newtempz1(i,j,1,thread_id) = &
-                  hprimewgll_xxT(i,1)*tempz1(1,j,1,thread_id) + &
-                  hprimewgll_xxT(i,2)*tempz1(2,j,1,thread_id) + &
-                  hprimewgll_xxT(i,3)*tempz1(3,j,1,thread_id) + &
-                  hprimewgll_xxT(i,4)*tempz1(4,j,1,thread_id) + &
-                  hprimewgll_xxT(i,5)*tempz1(5,j,1,thread_id)
-          enddo
-       enddo
-
-       !   call mxm_m1_m1_5points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-       !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-       do i=1,m1
-          do j=1,m1
-             ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-             do k = 1,NGLLX
-                newtempx2(i,j,k,thread_id) = tempx2(i,1,k,thread_id)*hprimewgll_xx(1,j) + &
-                     tempx2(i,2,k,thread_id)*hprimewgll_xx(2,j) + &
-                     tempx2(i,3,k,thread_id)*hprimewgll_xx(3,j) + &
-                     tempx2(i,4,k,thread_id)*hprimewgll_xx(4,j) + &
-                     tempx2(i,5,k,thread_id)*hprimewgll_xx(5,j)
-                newtempy2(i,j,k,thread_id) = tempy2(i,1,k,thread_id)*hprimewgll_xx(1,j) + &
-                     tempy2(i,2,k,thread_id)*hprimewgll_xx(2,j) + &
-                     tempy2(i,3,k,thread_id)*hprimewgll_xx(3,j) + &
-                     tempy2(i,4,k,thread_id)*hprimewgll_xx(4,j) + &
-                     tempy2(i,5,k,thread_id)*hprimewgll_xx(5,j)
-                newtempz2(i,j,k,thread_id) = tempz2(i,1,k,thread_id)*hprimewgll_xx(1,j) + &
-                     tempz2(i,2,k,thread_id)*hprimewgll_xx(2,j) + &
-                     tempz2(i,3,k,thread_id)*hprimewgll_xx(3,j) + &
-                     tempz2(i,4,k,thread_id)*hprimewgll_xx(4,j) + &
-                     tempz2(i,5,k,thread_id)*hprimewgll_xx(5,j)
-             enddo
-          enddo
-       enddo
-
-       ! call mxm_m2_m1_5points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-       do j=1,m1
-          do i=1,m2
-             newtempx3(i,1,j,thread_id) = &
-                  tempx3(i,1,1,thread_id)*hprimewgll_xx(1,j) + &
-                  tempx3(i,1,2,thread_id)*hprimewgll_xx(2,j) + &
-                  tempx3(i,1,3,thread_id)*hprimewgll_xx(3,j) + &
-                  tempx3(i,1,4,thread_id)*hprimewgll_xx(4,j) + &
-                  tempx3(i,1,5,thread_id)*hprimewgll_xx(5,j)
-             newtempy3(i,1,j,thread_id) = &
-                  tempy3(i,1,1,thread_id)*hprimewgll_xx(1,j) + &
-                  tempy3(i,1,2,thread_id)*hprimewgll_xx(2,j) + &
-                  tempy3(i,1,3,thread_id)*hprimewgll_xx(3,j) + &
-                  tempy3(i,1,4,thread_id)*hprimewgll_xx(4,j) + &
-                  tempy3(i,1,5,thread_id)*hprimewgll_xx(5,j)
-             newtempz3(i,1,j,thread_id) = &
-                  tempz3(i,1,1,thread_id)*hprimewgll_xx(1,j) + &
-                  tempz3(i,1,2,thread_id)*hprimewgll_xx(2,j) + &
-                  tempz3(i,1,3,thread_id)*hprimewgll_xx(3,j) + &
-                  tempz3(i,1,4,thread_id)*hprimewgll_xx(4,j) + &
-                  tempz3(i,1,5,thread_id)*hprimewgll_xx(5,j)
-          enddo
-       enddo
-
-       do k=1,NGLLZ
-          do j=1,NGLLY
-             do i=1,NGLLX
-
-                fac1 = wgllwgll_yz(j,k)
-                fac2 = wgllwgll_xz(i,k)
-                fac3 = wgllwgll_xy(i,j)
-
-                ! sum contributions from each element to the global mesh using indirect addressing
-                iglob = ibool(i,j,k,ispec)
-
-                ! accel_omp(1,iglob,thread_id) = accel_omp(1,iglob,thread_id)&
-                !      - fac1*newtempx1(i,j,k,thread_id) - fac2*newtempx2(i,j,k,thread_id)&
-                !      - fac3*newtempx3(i,j,k,thread_id)
-                ! accel_omp(2,iglob,thread_id) = accel_omp(2,iglob,thread_id)&
-                !      - fac1*newtempy1(i,j,k,thread_id) - fac2*newtempy2(i,j,k,thread_id)&
-                !      - fac3*newtempy3(i,j,k,thread_id)
-                ! accel_omp(3,iglob,thread_id) = accel_omp(3,iglob,thread_id)&
-                !      - fac1*newtempz1(i,j,k,thread_id) - fac2*newtempz2(i,j,k,thread_id)&
-                !      - fac3*newtempz3(i,j,k,thread_id)
-
-                ! Assembly of shared degrees of freedom fixed through mesh coloring
-                !! !$OMP ATOMIC
-                accel(1,iglob) = accel(1,iglob) &
-                                  - (fac1*newtempx1(i,j,k,thread_id) &
-                                   + fac2*newtempx2(i,j,k,thread_id) &
-                                   + fac3*newtempx3(i,j,k,thread_id))
-                !! !$OMP ATOMIC
-                accel(2,iglob) = accel(2,iglob) &
-                                  - (fac1*newtempy1(i,j,k,thread_id) &
-                                   + fac2*newtempy2(i,j,k,thread_id) &
-                                   + fac3*newtempy3(i,j,k,thread_id))
-                !! !$OMP ATOMIC
-                accel(3,iglob) = accel(3,iglob) &
-                                  - (fac1*newtempz1(i,j,k,thread_id) &
-                                   + fac2*newtempz2(i,j,k,thread_id) &
-                                   + fac3*newtempz3(i,j,k,thread_id))
-
-                ! accel(1,iglob) = accel(1,iglob) - &
-                ! (fac1*newtempx1(i,j,k,thread_id) + fac2*newtempx2(i,j,k,thread_id) + fac3*newtempx3(i,j,k,thread_id))
-                ! accel(2,iglob) = accel(2,iglob) - &
-                ! (fac1*newtempy1(i,j,k,thread_id) + fac2*newtempy2(i,j,k,thread_id) + fac3*newtempy3(i,j,k,thread_id))
-                ! accel(3,iglob) = accel(3,iglob) - &
-                ! (fac1*newtempz1(i,j,k,thread_id) + fac2*newtempz2(i,j,k,thread_id) + fac3*newtempz3(i,j,k,thread_id))
-
-                ! accel_omp(1,iglob,thread_id) = accel_omp(1,iglob,thread_id) - fac1*newtempx1(i,j,k,thread_id) - &
-                !                   fac2*newtempx2(i,j,k,thread_id) - fac3*newtempx3(i,j,k,thread_id)
-                ! accel_omp(2,iglob,thread_id) = accel_omp(2,iglob,thread_id) - fac1*newtempy1(i,j,k,thread_id) - &
-                !                   fac2*newtempy2(i,j,k,thread_id) - fac3*newtempy3(i,j,k,thread_id)
-                ! accel_omp(3,iglob,thread_id) = accel_omp(3,iglob,thread_id) - fac1*newtempz1(i,j,k,thread_id) - &
-                !      fac2*newtempz2(i,j,k,thread_id) - fac3*newtempz3(i,j,k,thread_id)
-
-                !  update memory variables based upon the Runge-Kutta scheme
-                if(ATTENUATION) then
-
-                   ! use Runge-Kutta scheme to march in time
-                   do i_sls = 1,N_SLS
-
-                      factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                      alphaval_loc = alphaval(i_sls)
-                      betaval_loc = betaval(i_sls)
-                      gammaval_loc = gammaval(i_sls)
-
-                      ! term in xx
-                      Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                      Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                      R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                           betaval_loc * Sn + gammaval_loc * Snp1
-                      ! term in yy
-                      Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                      Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                      R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                           betaval_loc * Sn + gammaval_loc * Snp1
-                      ! term in zz not computed since zero trace
-                      ! term in xy
-                      Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                      Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                      R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                           betaval_loc * Sn + gammaval_loc * Snp1
-                      ! term in xz
-                      Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                      Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                      R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                           betaval_loc * Sn + gammaval_loc * Snp1
-                      ! term in yz
-                      Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                      Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                      R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                           betaval_loc * Sn + gammaval_loc * Snp1
-
-                   enddo   ! end of loop on memory variables
-
-                endif  !  end attenuation
-
-             enddo
-          enddo
-       enddo
-
-       ! save deviatoric strain for Runge-Kutta scheme
-       if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-       endif
-
-    enddo  ! spectral element loop
-    !$OMP END DO
-    !$OMP END PARALLEL
-
-    ! The elements are in order of color. First we do color 1 elements,
-    ! then color 2, etc. The ispec has to moved to start at the next
-    ! color.
-    estart = estart + num_elements
-
-  enddo ! loop over colors
-
-
-  ! "stop" timer
-  ! end_time = omp_get_wtime()
-
-  ! write(*,*) "Total Elapsed time: ", (end_time-start_time) , "seconds. (Threads=",NUM_THREADS,")"
-  ! write(*,*) "Accumulate Elapsed time: ", (accumulate_time_stop-accumulate_time_start) , "seconds"
-
-
-  ! These are now allocated at the beginning and never deallocated
-  ! because the program just finishes at the end.
-
-  ! deallocate(dummyx_loc)
-  ! deallocate(dummyy_loc)
-  ! deallocate(dummyz_loc)
-  ! deallocate(dummyx_loc_att)
-  ! deallocate(dummyy_loc_att)
-  ! deallocate(dummyz_loc_att)
-  ! deallocate(newtempx1)
-  ! deallocate(newtempx2)
-  ! deallocate(newtempx3)
-  ! deallocate(newtempy1)
-  ! deallocate(newtempy2)
-  ! deallocate(newtempy3)
-  ! deallocate(newtempz1)
-  ! deallocate(newtempz2)
-  ! deallocate(newtempz3)
-  ! deallocate(tempx1)
-  ! deallocate(tempx2)
-  ! deallocate(tempx3)
-  ! deallocate(tempy1)
-  ! deallocate(tempy2)
-  ! deallocate(tempy3)
-  ! deallocate(tempz1)
-  ! deallocate(tempz2)
-  ! deallocate(tempz3)
-  ! deallocate(tempx1_att)
-  ! deallocate(tempx2_att)
-  ! deallocate(tempx3_att)
-  ! deallocate(tempy1_att)
-  ! deallocate(tempy2_att)
-  ! deallocate(tempy3_att)
-  ! deallocate(tempz1_att)
-  ! deallocate(tempz2_att)
-  ! deallocate(tempz3_att)
-
-  ! accel(:,:) = accel_omp(:,:,1)
-
-  end subroutine compute_forces_elastic_Dev_openmp
-
-

Copied: seismo/3D/SPECFEM3D/trunk/src/specfem3D/older_not_maintained_compute_forces_viscoelastic_Dev_openmp.f90 (from rev 21239, seismo/3D/SPECFEM3D/trunk/src/specfem3D/older_compute_forces_elastic_Dev_openmp.f90)
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/older_not_maintained_compute_forces_viscoelastic_Dev_openmp.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/older_not_maintained_compute_forces_viscoelastic_Dev_openmp.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -0,0 +1,984 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 1
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                            October 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+! OpenMP Threaded variant by John Levesque, Max Rietmann and Olaf Schenk
+
+!! DK DK Jan 2013: beware, that OpenMP version is not maintained / supported and thus probably does not work
+
+  subroutine compute_forces_viscoelastic_Dev_openmp(iphase ,NSPEC_AB,NGLOB_AB, &
+                             displ,veloc,accel, &
+                             xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                             hprime_xx,hprime_xxT, &
+                             hprimewgll_xx,hprimewgll_xxT, &
+                             wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                             kappastore,mustore,jacobian,ibool, &
+                             ATTENUATION,deltat, &
+                             one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                             NSPEC_ATTENUATION_AB, &
+                             R_xx,R_yy,R_xy,R_xz,R_yz, &
+                             epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                             epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                             ANISOTROPY,NSPEC_ANISO, &
+                             c11store,c12store,c13store,c14store,c15store,c16store,&
+                             c22store,c23store,c24store,c25store,c26store,c33store,&
+                             c34store,c35store,c36store,c44store,c45store,c46store,&
+                             c55store,c56store,c66store, &
+                             SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                             NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                             is_moho_top,is_moho_bot, &
+                             dsdx_top,dsdx_bot, &
+                             ispec2D_moho_top,ispec2D_moho_bot, &
+                             num_phase_ispec_elastic,&
+                             phase_ispec_inner_elastic,&
+                             num_colors_outer_elastic,num_colors_inner_elastic)
+
+
+
+  ! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+       N_SLS,SAVE_MOHO_MESH, &
+       ONE_THIRD,FOUR_THIRDS,m1,m2
+
+  ! Trying to pass these variables as subroutine arguments ran into
+  ! problems, so we reference them from their module, making them
+  ! accessible from this subroutine
+  use specfem_par_elastic, only:dummyx_loc,dummyy_loc,dummyz_loc,newtempx1,newtempx2,newtempx3, &
+       newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3, &
+       tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3,num_elem_colors_elastic, &
+       dummyx_loc_att,dummyy_loc_att,dummyz_loc_att,tempx1_att,tempx2_att,tempx3_att, &
+       tempy1_att,tempy2_att,tempy3_att,tempz1_att,tempz2_att,tempz3_att
+
+  use fault_solver_dynamic, only : Kelvin_Voigt_eta
+
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+  ! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+
+  ! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+       xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+       kappastore,mustore,jacobian
+
+  ! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+  ! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+       R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+  ! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+       c11store,c12store,c13store,c14store,c15store,c16store, &
+       c22store,c23store,c24store,c25store,c26store,c33store, &
+       c34store,c35store,c36store,c44store,c45store,c46store, &
+       c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+  ! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+       dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  real(kind=CUSTOM_REAL) duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att
+  real(kind=CUSTOM_REAL) duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
+
+  integer OMP_get_thread_num
+  integer OMP_GET_MAX_THREADS
+
+  ! timing
+  !double precision omp_get_wtime
+  !double precision start_time
+  !double precision end_time
+  !double precision accumulate_time_start
+  !double precision accumulate_time_stop
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+       c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+  integer thread_id
+  integer NUM_THREADS
+  !integer omp_get_num_threads ! function
+
+  ! coloring additions
+  ! integer, dimension(:), allocatable :: num_elem_colors_elastic
+  integer istart, estart, number_of_colors
+  integer num_colors_outer_elastic, num_colors_inner_elastic
+  integer icolor
+
+  real(kind=CUSTOM_REAL) :: eta
+
+  ! write(*,*) "num_elem_colors_elastic(1) = ",num_elem_colors_elastic(1)
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! NUM_THREADS = 1
+  NUM_THREADS = OMP_GET_MAX_THREADS()
+
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+     number_of_colors = num_colors_outer_elastic
+     istart = 1
+  else
+     number_of_colors = num_colors_inner_elastic + num_colors_outer_elastic
+     istart = num_colors_outer_elastic+1
+     ! istart = num_colors_outer_elastic
+  endif
+
+  ! "start" timer
+  ! start_time = omp_get_wtime()
+
+  ! The mesh coloring algorithm provides disjoint sets of elements that
+  ! do not share degrees of freedom which is required for the assembly
+  ! step at the "accel(iglob) += update" step. The coloring is
+  ! implemented, such that the element and node indices are ordered by
+  ! color. This requires then only to iterate through the elements in
+  ! order, stopping to synchronize threads after all the elements in a
+  ! color are finished.
+  estart = 1
+  do icolor = istart, number_of_colors
+
+    !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(&
+    !$OMP R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3,&
+    !$OMP factor_loc,alphaval_loc,betaval_loc,gammaval_loc,&
+    !$OMP Sn,Snp1,&
+    !$OMP templ,&
+    !$OMP xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl,&
+    !$OMP duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl,&
+    !$OMP duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,&
+    !$OMP duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,&
+    !$OMP sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,&
+    !$OMP fac1,fac2,fac3,&
+    !$OMP lambdal,mul,lambdalplus2mul,kappal,&
+    !$OMP c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+    !$OMP c33,c34,c35,c36,c44,c45,c46,c55,c56,c66,&
+    !$OMP i_SLS,&
+    !$OMP ispec,iglob,ispec_p,&
+    !$OMP i,j,k,&
+    !$OMP thread_id)
+
+    thread_id = OMP_get_thread_num()+1
+
+    ! we retrive the subset of the total elements determined by the mesh
+    ! coloring. This number changes as we iterate through the colors
+    num_elements = num_elem_colors_elastic(icolor)
+    !$OMP DO
+    do ispec_p = estart,(estart-1)+num_elements
+
+
+       ! returns element id from stored element list
+       ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+       ! adjoint simulations: moho kernel
+       if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+             ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+             ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+       endif ! adjoint
+
+       ! Kelvin Voigt damping: artificial viscosity around dynamic faults
+
+        ! stores displacment values in local array
+        if (allocated(Kelvin_Voigt_eta)) then
+          eta = Kelvin_Voigt_eta(ispec)   
+          do k=1,NGLLZ
+            do j=1,NGLLY
+              do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k,thread_id) = displ(1,iglob) + eta*veloc(1,iglob)
+                dummyy_loc(i,j,k,thread_id) = displ(2,iglob) + eta*veloc(2,iglob)
+                dummyz_loc(i,j,k,thread_id) = displ(3,iglob) + eta*veloc(3,iglob)
+              enddo
+            enddo
+          enddo
+
+        else
+          do k=1,NGLLZ
+            do j=1,NGLLY
+              do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k,thread_id) = displ(1,iglob)
+                dummyy_loc(i,j,k,thread_id) = displ(2,iglob)
+                dummyz_loc(i,j,k,thread_id) = displ(3,iglob)
+              enddo
+            enddo
+          enddo
+        endif
+
+       ! stores velocity values in local array
+       do k=1,NGLLZ
+          do j=1,NGLLY
+             do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc_att(i,j,k,thread_id) = veloc(1,iglob)
+                dummyy_loc_att(i,j,k,thread_id) = veloc(2,iglob)
+                dummyz_loc_att(i,j,k,thread_id) = veloc(3,iglob)
+             enddo
+          enddo
+       enddo
+
+       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+       ! for incompressible fluid flow, Cambridge University Press (2002),
+       ! pages 386 and 389 and Figure 8.3.1
+       ! call mxm_m1_m2_5points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+       do j=1,m2
+          do i=1,m1
+             tempx1(i,j,1,thread_id) = &
+                  hprime_xx(i,1)*dummyx_loc(1,j,1,thread_id) + &
+                  hprime_xx(i,2)*dummyx_loc(2,j,1,thread_id) + &
+                  hprime_xx(i,3)*dummyx_loc(3,j,1,thread_id) + &
+                  hprime_xx(i,4)*dummyx_loc(4,j,1,thread_id) + &
+                  hprime_xx(i,5)*dummyx_loc(5,j,1,thread_id)
+             tempy1(i,j,1,thread_id) = &
+                  hprime_xx(i,1)*dummyy_loc(1,j,1,thread_id) + &
+                  hprime_xx(i,2)*dummyy_loc(2,j,1,thread_id) + &
+                  hprime_xx(i,3)*dummyy_loc(3,j,1,thread_id) + &
+                  hprime_xx(i,4)*dummyy_loc(4,j,1,thread_id) + &
+                  hprime_xx(i,5)*dummyy_loc(5,j,1,thread_id)
+             tempz1(i,j,1,thread_id) = &
+                  hprime_xx(i,1)*dummyz_loc(1,j,1,thread_id) + &
+                  hprime_xx(i,2)*dummyz_loc(2,j,1,thread_id) + &
+                  hprime_xx(i,3)*dummyz_loc(3,j,1,thread_id) + &
+                  hprime_xx(i,4)*dummyz_loc(4,j,1,thread_id) + &
+                  hprime_xx(i,5)*dummyz_loc(5,j,1,thread_id)
+          enddo
+       enddo
+
+       !   call mxm_m1_m1_5points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+       !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+       do j=1,m1
+          do i=1,m1
+             ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+             do k = 1,NGLLX
+                tempx2(i,j,k,thread_id) = dummyx_loc(i,1,k,thread_id)*hprime_xxT(1,j) + &
+                     dummyx_loc(i,2,k,thread_id)*hprime_xxT(2,j) + &
+                     dummyx_loc(i,3,k,thread_id)*hprime_xxT(3,j) + &
+                     dummyx_loc(i,4,k,thread_id)*hprime_xxT(4,j) + &
+                     dummyx_loc(i,5,k,thread_id)*hprime_xxT(5,j)
+                tempy2(i,j,k,thread_id) = dummyy_loc(i,1,k,thread_id)*hprime_xxT(1,j) + &
+                     dummyy_loc(i,2,k,thread_id)*hprime_xxT(2,j) + &
+                     dummyy_loc(i,3,k,thread_id)*hprime_xxT(3,j) + &
+                     dummyy_loc(i,4,k,thread_id)*hprime_xxT(4,j) + &
+                     dummyy_loc(i,5,k,thread_id)*hprime_xxT(5,j)
+                tempz2(i,j,k,thread_id) = dummyz_loc(i,1,k,thread_id)*hprime_xxT(1,j) + &
+                     dummyz_loc(i,2,k,thread_id)*hprime_xxT(2,j) + &
+                     dummyz_loc(i,3,k,thread_id)*hprime_xxT(3,j) + &
+                     dummyz_loc(i,4,k,thread_id)*hprime_xxT(4,j) + &
+                     dummyz_loc(i,5,k,thread_id)*hprime_xxT(5,j)
+             enddo
+          enddo
+       enddo
+
+       ! call mxm_m2_m1_5points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+       do j=1,m1
+          do i=1,m2
+             tempx3(i,1,j,thread_id) = &
+                  dummyx_loc(i,1,1,thread_id)*hprime_xxT(1,j) + &
+                  dummyx_loc(i,1,2,thread_id)*hprime_xxT(2,j) + &
+                  dummyx_loc(i,1,3,thread_id)*hprime_xxT(3,j) + &
+                  dummyx_loc(i,1,4,thread_id)*hprime_xxT(4,j) + &
+                  dummyx_loc(i,1,5,thread_id)*hprime_xxT(5,j)
+             tempy3(i,1,j,thread_id) = &
+                  dummyy_loc(i,1,1,thread_id)*hprime_xxT(1,j) + &
+                  dummyy_loc(i,1,2,thread_id)*hprime_xxT(2,j) + &
+                  dummyy_loc(i,1,3,thread_id)*hprime_xxT(3,j) + &
+                  dummyy_loc(i,1,4,thread_id)*hprime_xxT(4,j) + &
+                  dummyy_loc(i,1,5,thread_id)*hprime_xxT(5,j)
+             tempz3(i,1,j,thread_id) = &
+                  dummyz_loc(i,1,1,thread_id)*hprime_xxT(1,j) + &
+                  dummyz_loc(i,1,2,thread_id)*hprime_xxT(2,j) + &
+                  dummyz_loc(i,1,3,thread_id)*hprime_xxT(3,j) + &
+                  dummyz_loc(i,1,4,thread_id)*hprime_xxT(4,j) + &
+                  dummyz_loc(i,1,5,thread_id)*hprime_xxT(5,j)
+
+          enddo
+       enddo
+
+       if( ATTENUATION .and. COMPUTE_AND_STORE_STRAIN ) then
+          do j=1,m2
+             do i=1,m1
+                tempx1_att(i,j,1,thread_id) = tempx1(i,j,1,thread_id)
+                tempy1_att(i,j,1,thread_id) = tempy1(i,j,1,thread_id)
+                tempz1_att(i,j,1,thread_id) = tempz1(i,j,1,thread_id)
+             enddo
+          enddo
+
+          do j=1,m1
+             do i=1,m1
+                do k=1,NGLLX
+                   tempx2_att(i,j,k,thread_id) = tempx2(i,j,k,thread_id)
+                   tempy2_att(i,j,k,thread_id) = tempy2(i,j,k,thread_id)
+                   tempz2_att(i,j,k,thread_id) = tempz2(i,j,k,thread_id)
+                enddo
+             enddo
+          enddo
+
+          do j=1,m1
+             do i=1,m2
+                tempx3_att(i,1,j,thread_id) = tempx3(i,1,j,thread_id)
+                tempy3_att(i,1,j,thread_id) = tempy3(i,1,j,thread_id)
+                tempz3_att(i,1,j,thread_id) = tempz3(i,1,j,thread_id)
+             enddo
+          enddo
+
+          ! use first order Taylor expansion of displacement for local storage of stresses 
+          ! at this current time step, to fix attenuation in a consistent way
+          do j=1,m2
+             do i=1,m1
+                tempx1l_att(i,j,1,thread_id) = tempx1l_att(i,j,1,thread_id) + &
+                     deltat*hprime_xx(i,1)*dummyx_loc_att(1,j,1,thread_id) + &
+                     deltat*hprime_xx(i,2)*dummyx_loc_att(2,j,1,thread_id) + &
+                     deltat*hprime_xx(i,3)*dummyx_loc_att(3,j,1,thread_id) + &
+                     deltat*hprime_xx(i,4)*dummyx_loc_att(4,j,1,thread_id) + &
+                     deltat*hprime_xx(i,5)*dummyx_loc_att(5,j,1,thread_id)
+
+                tempy1l_att(i,j,1,thread_id) = tempy1l_att(i,j,1,thread_id) + &
+                     deltat*hprime_xx(i,1)*dummyy_loc_att(1,j,1,thread_id) + &
+                     deltat*hprime_xx(i,2)*dummyy_loc_att(2,j,1,thread_id) + &
+                     deltat*hprime_xx(i,3)*dummyy_loc_att(3,j,1,thread_id) + &
+                     deltat*hprime_xx(i,4)*dummyy_loc_att(4,j,1,thread_id) + &
+                     deltat*hprime_xx(i,5)*dummyy_loc_att(5,j,1,thread_id)
+
+                tempz1l_att(i,j,1,thread_id) = tempz1l_att(i,j,1,thread_id) + &
+                     deltat*hprime_xx(i,1)*dummyz_loc_att(1,j,1,thread_id) + &
+                     deltat*hprime_xx(i,2)*dummyz_loc_att(2,j,1,thread_id) + &
+                     deltat*hprime_xx(i,3)*dummyz_loc_att(3,j,1,thread_id) + &
+                     deltat*hprime_xx(i,4)*dummyz_loc_att(4,j,1,thread_id) + &
+                     deltat*hprime_xx(i,5)*dummyz_loc_att(5,j,1,thread_id)
+             enddo
+          enddo
+
+          do j=1,m1
+             do i=1,m1
+                do k=1,NGLLX
+                   tempx2l_att(i,j,k,thread_id) = tempx2l_att(i,j,k,thread_id) + &
+                        deltat*dummyx_loc_att(i,1,k,thread_id)*hprime_xxT(1,j) + &
+                        deltat*dummyx_loc_att(i,2,k,thread_id)*hprime_xxT(2,j) + &
+                        deltat*dummyx_loc_att(i,3,k,thread_id)*hprime_xxT(3,j) + &
+                        deltat*dummyx_loc_att(i,4,k,thread_id)*hprime_xxT(4,j) + &
+                        deltat*dummyx_loc_att(i,5,k,thread_id)*hprime_xxT(5,j)
+
+                   tempy2l_att(i,j,k,thread_id) = tempy2l_att(i,j,k,thread_id) + &
+                        deltat*dummyy_loc_att(i,1,k,thread_id)*hprime_xxT(1,j) + &
+                        deltat*dummyy_loc_att(i,2,k,thread_id)*hprime_xxT(2,j) + &
+                        deltat*dummyy_loc_att(i,3,k,thread_id)*hprime_xxT(3,j) + &
+                        deltat*dummyy_loc_att(i,4,k,thread_id)*hprime_xxT(4,j) + &
+                        deltat*dummyy_loc_att(i,5,k,thread_id)*hprime_xxT(5,j)
+
+                   tempz2l_att(i,j,k,thread_id) = tempz2l_att(i,j,k,thread_id) + &
+                        deltat*dummyz_loc_att(i,1,k,thread_id)*hprime_xxT(1,j) + &
+                        deltat*dummyz_loc_att(i,2,k,thread_id)*hprime_xxT(2,j) + &
+                        deltat*dummyz_loc_att(i,3,k,thread_id)*hprime_xxT(3,j) + &
+                        deltat*dummyz_loc_att(i,4,k,thread_id)*hprime_xxT(4,j) + &
+                        deltat*dummyz_loc_att(i,5,k,thread_id)*hprime_xxT(5,j)
+                enddo
+             enddo
+          enddo
+
+          do j=1,m1
+             do i=1,m2
+                tempx3l_att(i,1,j,thread_id) = tempx3l_att(i,1,j,thread_id) + &
+                     deltat*dummyx_loc_att(i,1,1,thread_id)*hprime_xxT(1,j) + &
+                     deltat*dummyx_loc_att(i,1,2,thread_id)*hprime_xxT(2,j) + &
+                     deltat*dummyx_loc_att(i,1,3,thread_id)*hprime_xxT(3,j) + &
+                     deltat*dummyx_loc_att(i,1,4,thread_id)*hprime_xxT(4,j) + &
+                     deltat*dummyx_loc_att(i,1,5,thread_id)*hprime_xxT(5,j)
+
+                tempy3l_att(i,1,j,thread_id) = tempy3l_att(i,1,j,thread_id) + &
+                     deltat*dummyy_loc_att(i,1,1,thread_id)*hprime_xxT(1,j) + &
+                     deltat*dummyy_loc_att(i,1,2,thread_id)*hprime_xxT(2,j) + &
+                     deltat*dummyy_loc_att(i,1,3,thread_id)*hprime_xxT(3,j) + &
+                     deltat*dummyy_loc_att(i,1,4,thread_id)*hprime_xxT(4,j) + &
+                     deltat*dummyy_loc_att(i,1,5,thread_id)*hprime_xxT(5,j)
+
+                tempz3l_att(i,1,j,thread_id) = tempz3l_att(i,1,j,thread_id) + &
+                     deltat*dummyz_loc_att(i,1,1,thread_id)*hprime_xxT(1,j) + &
+                     deltat*dummyz_loc_att(i,1,2,thread_id)*hprime_xxT(2,j) + &
+                     deltat*dummyz_loc_att(i,1,3,thread_id)*hprime_xxT(3,j) + &
+                     deltat*dummyz_loc_att(i,1,4,thread_id)*hprime_xxT(4,j) + &
+                     deltat*dummyz_loc_att(i,1,5,thread_id)*hprime_xxT(5,j)
+             enddo
+          endif
+       endif
+
+       do k=1,NGLLZ
+          do j=1,NGLLY
+             do i=1,NGLLX
+                ! get derivatives of ux, uy and uz with respect to x, y and z
+                xixl = xix(i,j,k,ispec)
+                xiyl = xiy(i,j,k,ispec)
+                xizl = xiz(i,j,k,ispec)
+                etaxl = etax(i,j,k,ispec)
+                etayl = etay(i,j,k,ispec)
+                etazl = etaz(i,j,k,ispec)
+                gammaxl = gammax(i,j,k,ispec)
+                gammayl = gammay(i,j,k,ispec)
+                gammazl = gammaz(i,j,k,ispec)
+                jacobianl = jacobian(i,j,k,ispec)
+
+                duxdxl = xixl*tempx1(i,j,k,thread_id) + etaxl*tempx2(i,j,k,thread_id) + gammaxl*tempx3(i,j,k,thread_id)
+                duxdyl = xiyl*tempx1(i,j,k,thread_id) + etayl*tempx2(i,j,k,thread_id) + gammayl*tempx3(i,j,k,thread_id)
+                duxdzl = xizl*tempx1(i,j,k,thread_id) + etazl*tempx2(i,j,k,thread_id) + gammazl*tempx3(i,j,k,thread_id)
+
+                duydxl = xixl*tempy1(i,j,k,thread_id) + etaxl*tempy2(i,j,k,thread_id) + gammaxl*tempy3(i,j,k,thread_id)
+                duydyl = xiyl*tempy1(i,j,k,thread_id) + etayl*tempy2(i,j,k,thread_id) + gammayl*tempy3(i,j,k,thread_id)
+                duydzl = xizl*tempy1(i,j,k,thread_id) + etazl*tempy2(i,j,k,thread_id) + gammazl*tempy3(i,j,k,thread_id)
+
+                duzdxl = xixl*tempz1(i,j,k,thread_id) + etaxl*tempz2(i,j,k,thread_id) + gammaxl*tempz3(i,j,k,thread_id)
+                duzdyl = xiyl*tempz1(i,j,k,thread_id) + etayl*tempz2(i,j,k,thread_id) + gammayl*tempz3(i,j,k,thread_id)
+                duzdzl = xizl*tempz1(i,j,k,thread_id) + etazl*tempz2(i,j,k,thread_id) + gammazl*tempz3(i,j,k,thread_id)
+
+                ! save strain on the Moho boundary
+                if (SAVE_MOHO_MESH ) then
+                   if (is_moho_top(ispec)) then
+                      dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                      dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                      dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                      dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                      dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                      dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                      dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                      dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                      dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                   else if (is_moho_bot(ispec)) then
+                      dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                      dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                      dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                      dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                      dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                      dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                      dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                      dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                      dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                   endif
+                endif
+
+                ! precompute some sums to save CPU time
+                duxdxl_plus_duydyl = duxdxl + duydyl
+                duxdxl_plus_duzdzl = duxdxl + duzdzl
+                duydyl_plus_duzdzl = duydyl + duzdzl
+                duxdyl_plus_duydxl = duxdyl + duydxl
+                duzdxl_plus_duxdzl = duzdxl + duxdzl
+                duzdyl_plus_duydzl = duzdyl + duydzl
+                
+                if( ATTENUATION .and. COMPUTE_AND_STORE_STRAIN ) then
+                   ! temporary variables used for fixing attenuation in a consistent way
+                   duxdxl_att = xixl*tempx1l_att(i,j,k,thread_id) + etaxl*tempx2l_att(i,j,k,thread_id) + &
+                        gammaxl*tempx3l_att(i,j,k,thread_id)
+                   duxdyl_att = xiyl*tempx1l_att(i,j,k,thread_id) + etayl*tempx2l_att(i,j,k,thread_id) + &
+                        gammayl*tempx3l_att(i,j,k,thread_id)
+                   duxdzl_att = xizl*tempx1l_att(i,j,k,thread_id) + etazl*tempx2l_att(i,j,k,thread_id) + &
+                        gammazl*tempx3l_att(i,j,k,thread_id)
+
+                   duydxl_att = xixl*tempy1l_att(i,j,k,thread_id) + etaxl*tempy2l_att(i,j,k,thread_id) + &
+                        gammaxl*tempy3l_att(i,j,k,thread_id)
+                   duydyl_att = xiyl*tempy1l_att(i,j,k,thread_id) + etayl*tempy2l_att(i,j,k,thread_id) + &
+                        gammayl*tempy3l_att(i,j,k,thread_id)
+                   duydzl_att = xizl*tempy1l_att(i,j,k,thread_id) + etazl*tempy2l_att(i,j,k,thread_id) + &
+                        gammazl*tempy3l_att(i,j,k,thread_id)
+
+                   duzdxl_att = xixl*tempz1l_att(i,j,k,thread_id) + etaxl*tempz2l_att(i,j,k,thread_id) + &
+                        gammaxl*tempz3l_att(i,j,k,thread_id)
+                   duzdyl_att = xiyl*tempz1l_att(i,j,k,thread_id) + etayl*tempz2l_att(i,j,k,thread_id) + &
+                        gammayl*tempz3l_att(i,j,k,thread_id)
+                   duzdzl_att = xizl*tempz1l_att(i,j,k,thread_id) + etazl*tempz2l_att(i,j,k,thread_id) + &
+                        gammazl*tempz3l_att(i,j,k,thread_id)
+
+                   ! precompute some sums to save CPU time
+                   duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att
+                   duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att
+                   duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att
+
+                   ! compute deviatoric strain
+                   if( SIMULATION_TYPE == 3 ) &
+                        epsilon_trace_over_3(i,j,k,ispec) = ONE_THIRD * (duxdxl_att + duydyl_att + duzdzl_att)
+                   epsilondev_xx_loc(i,j,k) = duxdxl_att - epsilon_trace_over_3(i,j,k,ispec)
+                   epsilondev_yy_loc(i,j,k) = duydyl_att - epsilon_trace_over_3(i,j,k,ispec)
+                   epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl_att
+                   epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl_att
+                   epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl_att
+                else
+                   ! computes deviatoric strain attenuation and/or for kernel calculations
+                   if (COMPUTE_AND_STORE_STRAIN) then
+                      templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                      if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                      epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                      epsilondev_yy_loc(i,j,k) = duydyl - templ
+                      epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                      epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                      epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+                   endif
+                endif
+
+                kappal = kappastore(i,j,k,ispec)
+                mul = mustore(i,j,k,ispec)
+
+                ! attenuation
+                if(ATTENUATION) then
+                   ! use unrelaxed parameters if attenuation
+                   mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+                endif
+
+                ! full anisotropic case, stress calculations
+                if(ANISOTROPY) then
+                   c11 = c11store(i,j,k,ispec)
+                   c12 = c12store(i,j,k,ispec)
+                   c13 = c13store(i,j,k,ispec)
+                   c14 = c14store(i,j,k,ispec)
+                   c15 = c15store(i,j,k,ispec)
+                   c16 = c16store(i,j,k,ispec)
+                   c22 = c22store(i,j,k,ispec)
+                   c23 = c23store(i,j,k,ispec)
+                   c24 = c24store(i,j,k,ispec)
+                   c25 = c25store(i,j,k,ispec)
+                   c26 = c26store(i,j,k,ispec)
+                   c33 = c33store(i,j,k,ispec)
+                   c34 = c34store(i,j,k,ispec)
+                   c35 = c35store(i,j,k,ispec)
+                   c36 = c36store(i,j,k,ispec)
+                   c44 = c44store(i,j,k,ispec)
+                   c45 = c45store(i,j,k,ispec)
+                   c46 = c46store(i,j,k,ispec)
+                   c55 = c55store(i,j,k,ispec)
+                   c56 = c56store(i,j,k,ispec)
+                   c66 = c66store(i,j,k,ispec)
+
+                   sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                        c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                   sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                        c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                   sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                        c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                   sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                        c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                   sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                        c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                   sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                        c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+                else
+
+                   ! isotropic case
+                   lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                   lambdal = lambdalplus2mul - 2.*mul
+
+                   ! compute stress sigma
+                   sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                   sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                   sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                   sigma_xy = mul*duxdyl_plus_duydxl
+                   sigma_xz = mul*duzdxl_plus_duxdzl
+                   sigma_yz = mul*duzdyl_plus_duydzl
+
+                endif ! ANISOTROPY
+
+                ! subtract memory variables if attenuation
+                if(ATTENUATION) then
+                   ! way 1
+                   !                do i_sls = 1,N_SLS
+                   !                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+                   !                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+                   !                  sigma_xx = sigma_xx - R_xx_val
+                   !                  sigma_yy = sigma_yy - R_yy_val
+                   !                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+                   !                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                   !                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                   !                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                   !                enddo
+
+                   ! way 2
+                   ! note: this should help compilers to pipeline the code and make better use of the cache;
+                   !          depending on compilers, it can further decrease the computation time by ~ 30%.
+                   !          by default, N_SLS = 3, therefore we take steps of 3
+                   if(imodulo_N_SLS >= 1) then
+                      do i_sls = 1,imodulo_N_SLS
+                         R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                         R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                         sigma_xx = sigma_xx - R_xx_val1
+                         sigma_yy = sigma_yy - R_yy_val1
+                         sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                      enddo
+                   endif
+
+                   if(N_SLS >= imodulo_N_SLS+1) then
+                      do i_sls = imodulo_N_SLS+1,N_SLS,3
+                         R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                         R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                         sigma_xx = sigma_xx - R_xx_val1
+                         sigma_yy = sigma_yy - R_yy_val1
+                         sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                         R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                         R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                         sigma_xx = sigma_xx - R_xx_val2
+                         sigma_yy = sigma_yy - R_yy_val2
+                         sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                         R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                         R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                         sigma_xx = sigma_xx - R_xx_val3
+                         sigma_yy = sigma_yy - R_yy_val3
+                         sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                      enddo
+                   endif
+
+
+                endif
+
+                ! form dot product with test vector, symmetric form
+                tempx1(i,j,k,thread_id) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl)
+                tempy1(i,j,k,thread_id) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl)
+                tempz1(i,j,k,thread_id) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
+
+                tempx2(i,j,k,thread_id) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl)
+                tempy2(i,j,k,thread_id) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl)
+                tempz2(i,j,k,thread_id) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl)
+
+                tempx3(i,j,k,thread_id) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl)
+                tempy3(i,j,k,thread_id) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl)
+                tempz3(i,j,k,thread_id) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl)
+
+             enddo
+          enddo
+       enddo
+
+       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+       ! for incompressible fluid flow, Cambridge University Press (2002),
+       ! pages 386 and 389 and Figure 8.3.1
+       ! call mxm_m1_m2_5points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+       do j=1,m2
+          do i=1,m1
+             newtempx1(i,j,1,thread_id) = &
+                  hprimewgll_xxT(i,1)*tempx1(1,j,1,thread_id) + &
+                  hprimewgll_xxT(i,2)*tempx1(2,j,1,thread_id) + &
+                  hprimewgll_xxT(i,3)*tempx1(3,j,1,thread_id) + &
+                  hprimewgll_xxT(i,4)*tempx1(4,j,1,thread_id) + &
+                  hprimewgll_xxT(i,5)*tempx1(5,j,1,thread_id)
+             newtempy1(i,j,1,thread_id) = &
+                  hprimewgll_xxT(i,1)*tempy1(1,j,1,thread_id) + &
+                  hprimewgll_xxT(i,2)*tempy1(2,j,1,thread_id) + &
+                  hprimewgll_xxT(i,3)*tempy1(3,j,1,thread_id) + &
+                  hprimewgll_xxT(i,4)*tempy1(4,j,1,thread_id) + &
+                  hprimewgll_xxT(i,5)*tempy1(5,j,1,thread_id)
+             newtempz1(i,j,1,thread_id) = &
+                  hprimewgll_xxT(i,1)*tempz1(1,j,1,thread_id) + &
+                  hprimewgll_xxT(i,2)*tempz1(2,j,1,thread_id) + &
+                  hprimewgll_xxT(i,3)*tempz1(3,j,1,thread_id) + &
+                  hprimewgll_xxT(i,4)*tempz1(4,j,1,thread_id) + &
+                  hprimewgll_xxT(i,5)*tempz1(5,j,1,thread_id)
+          enddo
+       enddo
+
+       !   call mxm_m1_m1_5points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+       !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+       do i=1,m1
+          do j=1,m1
+             ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+             do k = 1,NGLLX
+                newtempx2(i,j,k,thread_id) = tempx2(i,1,k,thread_id)*hprimewgll_xx(1,j) + &
+                     tempx2(i,2,k,thread_id)*hprimewgll_xx(2,j) + &
+                     tempx2(i,3,k,thread_id)*hprimewgll_xx(3,j) + &
+                     tempx2(i,4,k,thread_id)*hprimewgll_xx(4,j) + &
+                     tempx2(i,5,k,thread_id)*hprimewgll_xx(5,j)
+                newtempy2(i,j,k,thread_id) = tempy2(i,1,k,thread_id)*hprimewgll_xx(1,j) + &
+                     tempy2(i,2,k,thread_id)*hprimewgll_xx(2,j) + &
+                     tempy2(i,3,k,thread_id)*hprimewgll_xx(3,j) + &
+                     tempy2(i,4,k,thread_id)*hprimewgll_xx(4,j) + &
+                     tempy2(i,5,k,thread_id)*hprimewgll_xx(5,j)
+                newtempz2(i,j,k,thread_id) = tempz2(i,1,k,thread_id)*hprimewgll_xx(1,j) + &
+                     tempz2(i,2,k,thread_id)*hprimewgll_xx(2,j) + &
+                     tempz2(i,3,k,thread_id)*hprimewgll_xx(3,j) + &
+                     tempz2(i,4,k,thread_id)*hprimewgll_xx(4,j) + &
+                     tempz2(i,5,k,thread_id)*hprimewgll_xx(5,j)
+             enddo
+          enddo
+       enddo
+
+       ! call mxm_m2_m1_5points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+       do j=1,m1
+          do i=1,m2
+             newtempx3(i,1,j,thread_id) = &
+                  tempx3(i,1,1,thread_id)*hprimewgll_xx(1,j) + &
+                  tempx3(i,1,2,thread_id)*hprimewgll_xx(2,j) + &
+                  tempx3(i,1,3,thread_id)*hprimewgll_xx(3,j) + &
+                  tempx3(i,1,4,thread_id)*hprimewgll_xx(4,j) + &
+                  tempx3(i,1,5,thread_id)*hprimewgll_xx(5,j)
+             newtempy3(i,1,j,thread_id) = &
+                  tempy3(i,1,1,thread_id)*hprimewgll_xx(1,j) + &
+                  tempy3(i,1,2,thread_id)*hprimewgll_xx(2,j) + &
+                  tempy3(i,1,3,thread_id)*hprimewgll_xx(3,j) + &
+                  tempy3(i,1,4,thread_id)*hprimewgll_xx(4,j) + &
+                  tempy3(i,1,5,thread_id)*hprimewgll_xx(5,j)
+             newtempz3(i,1,j,thread_id) = &
+                  tempz3(i,1,1,thread_id)*hprimewgll_xx(1,j) + &
+                  tempz3(i,1,2,thread_id)*hprimewgll_xx(2,j) + &
+                  tempz3(i,1,3,thread_id)*hprimewgll_xx(3,j) + &
+                  tempz3(i,1,4,thread_id)*hprimewgll_xx(4,j) + &
+                  tempz3(i,1,5,thread_id)*hprimewgll_xx(5,j)
+          enddo
+       enddo
+
+       do k=1,NGLLZ
+          do j=1,NGLLY
+             do i=1,NGLLX
+
+                fac1 = wgllwgll_yz(j,k)
+                fac2 = wgllwgll_xz(i,k)
+                fac3 = wgllwgll_xy(i,j)
+
+                ! sum contributions from each element to the global mesh using indirect addressing
+                iglob = ibool(i,j,k,ispec)
+
+                ! accel_omp(1,iglob,thread_id) = accel_omp(1,iglob,thread_id)&
+                !      - fac1*newtempx1(i,j,k,thread_id) - fac2*newtempx2(i,j,k,thread_id)&
+                !      - fac3*newtempx3(i,j,k,thread_id)
+                ! accel_omp(2,iglob,thread_id) = accel_omp(2,iglob,thread_id)&
+                !      - fac1*newtempy1(i,j,k,thread_id) - fac2*newtempy2(i,j,k,thread_id)&
+                !      - fac3*newtempy3(i,j,k,thread_id)
+                ! accel_omp(3,iglob,thread_id) = accel_omp(3,iglob,thread_id)&
+                !      - fac1*newtempz1(i,j,k,thread_id) - fac2*newtempz2(i,j,k,thread_id)&
+                !      - fac3*newtempz3(i,j,k,thread_id)
+
+                ! Assembly of shared degrees of freedom fixed through mesh coloring
+                !! !$OMP ATOMIC
+                accel(1,iglob) = accel(1,iglob) &
+                                  - (fac1*newtempx1(i,j,k,thread_id) &
+                                   + fac2*newtempx2(i,j,k,thread_id) &
+                                   + fac3*newtempx3(i,j,k,thread_id))
+                !! !$OMP ATOMIC
+                accel(2,iglob) = accel(2,iglob) &
+                                  - (fac1*newtempy1(i,j,k,thread_id) &
+                                   + fac2*newtempy2(i,j,k,thread_id) &
+                                   + fac3*newtempy3(i,j,k,thread_id))
+                !! !$OMP ATOMIC
+                accel(3,iglob) = accel(3,iglob) &
+                                  - (fac1*newtempz1(i,j,k,thread_id) &
+                                   + fac2*newtempz2(i,j,k,thread_id) &
+                                   + fac3*newtempz3(i,j,k,thread_id))
+
+                ! accel(1,iglob) = accel(1,iglob) - &
+                ! (fac1*newtempx1(i,j,k,thread_id) + fac2*newtempx2(i,j,k,thread_id) + fac3*newtempx3(i,j,k,thread_id))
+                ! accel(2,iglob) = accel(2,iglob) - &
+                ! (fac1*newtempy1(i,j,k,thread_id) + fac2*newtempy2(i,j,k,thread_id) + fac3*newtempy3(i,j,k,thread_id))
+                ! accel(3,iglob) = accel(3,iglob) - &
+                ! (fac1*newtempz1(i,j,k,thread_id) + fac2*newtempz2(i,j,k,thread_id) + fac3*newtempz3(i,j,k,thread_id))
+
+                ! accel_omp(1,iglob,thread_id) = accel_omp(1,iglob,thread_id) - fac1*newtempx1(i,j,k,thread_id) - &
+                !                   fac2*newtempx2(i,j,k,thread_id) - fac3*newtempx3(i,j,k,thread_id)
+                ! accel_omp(2,iglob,thread_id) = accel_omp(2,iglob,thread_id) - fac1*newtempy1(i,j,k,thread_id) - &
+                !                   fac2*newtempy2(i,j,k,thread_id) - fac3*newtempy3(i,j,k,thread_id)
+                ! accel_omp(3,iglob,thread_id) = accel_omp(3,iglob,thread_id) - fac1*newtempz1(i,j,k,thread_id) - &
+                !      fac2*newtempz2(i,j,k,thread_id) - fac3*newtempz3(i,j,k,thread_id)
+
+                !  update memory variables based upon the Runge-Kutta scheme
+                if(ATTENUATION) then
+
+                   ! use Runge-Kutta scheme to march in time
+                   do i_sls = 1,N_SLS
+
+                      factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                      alphaval_loc = alphaval(i_sls)
+                      betaval_loc = betaval(i_sls)
+                      gammaval_loc = gammaval(i_sls)
+
+                      ! term in xx
+                      Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                      Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                      R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                           betaval_loc * Sn + gammaval_loc * Snp1
+                      ! term in yy
+                      Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                      Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                      R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                           betaval_loc * Sn + gammaval_loc * Snp1
+                      ! term in zz not computed since zero trace
+                      ! term in xy
+                      Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                      Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                      R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                           betaval_loc * Sn + gammaval_loc * Snp1
+                      ! term in xz
+                      Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                      Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                      R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                           betaval_loc * Sn + gammaval_loc * Snp1
+                      ! term in yz
+                      Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                      Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                      R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                           betaval_loc * Sn + gammaval_loc * Snp1
+
+                   enddo   ! end of loop on memory variables
+
+                endif  !  end attenuation
+
+             enddo
+          enddo
+       enddo
+
+       ! save deviatoric strain for Runge-Kutta scheme
+       if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+       endif
+
+    enddo  ! spectral element loop
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    ! The elements are in order of color. First we do color 1 elements,
+    ! then color 2, etc. The ispec has to moved to start at the next
+    ! color.
+    estart = estart + num_elements
+
+  enddo ! loop over colors
+
+
+  ! "stop" timer
+  ! end_time = omp_get_wtime()
+
+  ! write(*,*) "Total Elapsed time: ", (end_time-start_time) , "seconds. (Threads=",NUM_THREADS,")"
+  ! write(*,*) "Accumulate Elapsed time: ", (accumulate_time_stop-accumulate_time_start) , "seconds"
+
+
+  ! These are now allocated at the beginning and never deallocated
+  ! because the program just finishes at the end.
+
+  ! deallocate(dummyx_loc)
+  ! deallocate(dummyy_loc)
+  ! deallocate(dummyz_loc)
+  ! deallocate(dummyx_loc_att)
+  ! deallocate(dummyy_loc_att)
+  ! deallocate(dummyz_loc_att)
+  ! deallocate(newtempx1)
+  ! deallocate(newtempx2)
+  ! deallocate(newtempx3)
+  ! deallocate(newtempy1)
+  ! deallocate(newtempy2)
+  ! deallocate(newtempy3)
+  ! deallocate(newtempz1)
+  ! deallocate(newtempz2)
+  ! deallocate(newtempz3)
+  ! deallocate(tempx1)
+  ! deallocate(tempx2)
+  ! deallocate(tempx3)
+  ! deallocate(tempy1)
+  ! deallocate(tempy2)
+  ! deallocate(tempy3)
+  ! deallocate(tempz1)
+  ! deallocate(tempz2)
+  ! deallocate(tempz3)
+  ! deallocate(tempx1_att)
+  ! deallocate(tempx2_att)
+  ! deallocate(tempx3_att)
+  ! deallocate(tempy1_att)
+  ! deallocate(tempy2_att)
+  ! deallocate(tempy3_att)
+  ! deallocate(tempz1_att)
+  ! deallocate(tempz2_att)
+  ! deallocate(tempz3_att)
+
+  ! accel(:,:) = accel_omp(:,:,1)
+
+  end subroutine compute_forces_viscoelastic_Dev_openmp
+
+

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -1360,7 +1360,7 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-! OpenMP version uses "special" compute_forces_elastic_Dev routine
+! OpenMP version uses "special" compute_forces_viscoelastic_Dev routine
 ! we need to set num_elem_colors_elastic arrays
 
 #ifdef OPENMP_MODE

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -172,7 +172,7 @@
     if( ier /= 0 ) stop 'error allocating array c11store etc.'
 
     ! note: currently, they need to be defined, as they are used in the routine arguments
-    !          for compute_forces_elastic_Deville()
+    !          for compute_forces_viscoelastic_Deville()
     allocate(R_xx(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
             R_yy(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
             R_xy(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90	2013-01-16 18:04:01 UTC (rev 21239)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90	2013-01-16 18:19:52 UTC (rev 21240)
@@ -728,7 +728,7 @@
 
 ! note:
 ! computes adjoint sources in chunks/blocks during time iterations.
-! we moved it to compute_add_sources_elastic.f90 & compute_add_sources_acoustic.f90,
+! we moved it to compute_add_sources_viscoelastic.f90 & compute_add_sources_acoustic.f90,
 ! because we may need to read in adjoint sources block by block
 
     ! initializes adjoint sources