[cig-commits] r16128 - seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN

Thu Jan 7 10:07:20 PST 2010

Author: pieyre
Date: 2010-01-07 10:07:20 -0800 (Thu, 07 Jan 2010)
New Revision: 16128

Added:
   seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/specfem3D_no_Deville.f90
Modified:
   seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/constants.h.in
   seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/specfem3D.f90
Log:
added Deville et al. (2002) matrix products for forward and backward calculations

Modified: seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/constants.h.in
===================================================================

--- seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/constants.h.in	2010-01-07 14:04:22 UTC (rev 16127)
+++ seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/constants.h.in	2010-01-07 18:07:20 UTC (rev 16128)
@@ -47,11 +47,17 @@
 ! apply heuristic rule to modify doubling regions to balance angles
   logical, parameter :: APPLY_HEURISTIC_RULE = .true.
 
+! use inlined products of Deville et al. (2002) to speedup the calculations to compute internal forces
+!  logical, parameter :: USE_DEVILLE_PRODUCTS = .true.
+
 ! number of GLL points in each direction of an element (degree plus one)
   integer, parameter :: NGLLX = 5
   integer, parameter :: NGLLY = NGLLX
   integer, parameter :: NGLLZ = NGLLX
 
+! for optimized routines by Deville et al. (2002)
+  integer, parameter :: m1 = NGLLX, m2 = NGLLX * NGLLY
+
 ! input, output and main MPI I/O files
   integer, parameter :: ISTANDARD_OUTPUT = 6
   integer, parameter :: IIN = 40,IOUT = 41

Modified: seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/specfem3D.f90
===================================================================
--- seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/specfem3D.f90	2010-01-07 14:04:22 UTC (rev 16127)
+++ seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/specfem3D.f90	2010-01-07 18:07:20 UTC (rev 16128)
@@ -287,13 +287,57 @@
   real(kind=CUSTOM_REAL) lambdal,kappal,mul,lambdalplus2mul
   real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
 
-  real(kind=CUSTOM_REAL) tempx1l,tempx2l,tempx3l
-  real(kind=CUSTOM_REAL) tempy1l,tempy2l,tempy3l
-  real(kind=CUSTOM_REAL) tempz1l,tempz2l,tempz3l
+!  real(kind=CUSTOM_REAL) tempx1l,tempx2l,tempx3l
+!  real(kind=CUSTOM_REAL) tempy1l,tempy2l,tempy3l
+!  real(kind=CUSTOM_REAL) tempz1l,tempz2l,tempz3l
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
 
+!pll
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+
+   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
+    tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l 
+
+! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1_m1_m2_5points,B2_m1_m2_5points,B3_m1_m2_5points
+  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points,C2_m1_m2_5points,C3_m1_m2_5points
+  real(kind=CUSTOM_REAL), dimension(m1,m2) :: D1_m1_m2_5points,D2_m1_m2_5points,D3_m1_m2_5points
+  real(kind=CUSTOM_REAL), dimension(m1,m2) :: E1_m1_m2_5points,E2_m1_m2_5points,E3_m1_m2_5points
+
+  equivalence(dummyx_loc,B1_m1_m2_5points)
+  equivalence(dummyy_loc,B2_m1_m2_5points)
+  equivalence(dummyz_loc,B3_m1_m2_5points)
+  equivalence(tempx1,C1_m1_m2_5points)
+  equivalence(tempy1,C2_m1_m2_5points)
+  equivalence(tempz1,C3_m1_m2_5points)
+  equivalence(tempx1l,D1_m1_m2_5points)
+  equivalence(tempy1l,D2_m1_m2_5points)
+  equivalence(tempz1l,D3_m1_m2_5points)
+  equivalence(newtempx1,E1_m1_m2_5points)
+  equivalence(newtempy1,E2_m1_m2_5points)
+  equivalence(newtempz1,E3_m1_m2_5points)
+
+  real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: A1_mxm_m2_m1_5points,A2_mxm_m2_m1_5points,A3_mxm_m2_m1_5points
+  real(kind=CUSTOM_REAL), dimension(m2,m1) :: C1_mxm_m2_m1_5points,C2_mxm_m2_m1_5points,C3_mxm_m2_m1_5points
+  real(kind=CUSTOM_REAL), dimension(m2,m1) :: D1_mxm_m2_m1_5points,D2_mxm_m2_m1_5points,D3_mxm_m2_m1_5points
+  real(kind=CUSTOM_REAL), dimension(m2,m1) :: E1_mxm_m2_m1_5points,E2_mxm_m2_m1_5points,E3_mxm_m2_m1_5points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_5points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_5points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_5points)
+  equivalence(tempx3,C1_mxm_m2_m1_5points)
+  equivalence(tempy3,C2_mxm_m2_m1_5points)
+  equivalence(tempz3,C3_mxm_m2_m1_5points)
+  equivalence(tempx3l,D1_mxm_m2_m1_5points)
+  equivalence(tempy3l,D2_mxm_m2_m1_5points)
+  equivalence(tempz3l,D3_mxm_m2_m1_5points) 
+  equivalence(newtempx3,E1_mxm_m2_m1_5points)
+  equivalence(newtempy3,E2_mxm_m2_m1_5points)
+  equivalence(newtempz3,E3_mxm_m2_m1_5points)
+
 ! time scheme
   real(kind=CUSTOM_REAL) deltat,deltatover2,deltatsqover2
 
@@ -308,15 +352,29 @@
   real(kind=CUSTOM_REAL) b_duxdyl_plus_duydxl,b_duzdxl_plus_duxdzl,b_duzdyl_plus_duydzl
   real(kind=CUSTOM_REAL) b_dsxx,b_dsxy,b_dsxz,b_dsyy,b_dsyz,b_dszz
   real(kind=CUSTOM_REAL) b_sigma_xx,b_sigma_yy,b_sigma_zz,b_sigma_xy,b_sigma_xz,b_sigma_yz
-  real(kind=CUSTOM_REAL) b_tempx1l,b_tempx2l,b_tempx3l
-  real(kind=CUSTOM_REAL) b_tempy1l,b_tempy2l,b_tempy3l
-  real(kind=CUSTOM_REAL) b_tempz1l,b_tempz2l,b_tempz3l
+!   real(kind=CUSTOM_REAL) b_tempx1l,b_tempx2l,b_tempx3l
+!   real(kind=CUSTOM_REAL) b_tempy1l,b_tempy2l,b_tempy3l
+!   real(kind=CUSTOM_REAL) b_tempz1l,b_tempz2l,b_tempz3l
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     b_tempx1,b_tempx2,b_tempx3,b_tempy1,b_tempy2,b_tempy3,b_tempz1,b_tempz2,b_tempz3
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: absorb_xmin, absorb_xmax, &
     absorb_ymin, absorb_ymax, absorb_zmin ! for absorbing b.c.
   integer reclen_xmin, reclen_xmax, reclen_ymin, reclen_ymax, reclen_zmin, reclen1, reclen2
   real(kind=CUSTOM_REAL) b_deltat, b_deltatover2, b_deltatsqover2
+
+! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(m1,m2) :: b_C1_m1_m2_5points,b_C2_m1_m2_5points,b_C3_m1_m2_5points
+
+  equivalence(b_tempx1,b_C1_m1_m2_5points)
+  equivalence(b_tempy1,b_C2_m1_m2_5points)
+  equivalence(b_tempz1,b_C3_m1_m2_5points)
+
+  real(kind=CUSTOM_REAL), dimension(m2,m1) :: b_C1_mxm_m2_m1_5points,b_C2_mxm_m2_m1_5points,b_C3_mxm_m2_m1_5points
+
+  equivalence(b_tempx3,b_C1_mxm_m2_m1_5points)
+  equivalence(b_tempy3,b_C2_mxm_m2_m1_5points)
+  equivalence(b_tempz3,b_C3_mxm_m2_m1_5points)
+
 ! ADJOINT
 
 ! for attenuation
@@ -400,9 +458,12 @@
   double precision, dimension(NGLLZ) :: zigll,wzgll
 
 ! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
-  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprime_xxT
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy
+  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprimewgll_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprimewgll_yy
+  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprimewgll_zz
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
   real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
@@ -659,6 +720,14 @@
          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz)
 
+! define transpose of derivation matrix
+  do j = 1,NGLLY
+    do i = 1,NGLLX
+      hprime_xxT(j,i) = hprime_xx(i,j)
+      hprimewgll_xxT(j,i) = hprimewgll_xx(i,j)
+    enddo
+  enddo
+
 ! allocate 1-D Lagrange interpolators and derivatives
   allocate(hxir(NGLLX))
   allocate(hpxir(NGLLX))
@@ -1562,182 +1631,234 @@
 
   do ispec = 1,NSPEC_AB
 
-    if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
-      if (is_moho_top(ispec)) then
-        ispec2D_moho_top = ispec2D_moho_top + 1
-      else if (is_moho_bot(ispec)) then
-        ispec2D_moho_bot = ispec2D_moho_bot + 1
-      endif
-    endif
+     if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
+        if (is_moho_top(ispec)) then
+           ispec2D_moho_top = ispec2D_moho_top + 1
+        else if (is_moho_bot(ispec)) then
+           ispec2D_moho_bot = ispec2D_moho_bot + 1
+        endif
+     endif
 
 !---------------------------------------------------------------------------------------------------
 ! beginning of nested loops on i,j,k to perform the forward calculations in a given element (ispec)
 !---------------------------------------------------------------------------------------------------
 
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
+     do k=1,NGLLZ
+        do j=1,NGLLY
+           do i=1,NGLLX
+              iglob = ibool(i,j,k,ispec)
+              dummyx_loc(i,j,k) = displ(1,iglob)
+              dummyy_loc(i,j,k) = displ(2,iglob)
+              dummyz_loc(i,j,k) = displ(3,iglob)
+           enddo
+        enddo
+     enddo
+     
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+    ! call mxm_m1_m2_5points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+     do j=1,m2
+        do i=1,m1
+           D1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
+                hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
+                hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
+                hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
+                hprime_xx(i,5)*B1_m1_m2_5points(5,j)
 
-          tempx1l = 0.
-          tempx2l = 0.
-          tempx3l = 0.
+           D2_m1_m2_5points(i,j) = hprime_xx(i,1)*B2_m1_m2_5points(1,j) + &
+                hprime_xx(i,2)*B2_m1_m2_5points(2,j) + &
+                hprime_xx(i,3)*B2_m1_m2_5points(3,j) + &
+                hprime_xx(i,4)*B2_m1_m2_5points(4,j) + &
+                hprime_xx(i,5)*B2_m1_m2_5points(5,j)
 
-          tempy1l = 0.
-          tempy2l = 0.
-          tempy3l = 0.
+           D3_m1_m2_5points(i,j) = hprime_xx(i,1)*B3_m1_m2_5points(1,j) + &
+                hprime_xx(i,2)*B3_m1_m2_5points(2,j) + &
+                hprime_xx(i,3)*B3_m1_m2_5points(3,j) + &
+                hprime_xx(i,4)*B3_m1_m2_5points(4,j) + &
+                hprime_xx(i,5)*B3_m1_m2_5points(5,j)
+        enddo
+     enddo
 
-          tempz1l = 0.
-          tempz2l = 0.
-          tempz3l = 0.
+    !   call mxm_m1_m1_5points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+    !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+     do j=1,m1
+        do i=1,m1
+    ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+           do k = 1,NGLLX
+              tempx2l(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                   dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                   dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                   dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                   dummyx_loc(i,5,k)*hprime_xxT(5,j)
+              
+              tempy2l(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                   dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                   dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                   dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                   dummyy_loc(i,5,k)*hprime_xxT(5,j)
 
-          do l=1,NGLLX
-            hp1 = hprime_xx(i,l)
-            iglob = ibool(l,j,k,ispec)
-            tempx1l = tempx1l + displ(1,iglob)*hp1
-            tempy1l = tempy1l + displ(2,iglob)*hp1
-            tempz1l = tempz1l + displ(3,iglob)*hp1
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+              tempz2l(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                   dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                   dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                   dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                   dummyz_loc(i,5,k)*hprime_xxT(5,j)
+           enddo
+        enddo
+     enddo
 
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            hp2 = hprime_yy(j,l)
-            iglob = ibool(i,l,k,ispec)
-            tempx2l = tempx2l + displ(1,iglob)*hp2
-            tempy2l = tempy2l + displ(2,iglob)*hp2
-            tempz2l = tempz2l + displ(3,iglob)*hp2
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+    ! call mxm_m2_m1_5points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+     do j=1,m1
+        do i=1,m2
+           D1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+                A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+                A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+                A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+                A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
 
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            hp3 = hprime_zz(k,l)
-            iglob = ibool(i,j,l,ispec)
-            tempx3l = tempx3l + displ(1,iglob)*hp3
-            tempy3l = tempy3l + displ(2,iglob)*hp3
-            tempz3l = tempz3l + displ(3,iglob)*hp3
-          enddo
+           D2_mxm_m2_m1_5points(i,j) = A2_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+                A2_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+                A2_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+                A2_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+                A2_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
 
-!         get derivatives of ux, uy and uz with respect to x, y and z
-          xixl = xix(i,j,k,ispec)
-          xiyl = xiy(i,j,k,ispec)
-          xizl = xiz(i,j,k,ispec)
-          etaxl = etax(i,j,k,ispec)
-          etayl = etay(i,j,k,ispec)
-          etazl = etaz(i,j,k,ispec)
-          gammaxl = gammax(i,j,k,ispec)
-          gammayl = gammay(i,j,k,ispec)
-          gammazl = gammaz(i,j,k,ispec)
-          jacobianl = jacobian(i,j,k,ispec)
+           D3_mxm_m2_m1_5points(i,j) = A3_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+                A3_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+                A3_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+                A3_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+                A3_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
+        enddo
+     enddo
 
-          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+     
+     do k=1,NGLLZ
+        do j=1,NGLLY
+           do i=1,NGLLX
 
-          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
-          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
-          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
+!         get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+              
+              duxdxl = xixl*tempx1l(i,j,k) + etaxl*tempx2l(i,j,k) + gammaxl*tempx3l(i,j,k)
+              duxdyl = xiyl*tempx1l(i,j,k) + etayl*tempx2l(i,j,k) + gammayl*tempx3l(i,j,k)
+              duxdzl = xizl*tempx1l(i,j,k) + etazl*tempx2l(i,j,k) + gammazl*tempx3l(i,j,k)
+              
+              duydxl = xixl*tempy1l(i,j,k) + etaxl*tempy2l(i,j,k) + gammaxl*tempy3l(i,j,k)
+              duydyl = xiyl*tempy1l(i,j,k) + etayl*tempy2l(i,j,k) + gammayl*tempy3l(i,j,k)
+              duydzl = xizl*tempy1l(i,j,k) + etazl*tempy2l(i,j,k) + gammazl*tempy3l(i,j,k)
 
-          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
-          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
-          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
+              duzdxl = xixl*tempz1l(i,j,k) + etaxl*tempz2l(i,j,k) + gammaxl*tempz3l(i,j,k)
+              duzdyl = xiyl*tempz1l(i,j,k) + etayl*tempz2l(i,j,k) + gammayl*tempz3l(i,j,k)
+              duzdzl = xizl*tempz1l(i,j,k) + etazl*tempz2l(i,j,k) + gammazl*tempz3l(i,j,k) 
 
 ! precompute some sums to save CPU time
-          duxdxl_plus_duydyl = duxdxl + duydyl
-          duxdxl_plus_duzdzl = duxdxl + duzdzl
-          duydyl_plus_duzdzl = duydyl + duzdzl
-          duxdyl_plus_duydxl = duxdyl + duydxl
-          duzdxl_plus_duxdzl = duzdxl + duxdzl
-          duzdyl_plus_duydzl = duzdyl + duydzl
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
 
 ! precompute terms for attenuation if needed
-  if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+              if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
 
 ! compute deviatoric strain
-    epsilon_trace_over_3 = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-    epsilondev_xx_loc(i,j,k) = duxdxl - epsilon_trace_over_3
-    epsilondev_yy_loc(i,j,k) = duydyl - epsilon_trace_over_3
-    epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-    epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-    epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+                 epsilon_trace_over_3 = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                 epsilondev_xx_loc(i,j,k) = duxdxl - epsilon_trace_over_3
+                 epsilondev_yy_loc(i,j,k) = duydyl - epsilon_trace_over_3
+                 epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                 epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                 epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
 
 ! distinguish attenuation factors
-    if(flag_sediments(i,j,k,ispec)) then
+                 if(flag_sediments(i,j,k,ispec)) then
 
 ! use constant attenuation of Q = 90
 ! or use scaling rule similar to Olsen et al. (2003)
-     if(USE_OLSEN_ATTENUATION) then
-       vs_val = mustore(i,j,k,ispec) / rho_vs(i,j,k,ispec)
+                    if(USE_OLSEN_ATTENUATION) then
+                       vs_val = mustore(i,j,k,ispec) / rho_vs(i,j,k,ispec)
 ! use rule Q_mu = constant * v_s
-       Q_mu = OLSEN_ATTENUATION_RATIO * vs_val
-       int_Q_mu = 10 * nint(Q_mu / 10.)
-       if(int_Q_mu < 40) int_Q_mu = 40
-       if(int_Q_mu > 150) int_Q_mu = 150
+                       Q_mu = OLSEN_ATTENUATION_RATIO * vs_val
+                       int_Q_mu = 10 * nint(Q_mu / 10.)
+                       if(int_Q_mu < 40) int_Q_mu = 40
+                       if(int_Q_mu > 150) int_Q_mu = 150
 
-       if(int_Q_mu == 40) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_40
-       else if(int_Q_mu == 50) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_50
-       else if(int_Q_mu == 60) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_60
-       else if(int_Q_mu == 70) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_70
-       else if(int_Q_mu == 80) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_80
-       else if(int_Q_mu == 90) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_90
-       else if(int_Q_mu == 100) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_100
-       else if(int_Q_mu == 110) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_110
-       else if(int_Q_mu == 120) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_120
-       else if(int_Q_mu == 130) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_130
-       else if(int_Q_mu == 140) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_140
-       else if(int_Q_mu == 150) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_150
-       else
-         stop 'incorrect attenuation coefficient'
-       endif
+                       if(int_Q_mu == 40) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_40
+                       else if(int_Q_mu == 50) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_50
+                       else if(int_Q_mu == 60) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_60
+                       else if(int_Q_mu == 70) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_70
+                       else if(int_Q_mu == 80) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_80
+                       else if(int_Q_mu == 90) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_90
+                       else if(int_Q_mu == 100) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_100
+                       else if(int_Q_mu == 110) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_110
+                       else if(int_Q_mu == 120) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_120
+                       else if(int_Q_mu == 130) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_130
+                       else if(int_Q_mu == 140) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_140
+                       else if(int_Q_mu == 150) then
+                          iattenuation_sediments = IATTENUATION_SEDIMENTS_150
+                       else
+                          stop 'incorrect attenuation coefficient'
+                       endif
 
-     else
-       iattenuation_sediments = IATTENUATION_SEDIMENTS_90
-     endif
+                    else
+                       iattenuation_sediments = IATTENUATION_SEDIMENTS_90
+                    endif
 
-      iselected = iattenuation_sediments
-    else
-      iselected = IATTENUATION_BEDROCK
-    endif
+                    iselected = iattenuation_sediments
+                 else
+                    iselected = IATTENUATION_BEDROCK
+                 endif
 
-    one_minus_sum_beta_use = one_minus_sum_beta(iselected)
-    minus_sum_beta =  one_minus_sum_beta_use - 1.
-
-  endif
-
-    kappal = kappastore(i,j,k,ispec)
-    mul = mustore(i,j,k,ispec)
-
+                 one_minus_sum_beta_use = one_minus_sum_beta(iselected)
+                 minus_sum_beta =  one_minus_sum_beta_use - 1.
+                 
+              endif
+              
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+              
 ! For fully anisotropic case
-    if(ANISOTROPY_VAL) then
-       c11 = c11store(i,j,k,ispec)
-       c12 = c12store(i,j,k,ispec)
-       c13 = c13store(i,j,k,ispec)
-       c14 = c14store(i,j,k,ispec)
-       c15 = c15store(i,j,k,ispec)
-       c16 = c16store(i,j,k,ispec)
-       c22 = c22store(i,j,k,ispec)
-       c23 = c23store(i,j,k,ispec)
-       c24 = c24store(i,j,k,ispec)
-       c25 = c25store(i,j,k,ispec)
-       c26 = c26store(i,j,k,ispec)
-       c33 = c33store(i,j,k,ispec)
-       c34 = c34store(i,j,k,ispec)
-       c35 = c35store(i,j,k,ispec)
-       c36 = c36store(i,j,k,ispec)
-       c44 = c44store(i,j,k,ispec)
-       c45 = c45store(i,j,k,ispec)
-       c46 = c46store(i,j,k,ispec)
-       c55 = c55store(i,j,k,ispec)
-       c56 = c56store(i,j,k,ispec)
-       c66 = c66store(i,j,k,ispec)
+              if(ANISOTROPY_VAL) then
+                 c11 = c11store(i,j,k,ispec)
+                 c12 = c12store(i,j,k,ispec)
+                 c13 = c13store(i,j,k,ispec)
+                 c14 = c14store(i,j,k,ispec)
+                 c15 = c15store(i,j,k,ispec)
+                 c16 = c16store(i,j,k,ispec)
+                 c22 = c22store(i,j,k,ispec)
+                 c23 = c23store(i,j,k,ispec)
+                 c24 = c24store(i,j,k,ispec)
+                 c25 = c25store(i,j,k,ispec)
+                 c26 = c26store(i,j,k,ispec)
+                 c33 = c33store(i,j,k,ispec)
+                 c34 = c34store(i,j,k,ispec)
+                 c35 = c35store(i,j,k,ispec)
+                 c36 = c36store(i,j,k,ispec)
+                 c44 = c44store(i,j,k,ispec)
+                 c45 = c45store(i,j,k,ispec)
+                 c46 = c46store(i,j,k,ispec)
+                 c55 = c55store(i,j,k,ispec)
+                 c56 = c56store(i,j,k,ispec)
+                 c66 = c66store(i,j,k,ispec)
        !if(ATTENUATION_VAL.and. not_fully_in_bedrock(ispec)) then
        !   mul = c44
        !   c11 = c11 + FOUR_THIRDS * minus_sum_beta * mul
@@ -1751,74 +1872,74 @@
        !   c66 = c66 + minus_sum_beta * mul
        !endif
 
-       sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-            c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                 sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                      c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
 
-       sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-            c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                 sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                      c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                 
+                 sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                      c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                 
+                 sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                      c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                 
+                 sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                      c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
 
-       sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-            c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                 sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                      c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
 
-       sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-            c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+              else
 
-       sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-            c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-
-       sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-            c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-    else
-
 ! For isotropic case
 ! use unrelaxed parameters if attenuation
-       if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) mul = mul * one_minus_sum_beta_use
+                 if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) mul = mul * one_minus_sum_beta_use
 
-       lambdalplus2mul = kappal + FOUR_THIRDS * mul
-       lambdal = lambdalplus2mul - 2.*mul
+                 lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                 lambdal = lambdalplus2mul - 2.*mul
 
 ! compute stress sigma
-       sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-       sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-       sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+                 sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                 sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                 sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
 
-       sigma_xy = mul*duxdyl_plus_duydxl
-       sigma_xz = mul*duzdxl_plus_duxdzl
-       sigma_yz = mul*duzdyl_plus_duydzl
+                 sigma_xy = mul*duxdyl_plus_duydxl
+                 sigma_xz = mul*duzdxl_plus_duxdzl
+                 sigma_yz = mul*duzdyl_plus_duydzl
 
-    endif
+              endif
 
 ! subtract memory variables if attenuation
-    if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
-      do i_sls = 1,N_SLS
-        R_xx_val = R_xx(i,j,k,ispec,i_sls)
-        R_yy_val = R_yy(i,j,k,ispec,i_sls)
-        sigma_xx = sigma_xx - R_xx_val
-        sigma_yy = sigma_yy - R_yy_val
-        sigma_zz = sigma_zz + R_xx_val + R_yy_val
-        sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-        sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-        sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-      enddo
-    endif
+              if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+                 do i_sls = 1,N_SLS
+                    R_xx_val = R_xx(i,j,k,ispec,i_sls)
+                    R_yy_val = R_yy(i,j,k,ispec,i_sls)
+                    sigma_xx = sigma_xx - R_xx_val
+                    sigma_yy = sigma_yy - R_yy_val
+                    sigma_zz = sigma_zz + R_xx_val + R_yy_val
+                    sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                    sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                    sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                 enddo
+              endif
 
 ! form dot product with test vector, symmetric form
-      tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl)
-      tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl)
-      tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
+              tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl)
+              tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl)
+              tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
+              
+              tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl)
+              tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl)
+              tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl)
 
-      tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl)
-      tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl)
-      tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl)
+              tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl)
+              tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl)
+              tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl)
 
-      tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl)
-      tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl)
-      tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl)
-
-          enddo
+           enddo
         enddo
-      enddo
+     enddo
 
 !---------------------------------------------------------------------------------------------
 ! end of nested loops on i,j,k to perform the forward calculations in a given element (ispec)
@@ -1828,260 +1949,312 @@
 ! beginning of nested loops on i,j,k to perform the backward calculations in a given element (ispec)
 !----------------------------------------------------------------------------------------------------
 
-    if (SIMULATION_TYPE == 3) then
+     if (SIMULATION_TYPE == 3) then
 
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
+        do k=1,NGLLZ
+           do j=1,NGLLY
+              do i=1,NGLLX
+                 iglob = ibool(i,j,k,ispec)
+                 dummyx_loc(i,j,k) = b_displ(1,iglob)
+                 dummyy_loc(i,j,k) = b_displ(2,iglob)
+                 dummyz_loc(i,j,k) = b_displ(3,iglob)
+              enddo
+           enddo
+        enddo
 
-            b_tempx1l = 0.
-            b_tempx2l = 0.
-            b_tempx3l = 0.
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+    ! call mxm_m1_m2_5points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+           do i=1,m1
+              b_C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
+                   hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
+                   hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
+                   hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
+                   hprime_xx(i,5)*B1_m1_m2_5points(5,j)
 
-            b_tempy1l = 0.
-            b_tempy2l = 0.
-            b_tempy3l = 0.
+              b_C2_m1_m2_5points(i,j) = hprime_xx(i,1)*B2_m1_m2_5points(1,j) + &
+                   hprime_xx(i,2)*B2_m1_m2_5points(2,j) + &
+                   hprime_xx(i,3)*B2_m1_m2_5points(3,j) + &
+                   hprime_xx(i,4)*B2_m1_m2_5points(4,j) + &
+                   hprime_xx(i,5)*B2_m1_m2_5points(5,j)
 
-            b_tempz1l = 0.
-            b_tempz2l = 0.
-            b_tempz3l = 0.
+              b_C3_m1_m2_5points(i,j) = hprime_xx(i,1)*B3_m1_m2_5points(1,j) + &
+                   hprime_xx(i,2)*B3_m1_m2_5points(2,j) + &
+                   hprime_xx(i,3)*B3_m1_m2_5points(3,j) + &
+                   hprime_xx(i,4)*B3_m1_m2_5points(4,j) + &
+                   hprime_xx(i,5)*B3_m1_m2_5points(5,j)
+          enddo
+       enddo
 
-          do l=1,NGLLX
-            hp1 = hprime_xx(i,l)
-            iglob = ibool(l,j,k,ispec)
-            b_tempx1l = b_tempx1l + b_displ(1,iglob)*hp1
-            b_tempy1l = b_tempy1l + b_displ(2,iglob)*hp1
-            b_tempz1l = b_tempz1l + b_displ(3,iglob)*hp1
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+    !   call mxm_m1_m1_5points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+    !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+       do j=1,m1
+          do i=1,m1
+    ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+             do k = 1,NGLLX
+                b_tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                     dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                     dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                     dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                     dummyx_loc(i,5,k)*hprime_xxT(5,j)
 
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            hp2 = hprime_yy(j,l)
-            iglob = ibool(i,l,k,ispec)
-            b_tempx2l = b_tempx2l + b_displ(1,iglob)*hp2
-            b_tempy2l = b_tempy2l + b_displ(2,iglob)*hp2
-            b_tempz2l = b_tempz2l + b_displ(3,iglob)*hp2
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+                b_tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                     dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                     dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                     dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                     dummyy_loc(i,5,k)*hprime_xxT(5,j)
 
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            hp3 = hprime_zz(k,l)
-            iglob = ibool(i,j,l,ispec)
-            b_tempx3l = b_tempx3l + b_displ(1,iglob)*hp3
-            b_tempy3l = b_tempy3l + b_displ(2,iglob)*hp3
-            b_tempz3l = b_tempz3l + b_displ(3,iglob)*hp3
+                b_tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                     dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                     dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                     dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                     dummyz_loc(i,5,k)*hprime_xxT(5,j)
+             enddo
           enddo
+       enddo
 
+    ! call mxm_m2_m1_5points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+       do j=1,m1
+          do i=1,m2
+             b_C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+                  A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+                  A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+                  A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+                  A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
+
+             b_C2_mxm_m2_m1_5points(i,j) = A2_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+                  A2_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+                  A2_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+                  A2_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
+
+             b_C3_mxm_m2_m1_5points(i,j) = A3_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+                  A3_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+                  A3_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+                  A3_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+                  A3_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
+          enddo
+       enddo
+
+
+       do k=1,NGLLZ
+          do j=1,NGLLY
+             do i=1,NGLLX
+
 !         get derivatives of ux, uy and uz with respect to x, y and z
-          xixl = xix(i,j,k,ispec)
-          xiyl = xiy(i,j,k,ispec)
-          xizl = xiz(i,j,k,ispec)
-          etaxl = etax(i,j,k,ispec)
-          etayl = etay(i,j,k,ispec)
-          etazl = etaz(i,j,k,ispec)
-          gammaxl = gammax(i,j,k,ispec)
-          gammayl = gammay(i,j,k,ispec)
-          gammazl = gammaz(i,j,k,ispec)
-          jacobianl = jacobian(i,j,k,ispec)
+                xixl = xix(i,j,k,ispec)
+                xiyl = xiy(i,j,k,ispec)
+                xizl = xiz(i,j,k,ispec)
+                etaxl = etax(i,j,k,ispec)
+                etayl = etay(i,j,k,ispec)
+                etazl = etaz(i,j,k,ispec)
+                gammaxl = gammax(i,j,k,ispec)
+                gammayl = gammay(i,j,k,ispec)
+                gammazl = gammaz(i,j,k,ispec)
+                jacobianl = jacobian(i,j,k,ispec)
 
-          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+                duxdxl = xixl*tempx1l(i,j,k) + etaxl*tempx2l(i,j,k) + gammaxl*tempx3l(i,j,k)
+                duxdyl = xiyl*tempx1l(i,j,k) + etayl*tempx2l(i,j,k) + gammayl*tempx3l(i,j,k)
+                duxdzl = xizl*tempx1l(i,j,k) + etazl*tempx2l(i,j,k) + gammazl*tempx3l(i,j,k)
+          
+                duydxl = xixl*tempy1l(i,j,k) + etaxl*tempy2l(i,j,k) + gammaxl*tempy3l(i,j,k)
+                duydyl = xiyl*tempy1l(i,j,k) + etayl*tempy2l(i,j,k) + gammayl*tempy3l(i,j,k)
+                duydzl = xizl*tempy1l(i,j,k) + etazl*tempy2l(i,j,k) + gammazl*tempy3l(i,j,k)
 
-          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
-          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
-          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
+                duzdxl = xixl*tempz1l(i,j,k) + etaxl*tempz2l(i,j,k) + gammaxl*tempz3l(i,j,k)
+                duzdyl = xiyl*tempz1l(i,j,k) + etayl*tempz2l(i,j,k) + gammayl*tempz3l(i,j,k)
+                duzdzl = xizl*tempz1l(i,j,k) + etazl*tempz2l(i,j,k) + gammazl*tempz3l(i,j,k)  
 
-          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
-          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
-          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
-
 ! save strain on the Moho boundary
-          if (SAVE_MOHO_MESH) then
-            if (is_moho_top(ispec)) then
-              dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-              dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-              dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-              dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-              dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-              dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-              dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-              dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-              dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-            else if (is_moho_bot(ispec)) then
-              dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-              dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-              dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-              dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-              dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-              dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-              dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-              dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-              dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-            endif
-          endif
+                if (SAVE_MOHO_MESH) then
+                   if (is_moho_top(ispec)) then
+                      dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                      dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                      dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                      dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                      dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                      dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                      dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                      dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                      dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                   else if (is_moho_bot(ispec)) then
+                      dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                      dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                      dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                      dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                      dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                      dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                      dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                      dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                      dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                   endif
+                endif
 
 ! precompute some sums to save CPU time
-          duxdxl_plus_duydyl = duxdxl + duydyl
-          duxdxl_plus_duzdzl = duxdxl + duzdzl
-          duydyl_plus_duzdzl = duydyl + duzdzl
-          duxdyl_plus_duydxl = duxdyl + duydxl
-          duzdxl_plus_duxdzl = duzdxl + duxdzl
-          duzdyl_plus_duydzl = duzdyl + duydzl
+                duxdxl_plus_duydyl = duxdxl + duydyl
+                duxdxl_plus_duzdzl = duxdxl + duzdzl
+                duydyl_plus_duzdzl = duydyl + duzdzl
+                duxdyl_plus_duydxl = duxdyl + duydxl
+                duzdxl_plus_duxdzl = duzdxl + duxdzl
+                duzdyl_plus_duydzl = duzdyl + duydzl
+                
+                dsxx = duxdxl
+                dsxy = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+                dsxz = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+                dsyy = duydyl
+                dsyz = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+                dszz = duzdzl
 
-            dsxx = duxdxl
-            dsxy = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-            dsxz = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-            dsyy = duydyl
-            dsyz = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
-            dszz = duzdzl
+                b_duxdxl = xixl*b_tempx1(i,j,k) + etaxl*b_tempx2(i,j,k) + gammaxl*b_tempx3(i,j,k)
+                b_duxdyl = xiyl*b_tempx1(i,j,k) + etayl*b_tempx2(i,j,k) + gammayl*b_tempx3(i,j,k)
+                b_duxdzl = xizl*b_tempx1(i,j,k) + etazl*b_tempx2(i,j,k) + gammazl*b_tempx3(i,j,k)
+                
+                b_duydxl = xixl*b_tempy1(i,j,k) + etaxl*b_tempy2(i,j,k) + gammaxl*b_tempy3(i,j,k)
+                b_duydyl = xiyl*b_tempy1(i,j,k) + etayl*b_tempy2(i,j,k) + gammayl*b_tempy3(i,j,k)
+                b_duydzl = xizl*b_tempy1(i,j,k) + etazl*b_tempy2(i,j,k) + gammazl*b_tempy3(i,j,k)
 
-            b_duxdxl = xixl*b_tempx1l + etaxl*b_tempx2l + gammaxl*b_tempx3l
-            b_duxdyl = xiyl*b_tempx1l + etayl*b_tempx2l + gammayl*b_tempx3l
-            b_duxdzl = xizl*b_tempx1l + etazl*b_tempx2l + gammazl*b_tempx3l
+                b_duzdxl = xixl*b_tempz1(i,j,k) + etaxl*b_tempz2(i,j,k) + gammaxl*b_tempz3(i,j,k)
+                b_duzdyl = xiyl*b_tempz1(i,j,k) + etayl*b_tempz2(i,j,k) + gammayl*b_tempz3(i,j,k)
+                b_duzdzl = xizl*b_tempz1(i,j,k) + etazl*b_tempz2(i,j,k) + gammazl*b_tempz3(i,j,k)
 
-            b_duydxl = xixl*b_tempy1l + etaxl*b_tempy2l + gammaxl*b_tempy3l
-            b_duydyl = xiyl*b_tempy1l + etayl*b_tempy2l + gammayl*b_tempy3l
-            b_duydzl = xizl*b_tempy1l + etazl*b_tempy2l + gammazl*b_tempy3l
+                b_duxdxl_plus_duydyl = b_duxdxl + b_duydyl
+                b_duxdxl_plus_duzdzl = b_duxdxl + b_duzdzl
+                b_duydyl_plus_duzdzl = b_duydyl + b_duzdzl
+                b_duxdyl_plus_duydxl = b_duxdyl + b_duydxl
+                b_duzdxl_plus_duxdzl = b_duzdxl + b_duxdzl
+                b_duzdyl_plus_duydzl = b_duzdyl + b_duydzl
 
-            b_duzdxl = xixl*b_tempz1l + etaxl*b_tempz2l + gammaxl*b_tempz3l
-            b_duzdyl = xiyl*b_tempz1l + etayl*b_tempz2l + gammayl*b_tempz3l
-            b_duzdzl = xizl*b_tempz1l + etazl*b_tempz2l + gammazl*b_tempz3l
+                b_dsxx =  b_duxdxl
+                b_dsxy = 0.5_CUSTOM_REAL * b_duxdyl_plus_duydxl
+                b_dsxz = 0.5_CUSTOM_REAL * b_duzdxl_plus_duxdzl
+                b_dsyy =  b_duydyl
+                b_dsyz = 0.5_CUSTOM_REAL * b_duzdyl_plus_duydzl
+                b_dszz =  b_duzdzl
 
-            b_duxdxl_plus_duydyl = b_duxdxl + b_duydyl
-            b_duxdxl_plus_duzdzl = b_duxdxl + b_duzdzl
-            b_duydyl_plus_duzdzl = b_duydyl + b_duzdzl
-            b_duxdyl_plus_duydxl = b_duxdyl + b_duydxl
-            b_duzdxl_plus_duxdzl = b_duzdxl + b_duxdzl
-            b_duzdyl_plus_duydzl = b_duzdyl + b_duydzl
+                kappa_k = (duxdxl + duydyl + duzdzl) *  (b_duxdxl + b_duydyl + b_duzdzl)
+                mu_k = dsxx * b_dsxx + dsyy * b_dsyy + dszz * b_dszz + &
+                     2 * (dsxy * b_dsxy + dsxz * b_dsxz + dsyz * b_dsyz) - ONE_THIRD * kappa_k
+                kappa_kl(i,j,k,ispec) = kappa_kl(i,j,k,ispec) + deltat * kappa_k
+                mu_kl(i,j,k,ispec) = mu_kl(i,j,k,ispec) + 2 * deltat * mu_k
 
-            b_dsxx =  b_duxdxl
-            b_dsxy = 0.5_CUSTOM_REAL * b_duxdyl_plus_duydxl
-            b_dsxz = 0.5_CUSTOM_REAL * b_duzdxl_plus_duxdzl
-            b_dsyy =  b_duydyl
-            b_dsyz = 0.5_CUSTOM_REAL * b_duzdyl_plus_duydzl
-            b_dszz =  b_duzdzl
+                if (SAVE_MOHO_MESH) then
+                   if (is_moho_top(ispec)) then
+                      b_dsdx_top(1,1,i,j,k,ispec2D_moho_top) = b_duxdxl
+                      b_dsdx_top(1,2,i,j,k,ispec2D_moho_top) = b_duxdyl
+                      b_dsdx_top(1,3,i,j,k,ispec2D_moho_top) = b_duxdzl
+                      b_dsdx_top(2,1,i,j,k,ispec2D_moho_top) = b_duydxl
+                      b_dsdx_top(2,2,i,j,k,ispec2D_moho_top) = b_duydyl
+                      b_dsdx_top(2,3,i,j,k,ispec2D_moho_top) = b_duydzl
+                      b_dsdx_top(3,1,i,j,k,ispec2D_moho_top) = b_duzdxl
+                      b_dsdx_top(3,2,i,j,k,ispec2D_moho_top) = b_duzdyl
+                      b_dsdx_top(3,3,i,j,k,ispec2D_moho_top) = b_duzdzl
+                   else if (is_moho_bot(ispec)) then
+                      b_dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = b_duxdxl
+                      b_dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = b_duxdyl
+                      b_dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = b_duxdzl
+                      b_dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = b_duydxl
+                      b_dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = b_duydyl
+                      b_dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = b_duydzl
+                      b_dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = b_duzdxl
+                      b_dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = b_duzdyl
+                      b_dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = b_duzdzl
+                   endif
+                endif
 
-            kappa_k = (duxdxl + duydyl + duzdzl) *  (b_duxdxl + b_duydyl + b_duzdzl)
-            mu_k = dsxx * b_dsxx + dsyy * b_dsyy + dszz * b_dszz + &
-                  2 * (dsxy * b_dsxy + dsxz * b_dsxz + dsyz * b_dsyz) - ONE_THIRD * kappa_k
-            kappa_kl(i,j,k,ispec) = kappa_kl(i,j,k,ispec) + deltat * kappa_k
-            mu_kl(i,j,k,ispec) = mu_kl(i,j,k,ispec) + 2 * deltat * mu_k
-
-            if (SAVE_MOHO_MESH) then
-              if (is_moho_top(ispec)) then
-                b_dsdx_top(1,1,i,j,k,ispec2D_moho_top) = b_duxdxl
-                b_dsdx_top(1,2,i,j,k,ispec2D_moho_top) = b_duxdyl
-                b_dsdx_top(1,3,i,j,k,ispec2D_moho_top) = b_duxdzl
-                b_dsdx_top(2,1,i,j,k,ispec2D_moho_top) = b_duydxl
-                b_dsdx_top(2,2,i,j,k,ispec2D_moho_top) = b_duydyl
-                b_dsdx_top(2,3,i,j,k,ispec2D_moho_top) = b_duydzl
-                b_dsdx_top(3,1,i,j,k,ispec2D_moho_top) = b_duzdxl
-                b_dsdx_top(3,2,i,j,k,ispec2D_moho_top) = b_duzdyl
-                b_dsdx_top(3,3,i,j,k,ispec2D_moho_top) = b_duzdzl
-              else if (is_moho_bot(ispec)) then
-                b_dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = b_duxdxl
-                b_dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = b_duxdyl
-                b_dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = b_duxdzl
-                b_dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = b_duydxl
-                b_dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = b_duydyl
-                b_dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = b_duydzl
-                b_dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = b_duzdxl
-                b_dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = b_duzdyl
-                b_dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = b_duzdzl
-              endif
-            endif
-
 ! precompute terms for attenuation if needed
-  if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+                if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
 
 ! compute deviatoric strain
-    b_epsilon_trace_over_3 = ONE_THIRD * (b_duxdxl + b_duydyl + b_duzdzl)
-    b_epsilondev_xx_loc(i,j,k) = b_duxdxl - b_epsilon_trace_over_3
-    b_epsilondev_yy_loc(i,j,k) = b_duydyl - b_epsilon_trace_over_3
-    b_epsilondev_xy_loc(i,j,k) = 0.5 * b_duxdyl_plus_duydxl
-    b_epsilondev_xz_loc(i,j,k) = 0.5 * b_duzdxl_plus_duxdzl
-    b_epsilondev_yz_loc(i,j,k) = 0.5 * b_duzdyl_plus_duydzl
+                   b_epsilon_trace_over_3 = ONE_THIRD * (b_duxdxl + b_duydyl + b_duzdzl)
+                   b_epsilondev_xx_loc(i,j,k) = b_duxdxl - b_epsilon_trace_over_3
+                   b_epsilondev_yy_loc(i,j,k) = b_duydyl - b_epsilon_trace_over_3
+                   b_epsilondev_xy_loc(i,j,k) = 0.5 * b_duxdyl_plus_duydxl
+                   b_epsilondev_xz_loc(i,j,k) = 0.5 * b_duzdxl_plus_duxdzl
+                   b_epsilondev_yz_loc(i,j,k) = 0.5 * b_duzdyl_plus_duydzl
 
 ! distinguish attenuation factors
-    if(flag_sediments(i,j,k,ispec)) then
+                   if(flag_sediments(i,j,k,ispec)) then
 
 ! use constant attenuation of Q = 90
 ! or use scaling rule similar to Olsen et al. (2003)
-     if(USE_OLSEN_ATTENUATION) then
-       vs_val = mustore(i,j,k,ispec) / rho_vs(i,j,k,ispec)
+                      if(USE_OLSEN_ATTENUATION) then
+                         vs_val = mustore(i,j,k,ispec) / rho_vs(i,j,k,ispec)
 ! use rule Q_mu = constant * v_s
-       Q_mu = OLSEN_ATTENUATION_RATIO * vs_val
-       int_Q_mu = 10 * nint(Q_mu / 10.)
-       if(int_Q_mu < 40) int_Q_mu = 40
-       if(int_Q_mu > 150) int_Q_mu = 150
+                         Q_mu = OLSEN_ATTENUATION_RATIO * vs_val
+                         int_Q_mu = 10 * nint(Q_mu / 10.)
+                         if(int_Q_mu < 40) int_Q_mu = 40
+                         if(int_Q_mu > 150) int_Q_mu = 150
 
-       if(int_Q_mu == 40) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_40
-       else if(int_Q_mu == 50) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_50
-       else if(int_Q_mu == 60) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_60
-       else if(int_Q_mu == 70) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_70
-       else if(int_Q_mu == 80) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_80
-       else if(int_Q_mu == 90) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_90
-       else if(int_Q_mu == 100) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_100
-       else if(int_Q_mu == 110) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_110
-       else if(int_Q_mu == 120) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_120
-       else if(int_Q_mu == 130) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_130
-       else if(int_Q_mu == 140) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_140
-       else if(int_Q_mu == 150) then
-         iattenuation_sediments = IATTENUATION_SEDIMENTS_150
-       else
-         stop 'incorrect attenuation coefficient'
-       endif
+                         if(int_Q_mu == 40) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_40
+                         else if(int_Q_mu == 50) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_50
+                         else if(int_Q_mu == 60) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_60
+                         else if(int_Q_mu == 70) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_70
+                         else if(int_Q_mu == 80) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_80
+                         else if(int_Q_mu == 90) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_90
+                         else if(int_Q_mu == 100) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_100
+                         else if(int_Q_mu == 110) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_110
+                         else if(int_Q_mu == 120) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_120
+                         else if(int_Q_mu == 130) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_130
+                         else if(int_Q_mu == 140) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_140
+                         else if(int_Q_mu == 150) then
+                            iattenuation_sediments = IATTENUATION_SEDIMENTS_150
+                         else
+                            stop 'incorrect attenuation coefficient'
+                         endif
+                         
+                      else
+                         iattenuation_sediments = IATTENUATION_SEDIMENTS_90
+                      endif
 
-     else
-       iattenuation_sediments = IATTENUATION_SEDIMENTS_90
-     endif
+                      iselected = iattenuation_sediments
+                   else
+                      iselected = IATTENUATION_BEDROCK
+                   endif
 
-      iselected = iattenuation_sediments
-    else
-      iselected = IATTENUATION_BEDROCK
-    endif
+                   one_minus_sum_beta_use = one_minus_sum_beta(iselected)
+                   minus_sum_beta =  one_minus_sum_beta_use - 1.
 
-    one_minus_sum_beta_use = one_minus_sum_beta(iselected)
-    minus_sum_beta =  one_minus_sum_beta_use - 1.
+                endif
 
-  endif
+                kappal = kappastore(i,j,k,ispec)
+                mul = mustore(i,j,k,ispec)
 
-    kappal = kappastore(i,j,k,ispec)
-    mul = mustore(i,j,k,ispec)
-
 ! For fully anisotropic case
-    if(ANISOTROPY_VAL) then
-       c11 = c11store(i,j,k,ispec)
-       c12 = c12store(i,j,k,ispec)
-       c13 = c13store(i,j,k,ispec)
-       c14 = c14store(i,j,k,ispec)
-       c15 = c15store(i,j,k,ispec)
-       c16 = c16store(i,j,k,ispec)
-       c22 = c22store(i,j,k,ispec)
-       c23 = c23store(i,j,k,ispec)
-       c24 = c24store(i,j,k,ispec)
-       c25 = c25store(i,j,k,ispec)
-       c26 = c26store(i,j,k,ispec)
-       c33 = c33store(i,j,k,ispec)
-       c34 = c34store(i,j,k,ispec)
-       c35 = c35store(i,j,k,ispec)
-       c36 = c36store(i,j,k,ispec)
-       c44 = c44store(i,j,k,ispec)
-       c45 = c45store(i,j,k,ispec)
-       c46 = c46store(i,j,k,ispec)
-       c55 = c55store(i,j,k,ispec)
-       c56 = c56store(i,j,k,ispec)
-       c66 = c66store(i,j,k,ispec)
+                if(ANISOTROPY_VAL) then
+                   c11 = c11store(i,j,k,ispec)
+                   c12 = c12store(i,j,k,ispec)
+                   c13 = c13store(i,j,k,ispec)
+                   c14 = c14store(i,j,k,ispec)
+                   c15 = c15store(i,j,k,ispec)
+                   c16 = c16store(i,j,k,ispec)
+                   c22 = c22store(i,j,k,ispec)
+                   c23 = c23store(i,j,k,ispec)
+                   c24 = c24store(i,j,k,ispec)
+                   c25 = c25store(i,j,k,ispec)
+                   c26 = c26store(i,j,k,ispec)
+                   c33 = c33store(i,j,k,ispec)
+                   c34 = c34store(i,j,k,ispec)
+                   c35 = c35store(i,j,k,ispec)
+                   c36 = c36store(i,j,k,ispec)
+                   c44 = c44store(i,j,k,ispec)
+                   c45 = c45store(i,j,k,ispec)
+                   c46 = c46store(i,j,k,ispec)
+                   c55 = c55store(i,j,k,ispec)
+                   c56 = c56store(i,j,k,ispec)
+                   c66 = c66store(i,j,k,ispec)
        !if(ATTENUATION_VAL.and. not_fully_in_bedrock(ispec)) then
        !   mul = c44
        !   c11 = c11 + FOUR_THIRDS * minus_sum_beta * mul
@@ -2095,76 +2268,76 @@
        !   c66 = c66 + minus_sum_beta * mul
        !endif
 
-       b_sigma_xx = c11*b_duxdxl + c16*b_duxdyl_plus_duydxl + c12*b_duydyl + &
-             c15*b_duzdxl_plus_duxdzl + c14*b_duzdyl_plus_duydzl + c13*b_duzdzl
+                   b_sigma_xx = c11*b_duxdxl + c16*b_duxdyl_plus_duydxl + c12*b_duydyl + &
+                        c15*b_duzdxl_plus_duxdzl + c14*b_duzdyl_plus_duydzl + c13*b_duzdzl
+                   
+                   b_sigma_yy = c12*b_duxdxl + c26*b_duxdyl_plus_duydxl + c22*b_duydyl + &
+                        c25*b_duzdxl_plus_duxdzl + c24*b_duzdyl_plus_duydzl + c23*b_duzdzl
+                   
+                   b_sigma_zz = c13*b_duxdxl + c36*b_duxdyl_plus_duydxl + c23*b_duydyl + &
+                        c35*b_duzdxl_plus_duxdzl + c34*b_duzdyl_plus_duydzl + c33*b_duzdzl
+                   
+                   b_sigma_xy = c16*b_duxdxl + c66*b_duxdyl_plus_duydxl + c26*b_duydyl + &
+                        c56*b_duzdxl_plus_duxdzl + c46*b_duzdyl_plus_duydzl + c36*b_duzdzl
+                   
+                   b_sigma_xz = c15*b_duxdxl + c56*b_duxdyl_plus_duydxl + c25*b_duydyl + &
+                        c55*b_duzdxl_plus_duxdzl + c45*b_duzdyl_plus_duydzl + c35*b_duzdzl
+                   
+                   b_sigma_yz = c14*b_duxdxl + c46*b_duxdyl_plus_duydxl + c24*b_duydyl + &
+                        c45*b_duzdxl_plus_duxdzl + c44*b_duzdyl_plus_duydzl + c34*b_duzdzl
 
-       b_sigma_yy = c12*b_duxdxl + c26*b_duxdyl_plus_duydxl + c22*b_duydyl + &
-             c25*b_duzdxl_plus_duxdzl + c24*b_duzdyl_plus_duydzl + c23*b_duzdzl
-
-       b_sigma_zz = c13*b_duxdxl + c36*b_duxdyl_plus_duydxl + c23*b_duydyl + &
-             c35*b_duzdxl_plus_duxdzl + c34*b_duzdyl_plus_duydzl + c33*b_duzdzl
-
-       b_sigma_xy = c16*b_duxdxl + c66*b_duxdyl_plus_duydxl + c26*b_duydyl + &
-             c56*b_duzdxl_plus_duxdzl + c46*b_duzdyl_plus_duydzl + c36*b_duzdzl
-
-       b_sigma_xz = c15*b_duxdxl + c56*b_duxdyl_plus_duydxl + c25*b_duydyl + &
-             c55*b_duzdxl_plus_duxdzl + c45*b_duzdyl_plus_duydzl + c35*b_duzdzl
-
-       b_sigma_yz = c14*b_duxdxl + c46*b_duxdyl_plus_duydxl + c24*b_duydyl + &
-             c45*b_duzdxl_plus_duxdzl + c44*b_duzdyl_plus_duydzl + c34*b_duzdzl
-
-    else
+                else
 ! For isotropic case
 ! use unrelaxed parameters if attenuation
-       if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) mul = mul * one_minus_sum_beta_use
+                   if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) mul = mul * one_minus_sum_beta_use
 
-       lambdalplus2mul = kappal + FOUR_THIRDS * mul
-       lambdal = lambdalplus2mul - 2.*mul
-
+                   lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                   lambdal = lambdalplus2mul - 2.*mul
+                   
 ! compute stress sigma
-       b_sigma_xx = lambdalplus2mul*b_duxdxl + lambdal*b_duydyl_plus_duzdzl
-       b_sigma_yy = lambdalplus2mul*b_duydyl + lambdal*b_duxdxl_plus_duzdzl
-       b_sigma_zz = lambdalplus2mul*b_duzdzl + lambdal*b_duxdxl_plus_duydyl
+                   b_sigma_xx = lambdalplus2mul*b_duxdxl + lambdal*b_duydyl_plus_duzdzl
+                   b_sigma_yy = lambdalplus2mul*b_duydyl + lambdal*b_duxdxl_plus_duzdzl
+                   b_sigma_zz = lambdalplus2mul*b_duzdzl + lambdal*b_duxdxl_plus_duydyl
+                   
+                   b_sigma_xy = mul*b_duxdyl_plus_duydxl
+                   b_sigma_xz = mul*b_duzdxl_plus_duxdzl
+                   b_sigma_yz = mul*b_duzdyl_plus_duydzl
+                   
+                endif
 
-       b_sigma_xy = mul*b_duxdyl_plus_duydxl
-       b_sigma_xz = mul*b_duzdxl_plus_duxdzl
-       b_sigma_yz = mul*b_duzdyl_plus_duydzl
-
-    endif
-
 ! subtract memory variables if attenuation
-    if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
-      do i_sls = 1,N_SLS
-        b_R_xx_val = b_R_xx(i,j,k,ispec,i_sls)
-        b_R_yy_val = b_R_yy(i,j,k,ispec,i_sls)
-        b_sigma_xx = b_sigma_xx - b_R_xx_val
-        b_sigma_yy = b_sigma_yy - b_R_yy_val
-        b_sigma_zz = b_sigma_zz + b_R_xx_val + b_R_yy_val
-        b_sigma_xy = b_sigma_xy - b_R_xy(i,j,k,ispec,i_sls)
-        b_sigma_xz = b_sigma_xz - b_R_xz(i,j,k,ispec,i_sls)
-        b_sigma_yz = b_sigma_yz - b_R_yz(i,j,k,ispec,i_sls)
-      enddo
-    endif
+                if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+                   do i_sls = 1,N_SLS
+                      b_R_xx_val = b_R_xx(i,j,k,ispec,i_sls)
+                      b_R_yy_val = b_R_yy(i,j,k,ispec,i_sls)
+                      b_sigma_xx = b_sigma_xx - b_R_xx_val
+                      b_sigma_yy = b_sigma_yy - b_R_yy_val
+                      b_sigma_zz = b_sigma_zz + b_R_xx_val + b_R_yy_val
+                      b_sigma_xy = b_sigma_xy - b_R_xy(i,j,k,ispec,i_sls)
+                      b_sigma_xz = b_sigma_xz - b_R_xz(i,j,k,ispec,i_sls)
+                      b_sigma_yz = b_sigma_yz - b_R_yz(i,j,k,ispec,i_sls)
+                   enddo
+                endif
 
 ! form dot product with test vector, symmetric form
-        b_tempx1(i,j,k) = jacobianl * (b_sigma_xx*xixl + b_sigma_xy*xiyl + b_sigma_xz*xizl)
-        b_tempy1(i,j,k) = jacobianl * (b_sigma_xy*xixl + b_sigma_yy*xiyl + b_sigma_yz*xizl)
-        b_tempz1(i,j,k) = jacobianl * (b_sigma_xz*xixl + b_sigma_yz*xiyl + b_sigma_zz*xizl)
+                b_tempx1(i,j,k) = jacobianl * (b_sigma_xx*xixl + b_sigma_xy*xiyl + b_sigma_xz*xizl)
+                b_tempy1(i,j,k) = jacobianl * (b_sigma_xy*xixl + b_sigma_yy*xiyl + b_sigma_yz*xizl)
+                b_tempz1(i,j,k) = jacobianl * (b_sigma_xz*xixl + b_sigma_yz*xiyl + b_sigma_zz*xizl)
 
-        b_tempx2(i,j,k) = jacobianl * (b_sigma_xx*etaxl + b_sigma_xy*etayl + b_sigma_xz*etazl)
-        b_tempy2(i,j,k) = jacobianl * (b_sigma_xy*etaxl + b_sigma_yy*etayl + b_sigma_yz*etazl)
-        b_tempz2(i,j,k) = jacobianl * (b_sigma_xz*etaxl + b_sigma_yz*etayl + b_sigma_zz*etazl)
-
-        b_tempx3(i,j,k) = jacobianl * (b_sigma_xx*gammaxl + b_sigma_xy*gammayl + b_sigma_xz*gammazl)
-        b_tempy3(i,j,k) = jacobianl * (b_sigma_xy*gammaxl + b_sigma_yy*gammayl + b_sigma_yz*gammazl)
-        b_tempz3(i,j,k) = jacobianl * (b_sigma_xz*gammaxl + b_sigma_yz*gammayl + b_sigma_zz*gammazl)
-
+                b_tempx2(i,j,k) = jacobianl * (b_sigma_xx*etaxl + b_sigma_xy*etayl + b_sigma_xz*etazl)
+                b_tempy2(i,j,k) = jacobianl * (b_sigma_xy*etaxl + b_sigma_yy*etayl + b_sigma_yz*etazl)
+                b_tempz2(i,j,k) = jacobianl * (b_sigma_xz*etaxl + b_sigma_yz*etayl + b_sigma_zz*etazl)
+                
+                b_tempx3(i,j,k) = jacobianl * (b_sigma_xx*gammaxl + b_sigma_xy*gammayl + b_sigma_xz*gammazl)
+                b_tempy3(i,j,k) = jacobianl * (b_sigma_xy*gammaxl + b_sigma_yy*gammayl + b_sigma_yz*gammazl)
+                b_tempz3(i,j,k) = jacobianl * (b_sigma_xz*gammaxl + b_sigma_yz*gammayl + b_sigma_zz*gammazl)
+                
+             enddo
           enddo
-        enddo
-      enddo
+       enddo
+       
+    endif ! of test if SIMULATION_TYPE == 3
 
-      endif ! of test if SIMULATION_TYPE == 3
-
 !----------------------------------------------------------------------------------------------
 ! end of nested loops on i,j,k to perform the backward calculations in a given element (ispec)
 !----------------------------------------------------------------------------------------------
@@ -2173,76 +2346,123 @@
 ! beginning of nested loops on i,j,k to perform the forward calculations in a given element (ispec)
 !---------------------------------------------------------------------------------------------------
 
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+    ! call mxm_m1_m2_5points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+    do j=1,m2
+       do i=1,m1
+          E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_5points(1,j) + &
+               hprimewgll_xxT(i,2)*C1_m1_m2_5points(2,j) + &
+               hprimewgll_xxT(i,3)*C1_m1_m2_5points(3,j) + &
+               hprimewgll_xxT(i,4)*C1_m1_m2_5points(4,j) + &
+               hprimewgll_xxT(i,5)*C1_m1_m2_5points(5,j)
 
-          tempx1l = 0.
-          tempy1l = 0.
-          tempz1l = 0.
+          E2_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_5points(1,j) + &
+               hprimewgll_xxT(i,2)*C2_m1_m2_5points(2,j) + &
+               hprimewgll_xxT(i,3)*C2_m1_m2_5points(3,j) + &
+               hprimewgll_xxT(i,4)*C2_m1_m2_5points(4,j) + &
+               hprimewgll_xxT(i,5)*C2_m1_m2_5points(5,j)
 
-          tempx2l = 0.
-          tempy2l = 0.
-          tempz2l = 0.
+          E3_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_5points(1,j) + &
+               hprimewgll_xxT(i,2)*C3_m1_m2_5points(2,j) + &
+               hprimewgll_xxT(i,3)*C3_m1_m2_5points(3,j) + &
+               hprimewgll_xxT(i,4)*C3_m1_m2_5points(4,j) + &
+               hprimewgll_xxT(i,5)*C3_m1_m2_5points(5,j)
+       enddo
+    enddo
 
-          tempx3l = 0.
-          tempy3l = 0.
-          tempz3l = 0.
+    !   call mxm_m1_m1_5points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+    !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+    do i=1,m1
+       do j=1,m1
+    ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+          do k = 1,NGLLX
+             newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                  tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                  tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                  tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                  tempx2(i,5,k)*hprimewgll_xx(5,j)
 
-          do l=1,NGLLX
-            fac1 = hprimewgll_xx(l,i)
-            tempx1l = tempx1l + tempx1(l,j,k)*fac1
-            tempy1l = tempy1l + tempy1(l,j,k)*fac1
-            tempz1l = tempz1l + tempz1(l,j,k)*fac1
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+             newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                  tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                  tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                  tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                  tempy2(i,5,k)*hprimewgll_xx(5,j)
 
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            fac2 = hprimewgll_yy(l,j)
-            tempx2l = tempx2l + tempx2(i,l,k)*fac2
-            tempy2l = tempy2l + tempy2(i,l,k)*fac2
-            tempz2l = tempz2l + tempz2(i,l,k)*fac2
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            fac3 = hprimewgll_zz(l,k)
-            tempx3l = tempx3l + tempx3(i,j,l)*fac3
-            tempy3l = tempy3l + tempy3(i,j,l)*fac3
-            tempz3l = tempz3l + tempz3(i,j,l)*fac3
+             newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                  tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                  tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                  tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                  tempz2(i,5,k)*hprimewgll_xx(5,j)
           enddo
+       enddo
+    enddo
 
-          fac1 = wgllwgll_yz(j,k)
-          fac2 = wgllwgll_xz(i,k)
-          fac3 = wgllwgll_xy(i,j)
+    ! call mxm_m2_m1_5points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+    do j=1,m1
+       do i=1,m2
+          E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+               C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+               C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+               C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+               C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
 
+          E2_mxm_m2_m1_5points(i,j) = C2_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+               C2_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+               C2_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+               C2_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+               C2_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
+
+          E3_mxm_m2_m1_5points(i,j) = C3_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+               C3_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+               C3_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+               C3_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+               C3_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
+       enddo
+    enddo
+
+    do k=1,NGLLZ
+       do j=1,NGLLY
+          do i=1,NGLLX
+
+
+             fac1 = wgllwgll_yz(j,k)
+             fac2 = wgllwgll_xz(i,k)
+             fac3 = wgllwgll_xy(i,j)
+
 ! sum contributions from each element to the global mesh
-  iglob = ibool(i,j,k,ispec)
-  accel(1,iglob) = accel(1,iglob) - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
-  accel(2,iglob) = accel(2,iglob) - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
-  accel(3,iglob) = accel(3,iglob) - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
-
+             iglob = ibool(i,j,k,ispec)
+             accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                  fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+             accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                  fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+             accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                  fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+             
 ! update memory variables based upon the Runge-Kutta scheme
 
-  if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+             if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
 
 ! use Runge-Kutta scheme to march in time
-  do i_sls = 1,N_SLS
+                do i_sls = 1,N_SLS
 
 ! get coefficients for that standard linear solid
 
-  factor_loc = mustore(i,j,k,ispec) * factor_common(iselected,i_sls)
-  alphaval_loc = alphaval(iselected,i_sls)
-  betaval_loc = betaval(iselected,i_sls)
-  gammaval_loc = gammaval(iselected,i_sls)
+                   factor_loc = mustore(i,j,k,ispec) * factor_common(iselected,i_sls)
+                   alphaval_loc = alphaval(iselected,i_sls)
+                   betaval_loc = betaval(iselected,i_sls)
+                   gammaval_loc = gammaval(iselected,i_sls)
 
 ! term in xx
-  Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-  Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-  R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
+                   Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                   Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                   R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
 
 ! term in yy
-  Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-  Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-  R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
+                   Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                   Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                   R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
 
 ! term in zz not computed since zero trace
 ! This is because we only implement Q_\mu attenuation and not Q_\kappa.
@@ -2253,26 +2473,26 @@
 ! we get Q_\alpha = (9 / 4) * Q_\mu = 2.25 * Q_\mu
 
 ! term in xy
-  Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-  Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-  R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
+                   Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                   Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                   R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
 
 ! term in xz
-  Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-  Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-  R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
+                   Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                   Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                   R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
 
 ! term in yz
-  Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-  Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-  R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
+                   Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                   Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                   R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
 
-    enddo   ! end of loop on memory variables
+                enddo   ! end of loop on memory variables
+                
+             endif  !  end attenuation
 
-  endif  !  end attenuation
-
-        enddo
-      enddo
+          enddo
+       enddo
     enddo
 
 !---------------------------------------------------------------------------------------------
@@ -2285,101 +2505,150 @@
 
     if (SIMULATION_TYPE == 3) then
 
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
+     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+    ! call mxm_m1_m2_5points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+       do j=1,m2
+          do i=1,m1
+             E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*b_C1_m1_m2_5points(1,j) + &
+                  hprimewgll_xxT(i,2)*b_C1_m1_m2_5points(2,j) + &
+                  hprimewgll_xxT(i,3)*b_C1_m1_m2_5points(3,j) + &
+                  hprimewgll_xxT(i,4)*b_C1_m1_m2_5points(4,j) + &
+                  hprimewgll_xxT(i,5)*b_C1_m1_m2_5points(5,j)
+             
+             E2_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*b_C2_m1_m2_5points(1,j) + &
+                  hprimewgll_xxT(i,2)*b_C2_m1_m2_5points(2,j) + &
+                  hprimewgll_xxT(i,3)*b_C2_m1_m2_5points(3,j) + &
+                  hprimewgll_xxT(i,4)*b_C2_m1_m2_5points(4,j) + &
+                  hprimewgll_xxT(i,5)*b_C2_m1_m2_5points(5,j)
 
-          b_tempx1l = 0.
-          b_tempy1l = 0.
-          b_tempz1l = 0.
+             E3_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*b_C3_m1_m2_5points(1,j) + &
+                  hprimewgll_xxT(i,2)*b_C3_m1_m2_5points(2,j) + &
+                  hprimewgll_xxT(i,3)*b_C3_m1_m2_5points(3,j) + &
+                  hprimewgll_xxT(i,4)*b_C3_m1_m2_5points(4,j) + &
+                  hprimewgll_xxT(i,5)*b_C3_m1_m2_5points(5,j)
+          enddo
+       enddo
 
-          b_tempx2l = 0.
-          b_tempy2l = 0.
-          b_tempz2l = 0.
+    !   call mxm_m1_m1_5points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+    !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+       do i=1,m1
+          do j=1,m1
+    ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+             do k = 1,NGLLX
+                newtempx2(i,j,k) = b_tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                     b_tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                     b_tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                     b_tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                     b_tempx2(i,5,k)*hprimewgll_xx(5,j)
 
-          b_tempx3l = 0.
-          b_tempy3l = 0.
-          b_tempz3l = 0.
+                newtempy2(i,j,k) = b_tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                     b_tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                     b_tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                     b_tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                     b_tempy2(i,5,k)*hprimewgll_xx(5,j)
 
-          do l=1,NGLLX
-            fac1 = hprimewgll_xx(l,i)
-            b_tempx1l = b_tempx1l + b_tempx1(l,j,k)*fac1
-            b_tempy1l = b_tempy1l + b_tempy1(l,j,k)*fac1
-            b_tempz1l = b_tempz1l + b_tempz1(l,j,k)*fac1
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+                newtempz2(i,j,k) = b_tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                     b_tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                     b_tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                     b_tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                     b_tempz2(i,5,k)*hprimewgll_xx(5,j)
+             enddo
+          enddo
+       enddo
 
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            fac2 = hprimewgll_yy(l,j)
-            b_tempx2l = b_tempx2l + b_tempx2(i,l,k)*fac2
-            b_tempy2l = b_tempy2l + b_tempy2(i,l,k)*fac2
-            b_tempz2l = b_tempz2l + b_tempz2(i,l,k)*fac2
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+    ! call mxm_m2_m1_5points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+       do j=1,m1
+          do i=1,m2
+             E1_mxm_m2_m1_5points(i,j) = b_C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+                  b_C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+                  b_C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+                  b_C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+                  b_C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
+             
+             E2_mxm_m2_m1_5points(i,j) = b_C2_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+                  b_C2_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+                  b_C2_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+                  b_C2_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+                  b_C2_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
 
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            fac3 = hprimewgll_zz(l,k)
-            b_tempx3l = b_tempx3l + b_tempx3(i,j,l)*fac3
-            b_tempy3l = b_tempy3l + b_tempy3(i,j,l)*fac3
-            b_tempz3l = b_tempz3l + b_tempz3(i,j,l)*fac3
+             E3_mxm_m2_m1_5points(i,j) = b_C3_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+                  b_C3_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+                  b_C3_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+                  b_C3_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+                  b_C3_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
           enddo
+       enddo
 
-          fac1 = wgllwgll_yz(j,k)
-          fac2 = wgllwgll_xz(i,k)
-          fac3 = wgllwgll_xy(i,j)
+       do k=1,NGLLZ
+          do j=1,NGLLY
+             do i=1,NGLLX
 
+
+                fac1 = wgllwgll_yz(j,k)
+                fac2 = wgllwgll_xz(i,k)
+                fac3 = wgllwgll_xy(i,j)
+
 ! sum contributions from each element to the global mesh
-  iglob = ibool(i,j,k,ispec)
-  b_accel(1,iglob) = b_accel(1,iglob) - (fac1*b_tempx1l + fac2*b_tempx2l + fac3*b_tempx3l)
-  b_accel(2,iglob) = b_accel(2,iglob) - (fac1*b_tempy1l + fac2*b_tempy2l + fac3*b_tempy3l)
-  b_accel(3,iglob) = b_accel(3,iglob) - (fac1*b_tempz1l + fac2*b_tempz2l + fac3*b_tempz3l)
+                iglob = ibool(i,j,k,ispec)
+                b_accel(1,iglob) = b_accel(1,iglob) - (fac1*newtempx1(i,j,k) + fac2*newtempx2(i,j,k) + fac3*newtempx3(i,j,k))
+                b_accel(2,iglob) = b_accel(2,iglob) - (fac1*newtempy1(i,j,k) + fac2*newtempy2(i,j,k) + fac3*newtempy3(i,j,k))
+                b_accel(3,iglob) = b_accel(3,iglob) - (fac1*newtempz1(i,j,k) + fac2*newtempz2(i,j,k) + fac3*newtempz3(i,j,k))
 
 ! update memory variables based upon the Runge-Kutta scheme
 
-  if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+                if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
 
 ! use Runge-Kutta scheme to march in time
-  do i_sls = 1,N_SLS
+                   do i_sls = 1,N_SLS
 
 ! get coefficients for that standard linear solid
-    factor_loc = mustore(i,j,k,ispec) * factor_common(iselected,i_sls)
-    b_alphaval_loc = b_alphaval(iselected,i_sls)
-    b_betaval_loc = b_betaval(iselected,i_sls)
-    b_gammaval_loc = b_gammaval(iselected,i_sls)
+                      factor_loc = mustore(i,j,k,ispec) * factor_common(iselected,i_sls)
+                      b_alphaval_loc = b_alphaval(iselected,i_sls)
+                      b_betaval_loc = b_betaval(iselected,i_sls)
+                      b_gammaval_loc = b_gammaval(iselected,i_sls)
 
 ! term in xx
-    b_Sn   = factor_loc * b_epsilondev_xx(i,j,k,ispec)
-    b_Snp1   = factor_loc * b_epsilondev_xx_loc(i,j,k)
-    b_R_xx(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_xx(i,j,k,ispec,i_sls) + b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
+                      b_Sn   = factor_loc * b_epsilondev_xx(i,j,k,ispec)
+                      b_Snp1   = factor_loc * b_epsilondev_xx_loc(i,j,k)
+                      b_R_xx(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_xx(i,j,k,ispec,i_sls) + &
+                           b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
 
 ! term in yy
-    b_Sn   = factor_loc * b_epsilondev_yy(i,j,k,ispec)
-    b_Snp1   = factor_loc * b_epsilondev_yy_loc(i,j,k)
-    b_R_yy(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_yy(i,j,k,ispec,i_sls) + b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
+                      b_Sn   = factor_loc * b_epsilondev_yy(i,j,k,ispec)
+                      b_Snp1   = factor_loc * b_epsilondev_yy_loc(i,j,k)
+                      b_R_yy(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_yy(i,j,k,ispec,i_sls) + &
+                           b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
 
 ! term in zz not computed since zero trace
 
 ! term in xy
-    b_Sn   = factor_loc * b_epsilondev_xy(i,j,k,ispec)
-    b_Snp1   = factor_loc * b_epsilondev_xy_loc(i,j,k)
-    b_R_xy(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_xy(i,j,k,ispec,i_sls) + b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
+                      b_Sn   = factor_loc * b_epsilondev_xy(i,j,k,ispec)
+                      b_Snp1   = factor_loc * b_epsilondev_xy_loc(i,j,k)
+                      b_R_xy(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_xy(i,j,k,ispec,i_sls) + &
+                           b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
 
 ! term in xz
-    b_Sn   = factor_loc * b_epsilondev_xz(i,j,k,ispec)
-    b_Snp1   = factor_loc * b_epsilondev_xz_loc(i,j,k)
-    b_R_xz(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_xz(i,j,k,ispec,i_sls) + b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
+                      b_Sn   = factor_loc * b_epsilondev_xz(i,j,k,ispec)
+                      b_Snp1   = factor_loc * b_epsilondev_xz_loc(i,j,k)
+                      b_R_xz(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_xz(i,j,k,ispec,i_sls) + &
+                           b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
 
 ! term in yz
-    b_Sn   = factor_loc * b_epsilondev_yz(i,j,k,ispec)
-    b_Snp1   = factor_loc * b_epsilondev_yz_loc(i,j,k)
-    b_R_yz(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_yz(i,j,k,ispec,i_sls) + b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
+                      b_Sn   = factor_loc * b_epsilondev_yz(i,j,k,ispec)
+                      b_Snp1   = factor_loc * b_epsilondev_yz_loc(i,j,k)
+                      b_R_yz(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_yz(i,j,k,ispec,i_sls) + &
+                           b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
 
-    enddo   ! end of loop on memory variables
+                   enddo   ! end of loop on memory variables
+                   
+                endif  !  end attenuation
 
-  endif  !  end attenuation
+             enddo
+          enddo
+       enddo
 
-        enddo
-      enddo
-    enddo
-
     endif ! of test if SIMULATION_TYPE == 3
 
 !----------------------------------------------------------------------------------------------
@@ -2387,22 +2656,22 @@
 !----------------------------------------------------------------------------------------------
 
 ! save deviatoric strain for Runge-Kutta scheme
-  if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
-    epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-    epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-    epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-    epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-    epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-    if (SIMULATION_TYPE == 3) then
-      b_epsilondev_xx(:,:,:,ispec) = b_epsilondev_xx_loc(:,:,:)
-      b_epsilondev_yy(:,:,:,ispec) = b_epsilondev_yy_loc(:,:,:)
-      b_epsilondev_xy(:,:,:,ispec) = b_epsilondev_xy_loc(:,:,:)
-      b_epsilondev_xz(:,:,:,ispec) = b_epsilondev_xz_loc(:,:,:)
-      b_epsilondev_yz(:,:,:,ispec) = b_epsilondev_yz_loc(:,:,:)
+    if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+       epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+       epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+       epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+       epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+       epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+       if (SIMULATION_TYPE == 3) then
+          b_epsilondev_xx(:,:,:,ispec) = b_epsilondev_xx_loc(:,:,:)
+          b_epsilondev_yy(:,:,:,ispec) = b_epsilondev_yy_loc(:,:,:)
+          b_epsilondev_xy(:,:,:,ispec) = b_epsilondev_xy_loc(:,:,:)
+          b_epsilondev_xz(:,:,:,ispec) = b_epsilondev_xz_loc(:,:,:)
+          b_epsilondev_yz(:,:,:,ispec) = b_epsilondev_yz_loc(:,:,:)
+       endif
     endif
-  endif
 
-  enddo   ! of the spectral element loop
+ enddo   ! of the spectral element loop
 
 ! add Stacey conditions
 
@@ -3248,44 +3517,44 @@
 ! calculate strain div and curl
     do ispec=1,NSPEC_AB
 
+       tempx1l(:,:,:) = 0._CUSTOM_REAL
+       tempx2l(:,:,:) = 0._CUSTOM_REAL
+       tempx3l(:,:,:) = 0._CUSTOM_REAL
+
+       tempy1l(:,:,:) = 0._CUSTOM_REAL
+       tempy2l(:,:,:) = 0._CUSTOM_REAL
+       tempy3l(:,:,:) = 0._CUSTOM_REAL
+
+       tempz1l(:,:,:) = 0._CUSTOM_REAL
+       tempz2l(:,:,:) = 0._CUSTOM_REAL
+       tempz3l(:,:,:) = 0._CUSTOM_REAL   
+       
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
 
-          tempx1l = 0._CUSTOM_REAL
-          tempx2l = 0._CUSTOM_REAL
-          tempx3l = 0._CUSTOM_REAL
-
-          tempy1l = 0._CUSTOM_REAL
-          tempy2l = 0._CUSTOM_REAL
-          tempy3l = 0._CUSTOM_REAL
-
-          tempz1l = 0._CUSTOM_REAL
-          tempz2l = 0._CUSTOM_REAL
-          tempz3l = 0._CUSTOM_REAL
-
           do l=1,NGLLX
             hp1 = hprime_xx(i,l)
             iglob = ibool(l,j,k,ispec)
-            tempx1l = tempx1l + veloc(1,iglob)*hp1
-            tempy1l = tempy1l + veloc(2,iglob)*hp1
-            tempz1l = tempz1l + veloc(3,iglob)*hp1
+            tempx1l(i,j,k) = tempx1l(i,j,k) + veloc(1,iglob)*hp1
+            tempy1l(i,j,k) = tempy1l(i,j,k) + veloc(2,iglob)*hp1
+            tempz1l(i,j,k) = tempz1l(i,j,k) + veloc(3,iglob)*hp1
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
 
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
             hp2 = hprime_yy(j,l)
             iglob = ibool(i,l,k,ispec)
-            tempx2l = tempx2l + veloc(1,iglob)*hp2
-            tempy2l = tempy2l + veloc(2,iglob)*hp2
-            tempz2l = tempz2l + veloc(3,iglob)*hp2
+            tempx2l(i,j,k) = tempx2l(i,j,k) + veloc(1,iglob)*hp2
+            tempy2l(i,j,k) = tempy2l(i,j,k) + veloc(2,iglob)*hp2
+            tempz2l(i,j,k) = tempz2l(i,j,k) + veloc(3,iglob)*hp2
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
 
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
             hp3 = hprime_zz(k,l)
             iglob = ibool(i,j,l,ispec)
-            tempx3l = tempx3l + veloc(1,iglob)*hp3
-            tempy3l = tempy3l + veloc(2,iglob)*hp3
-            tempz3l = tempz3l + veloc(3,iglob)*hp3
+            tempx3l(i,j,k) = tempx3l(i,j,k) + veloc(1,iglob)*hp3
+            tempy3l(i,j,k) = tempy3l(i,j,k) + veloc(2,iglob)*hp3
+            tempz3l(i,j,k) = tempz3l(i,j,k) + veloc(3,iglob)*hp3
           enddo
 
 !         get derivatives of ux, uy and uz with respect to x, y and z
@@ -3300,17 +3569,17 @@
           gammayl = gammay(i,j,k,ispec)
           gammazl = gammaz(i,j,k,ispec)
 
-          dvxdxl(i,j,k) = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-          dvxdyl(i,j,k) = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-          dvxdzl(i,j,k) = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+          dvxdxl(i,j,k) = xixl*tempx1l(i,j,k) + etaxl*tempx2l(i,j,k) + gammaxl*tempx3l(i,j,k)
+          dvxdyl(i,j,k) = xiyl*tempx1l(i,j,k) + etayl*tempx2l(i,j,k) + gammayl*tempx3l(i,j,k)
+          dvxdzl(i,j,k) = xizl*tempx1l(i,j,k) + etazl*tempx2l(i,j,k) + gammazl*tempx3l(i,j,k)
 
-          dvydxl(i,j,k) = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
-          dvydyl(i,j,k) = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
-          dvydzl(i,j,k) = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
+          dvydxl(i,j,k) = xixl*tempy1l(i,j,k) + etaxl*tempy2l(i,j,k) + gammaxl*tempy3l(i,j,k)
+          dvydyl(i,j,k) = xiyl*tempy1l(i,j,k) + etayl*tempy2l(i,j,k) + gammayl*tempy3l(i,j,k)
+          dvydzl(i,j,k) = xizl*tempy1l(i,j,k) + etazl*tempy2l(i,j,k) + gammazl*tempy3l(i,j,k)
 
-          dvzdxl(i,j,k) = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
-          dvzdyl(i,j,k) = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
-          dvzdzl(i,j,k) = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
+          dvzdxl(i,j,k) = xixl*tempz1l(i,j,k) + etaxl*tempz2l(i,j,k) + gammaxl*tempz3l(i,j,k)
+          dvzdyl(i,j,k) = xiyl*tempz1l(i,j,k) + etayl*tempz2l(i,j,k) + gammayl*tempz3l(i,j,k)
+          dvzdzl(i,j,k) = xizl*tempz1l(i,j,k) + etazl*tempz2l(i,j,k) + gammazl*tempz3l(i,j,k)
 
         enddo
       enddo

Added: seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/specfem3D_no_Deville.f90
===================================================================
--- seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/specfem3D_no_Deville.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_SESAME/tags/v1.4.4_last_BASIN/specfem3D_no_Deville.f90	2010-01-07 18:07:20 UTC (rev 16128)
@@ -0,0 +1,3470 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  1 . 4
+!               ---------------------------------------
+!
+!                 Dimitri Komatitsch and Jeroen Tromp
+!    Seismological Laboratory - California Institute of Technology
+!         (c) California Institute of Technology September 2006
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+!
+! United States and French Government Sponsorship Acknowledged.
+
+  subroutine specfem3D
+
+  implicit none
+
+  include "constants.h"
+
+! include values created by the mesher
+  include "OUTPUT_FILES/values_from_mesher.h"
+
+!=============================================================================!
+!                                                                             !
+!  specfem3D is a 3-D spectral-element solver for a local or regional model.  !
+!  It uses a mesh generated by program meshfem3D                              !
+!                                                                             !
+!=============================================================================!
+!
+! If you use this code for your own research, please cite at least one article
+! written by the developers of the package, for instance:
+!
+! @ARTICLE{TrKoLi08,
+! author = {Jeroen Tromp and Dimitri Komatitsch and Qinya Liu},
+! title = {Spectral-Element and Adjoint Methods in Seismology},
+! journal = {Communications in Computational Physics},
+! year = {2008},
+! volume = {3},
+! pages = {1-32},
+! number = {1}}
+!
+! or
+!
+! @ARTICLE{LiPoKoTr04,
+! author = {Qinya Liu and Jascha Polet and Dimitri Komatitsch and Jeroen Tromp},
+! title = {Spectral-element moment tensor inversions for earthquakes in {S}outhern {C}alifornia},
+! journal={Bull. Seismol. Soc. Am.},
+! year = {2004},
+! volume = {94},
+! pages = {1748-1761},
+! number = {5},
+! doi = {10.1785/012004038}}
+!
+! @INCOLLECTION{ChKoViCaVaFe07,
+! author = {Emmanuel Chaljub and Dimitri Komatitsch and Jean-Pierre Vilotte and
+! Yann Capdeville and Bernard Valette and Gaetano Festa},
+! title = {Spectral Element Analysis in Seismology},
+! booktitle = {Advances in Wave Propagation in Heterogeneous Media},
+! publisher = {Elsevier - Academic Press},
+! year = {2007},
+! editor = {Ru-Shan Wu and Val\'erie Maupin},
+! volume = {48},
+! series = {Advances in Geophysics},
+! pages = {365-419}}
+!
+! @ARTICLE{KoVi98,
+! author={D. Komatitsch and J. P. Vilotte},
+! title={The spectral-element method: an efficient tool to simulate the seismic response of 2{D} and 3{D} geological structures},
+! journal={Bull. Seismol. Soc. Am.},
+! year=1998,
+! volume=88,
+! number=2,
+! pages={368-392}}
+!
+! @ARTICLE{KoTr99,
+! author={D. Komatitsch and J. Tromp},
+! year=1999,
+! title={Introduction to the spectral-element method for 3-{D} seismic wave propagation},
+! journal={Geophys. J. Int.},
+! volume=139,
+! number=3,
+! pages={806-822},
+! doi={10.1046/j.1365-246x.1999.00967.x}}
+!
+! @ARTICLE{KoLiTrSuStSh04,
+! author={Dimitri Komatitsch and Qinya Liu and Jeroen Tromp and Peter S\"{u}ss
+!   and Christiane Stidham and John H. Shaw},
+! year=2004,
+! title={Simulations of Ground Motion in the {L}os {A}ngeles {B}asin
+!   based upon the Spectral-Element Method},
+! journal={Bull. Seism. Soc. Am.},
+! volume=94,
+! number=1,
+! pages={187-206}}
+!
+! and/or another article from http://web.univ-pau.fr/~dkomati1/publications.html
+!
+!
+! If you use the kernel capabilities of the code, please cite at least one article
+! written by the developers of the package, for instance:
+!
+! @ARTICLE{TrKoLi08,
+! author = {Jeroen Tromp and Dimitri Komatitsch and Qinya Liu},
+! title = {Spectral-Element and Adjoint Methods in Seismology},
+! journal = {Communications in Computational Physics},
+! year = {2008},
+! volume = {3},
+! pages = {1-32},
+! number = {1}}
+!
+! or
+!
+! @ARTICLE{LiTr06,
+! author={Qinya Liu and Jeroen Tromp},
+! title={Finite-frequency kernels based on adjoint methods},
+! journal={Bull. Seismol. Soc. Am.},
+! year=2006,
+! volume=96,
+! number=6,
+! pages={2383-2397},
+! doi={10.1785/0120060041}}
+!
+!
+! Reference frame - convention:
+! ----------------------------
+!
+! The code uses the following convention for the reference frame:
+!
+!  - X axis is East
+!  - Y axis is North
+!  - Z axis is up
+!
+! Note that this convention is different from both the Aki-Richards convention
+! and the Harvard CMT convention.
+!
+! Let us recall that the Aki-Richards convention is:
+!
+!  - X axis is North
+!  - Y axis is East
+!  - Z axis is down
+!
+! and that the Harvard CMT convention is:
+!
+!  - X axis is South
+!  - Y axis is East
+!  - Z axis is up
+!
+! To report bugs or suggest improvements to the code, please send an email
+! to Jeroen Tromp <jtromp AT princeton.edu> and/or use our online
+! bug tracking system at http://www.geodynamics.org/roundup .
+!
+! Evolution of the code:
+! ---------------------
+!
+! MPI v. 1.4 Dimitri Komatitsch, University of Pau, Qinya Liu and others, Caltech, September 2006:
+!  better adjoint and kernel calculations, faster and better I/Os
+!  on very large systems, many small improvements and bug fixes
+!
+! MPI v. 1.3 Dimitri Komatitsch, University of Pau, and Qinya Liu, Caltech, July 2005:
+!  serial version, regular mesh, adjoint and kernel calculations, ParaView support
+!
+! MPI v. 1.2 Min Chen and Dimitri Komatitsch, Caltech, July 2004:
+!  full anisotropy, volume movie
+!
+! MPI v. 1.1 Dimitri Komatitsch, Caltech, October 2002: Zhu's Moho map, scaling
+!  of Vs with depth, Hauksson's regional model, attenuation, oceans, movies
+!
+! MPI v. 1.0 Dimitri Komatitsch, Caltech, May 2002: first MPI version based on global code
+
+! memory variables and standard linear solids for attenuation
+  double precision, dimension(N_SLS) :: tau_mu_dble,tau_sigma_dble,beta_dble
+  double precision factor_scale_dble,one_minus_sum_beta_dble
+  real(kind=CUSTOM_REAL), dimension(NUM_REGIONS_ATTENUATION,N_SLS) :: tau_mu,tau_sigma,beta
+  real(kind=CUSTOM_REAL), dimension(NUM_REGIONS_ATTENUATION) :: factor_scale,one_minus_sum_beta
+
+  real(kind=CUSTOM_REAL), dimension(NUM_REGIONS_ATTENUATION,N_SLS) :: tauinv,factor_common, alphaval,betaval,gammaval
+  integer iattenuation
+  double precision scale_factor
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION,N_SLS) :: &
+    R_xx,R_yy,R_xy,R_xz,R_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION) :: &
+    epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+
+! ADJOINT
+  real(kind=CUSTOM_REAL), dimension(NUM_REGIONS_ATTENUATION,N_SLS) :: b_alphaval, b_betaval, b_gammaval
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATT_AND_KERNEL,N_SLS) :: &
+             b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATT_AND_KERNEL) ::  b_epsilondev_xx, &
+             b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz
+! ADJOINT
+
+  integer NPOIN2DMAX_XY
+
+! use integer array to store topography values
+  integer NX_TOPO,NY_TOPO
+  double precision ORIG_LAT_TOPO,ORIG_LONG_TOPO,DEGREES_PER_CELL_TOPO
+  character(len=100) topo_file
+  integer, dimension(:,:), allocatable :: itopo_bathy
+
+  integer, dimension(NSPEC2DMAX_XMIN_XMAX_VAL) :: ibelm_xmin,ibelm_xmax
+  integer, dimension(NSPEC2DMAX_YMIN_YMAX_VAL) :: ibelm_ymin,ibelm_ymax
+  integer, dimension(NSPEC2D_BOTTOM_VAL) :: ibelm_bottom
+  integer, dimension(NSPEC2D_TOP_VAL) :: ibelm_top
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_VAL) :: jacobian2D_xmin,jacobian2D_xmax
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_VAL) :: jacobian2D_ymin,jacobian2D_ymax
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_BOTTOM_VAL) :: jacobian2D_bottom
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_TOP_VAL) :: jacobian2D_top
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_VAL) :: normal_xmin,normal_xmax
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_VAL) :: normal_ymin,normal_ymax
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_BOTTOM_VAL) :: normal_bottom
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_TOP_VAL) :: normal_top
+
+! Moho mesh
+  integer,dimension(NSPEC2D_MOHO_BOUN) :: ibelm_moho_top, ibelm_moho_bot
+  real(CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_MOHO_BOUN) :: normal_moho
+  integer :: nspec2D_moho, njunk
+  logical, dimension(NSPEC_BOUN) :: is_moho_top, is_moho_bot
+
+! buffers for send and receive between faces of the slices and the chunks
+  real(kind=CUSTOM_REAL), dimension(NPOIN2DMAX_XY_VAL) :: buffer_send_faces_scalar,buffer_received_faces_scalar
+  real(kind=CUSTOM_REAL), dimension(NDIM,NPOIN2DMAX_XY_VAL) :: buffer_send_faces_vector,buffer_received_faces_vector
+
+! -----------------
+
+! mesh parameters
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB_VAL) :: ibool
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB_VAL) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz,jacobian
+  real(kind=CUSTOM_REAL), dimension(NGLOB_AB_VAL) :: xstore,ystore,zstore
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB_VAL) :: &
+        kappastore,mustore
+
+! material properties in case of a fully anisotropic material
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+        c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
+        c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
+        c36store,c44store,c45store,c46store,c55store,c56store,c66store
+
+! flag for sediments
+  logical not_fully_in_bedrock(NSPEC_AB_VAL)
+  logical, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB_VAL) :: flag_sediments
+
+! Stacey
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB_VAL) :: rho_vp,rho_vs
+
+! local to global mapping
+  integer, dimension(NSPEC_AB_VAL) :: idoubling
+
+! mass matrix
+  real(kind=CUSTOM_REAL), dimension(NGLOB_AB_VAL) :: rmass
+
+! additional mass matrix for ocean load
+! ocean load mass matrix is always allocated statically even if no oceans
+  real(kind=CUSTOM_REAL), dimension(NGLOB_AB_VAL) :: rmass_ocean_load
+  logical, dimension(NGLOB_AB_VAL) :: updated_dof_ocean_load
+  real(kind=CUSTOM_REAL) additional_term,force_normal_comp
+
+! displacement, velocity, acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB_VAL) :: displ,veloc,accel
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
+
+  real(kind=CUSTOM_REAL) hp1,hp2,hp3
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,kappal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1l,tempx2l,tempx3l
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempy1l,tempy2l,tempy3l
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempz1l,tempz2l,tempz3l
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1lbis,tempx2lbis,tempx3lbis
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempy1lbis,tempy2lbis,tempy3lbis
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempz1lbis,tempz2lbis,tempz3lbis
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+! time scheme
+  real(kind=CUSTOM_REAL) deltat,deltatover2,deltatsqover2
+
+! ADJOINT
+  real(kind=CUSTOM_REAL) b_additional_term,b_force_normal_comp, kappa_k, mu_k
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_ADJOINT) :: b_displ, b_veloc, b_accel
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: rho_kl, mu_kl, kappa_kl, &
+    rhop_kl, beta_kl, alpha_kl
+  real(kind=CUSTOM_REAL) dsxx,dsxy,dsxz,dsyy,dsyz,dszz
+  real(kind=CUSTOM_REAL) b_duxdxl,b_duxdyl,b_duxdzl,b_duydxl,b_duydyl,b_duydzl,b_duzdxl,b_duzdyl,b_duzdzl
+  real(kind=CUSTOM_REAL) b_duxdxl_plus_duydyl,b_duxdxl_plus_duzdzl,b_duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) b_duxdyl_plus_duydxl,b_duzdxl_plus_duxdzl,b_duzdyl_plus_duydzl
+  real(kind=CUSTOM_REAL) b_dsxx,b_dsxy,b_dsxz,b_dsyy,b_dsyz,b_dszz
+  real(kind=CUSTOM_REAL) b_sigma_xx,b_sigma_yy,b_sigma_zz,b_sigma_xy,b_sigma_xz,b_sigma_yz
+  real(kind=CUSTOM_REAL) b_tempx1l,b_tempx2l,b_tempx3l
+  real(kind=CUSTOM_REAL) b_tempy1l,b_tempy2l,b_tempy3l
+  real(kind=CUSTOM_REAL) b_tempz1l,b_tempz2l,b_tempz3l
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
+    b_tempx1,b_tempx2,b_tempx3,b_tempy1,b_tempy2,b_tempy3,b_tempz1,b_tempz2,b_tempz3
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: absorb_xmin, absorb_xmax, &
+    absorb_ymin, absorb_ymax, absorb_zmin ! for absorbing b.c.
+  integer reclen_xmin, reclen_xmax, reclen_ymin, reclen_ymax, reclen_zmin, reclen1, reclen2
+  real(kind=CUSTOM_REAL) b_deltat, b_deltatover2, b_deltatsqover2
+! ADJOINT
+
+! for attenuation
+  real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc,Sn,Snp1
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+    epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) epsilon_trace_over_3
+
+! ADJOINT
+  real(kind=CUSTOM_REAL) b_R_xx_val,b_R_yy_val
+  real(kind=CUSTOM_REAL) b_alphaval_loc,b_betaval_loc,b_gammaval_loc,b_Sn,b_Snp1
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: b_epsilondev_xx_loc, &
+    b_epsilondev_yy_loc, b_epsilondev_xy_loc, b_epsilondev_xz_loc, b_epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) b_epsilon_trace_over_3
+! ADJOINT
+
+  integer l
+  integer i_SLS
+  real(kind=CUSTOM_REAL) one_minus_sum_beta_use,minus_sum_beta,vs_val,Q_mu
+  integer iselected,iattenuation_sediments,int_Q_mu
+
+! Moho kernel
+  integer ispec2D_moho_top, ispec2D_moho_bot, k_top, k_bot, ispec_top, ispec_bot, iglob_top, iglob_bot
+  real(kind=CUSTOM_REAL), dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO_BOUN) :: dsdx_top, dsdx_bot, b_dsdx_top, b_dsdx_bot
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_MOHO_BOUN) :: moho_kl
+  real(kind=CUSTOM_REAL) :: kernel_moho_top, kernel_moho_bot
+
+! --------
+
+! parameters for the source
+  integer it,isource
+  integer, dimension(:), allocatable :: islice_selected_source,ispec_selected_source
+  integer yr,jda,ho,mi
+  real(kind=CUSTOM_REAL) stf_used
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: sourcearray
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: sourcearrays
+!ADJOINT
+  character(len=150) adj_source_file
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: adj_sourcearray
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:,:), allocatable :: adj_sourcearrays
+!ADJOINT
+  double precision sec,stf
+  double precision, dimension(:), allocatable :: Mxx,Myy,Mzz,Mxy,Mxz,Myz
+  double precision, dimension(:), allocatable :: xi_source,eta_source,gamma_source
+  double precision, dimension(:), allocatable :: t_cmt,hdur,hdur_gaussian
+  double precision, dimension(:), allocatable :: utm_x_source,utm_y_source
+  double precision, external :: comp_source_time_function
+  double precision :: t0
+
+! receiver information
+  character(len=150) rec_filename,filtered_rec_filename,dummystring
+  integer nrec,nrec_local,nrec_tot_found,irec_local,ios
+  integer, allocatable, dimension(:) :: islice_selected_rec,ispec_selected_rec,number_receiver_global
+  double precision, allocatable, dimension(:) :: xi_receiver,eta_receiver,gamma_receiver
+  double precision hlagrange
+! ADJOINT
+  integer nrec_simulation, nadj_rec_local
+! source frechet derivatives
+  real(kind=CUSTOM_REAL) :: displ_s(NDIM,NGLLX,NGLLY,NGLLZ), eps_s(NDIM,NDIM), eps_m_s(NDIM), stf_deltat
+  real(kind=CUSTOM_REAL), dimension(:), allocatable :: Mxx_der,Myy_der,Mzz_der,Mxy_der,Mxz_der,Myz_der
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: sloc_der
+  double precision, dimension(:,:), allocatable :: hpxir_store,hpetar_store,hpgammar_store
+! ADJOINT
+
+! timing information for the stations
+  double precision, allocatable, dimension(:,:,:) :: nu
+  character(len=MAX_LENGTH_STATION_NAME), allocatable, dimension(:) :: station_name
+  character(len=MAX_LENGTH_NETWORK_NAME), allocatable, dimension(:) :: network_name
+
+! seismograms
+  double precision dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: seismograms_d,seismograms_v,seismograms_a
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: seismograms_eps
+
+  integer i,j,k,ispec,irec,iglob
+
+! Gauss-Lobatto-Legendre points of integration and weights
+  double precision, dimension(NGLLX) :: xigll,wxgll
+  double precision, dimension(NGLLY) :: yigll,wygll
+  double precision, dimension(NGLLZ) :: zigll,wzgll
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
+  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! Lagrange interpolators at receivers
+  double precision, dimension(:), allocatable :: hxir,hetar,hpxir,hpetar,hgammar,hpgammar
+  double precision, dimension(:,:), allocatable :: hxir_store,hetar_store,hgammar_store
+
+! 2-D addressing and buffers for summation between slices
+  integer, dimension(NPOIN2DMAX_XMIN_XMAX_VAL) :: iboolleft_xi,iboolright_xi
+  integer, dimension(NPOIN2DMAX_YMIN_YMAX_VAL) :: iboolleft_eta,iboolright_eta
+
+! for addressing of the slices
+  integer, dimension(0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL) :: addressing
+  integer, dimension(0:NPROC_VAL-1) :: iproc_xi_slice,iproc_eta_slice
+
+! proc numbers for MPI
+  integer myrank,sizeprocs
+
+  integer npoin2D_xi,npoin2D_eta
+
+  integer iproc_xi,iproc_eta,iproc,iproc_read
+
+! maximum of the norm of the displacement
+  real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all
+! ADJOINT
+  real(kind=CUSTOM_REAL) b_Usolidnorm, b_Usolidnorm_all
+! ADJOINT
+
+! timer MPI
+  double precision, external :: wtime
+  integer ihours,iminutes,iseconds,int_tCPU
+  double precision time_start,tCPU
+
+! parameters read from parameter file
+  integer NER_SEDIM,NER_BASEMENT_SEDIM,NER_16_BASEMENT, &
+             NER_MOHO_16,NER_BOTTOM_MOHO,NEX_XI,NEX_ETA, &
+             NPROC_XI,NPROC_ETA,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,UTM_PROJECTION_ZONE,SIMULATION_TYPE
+  integer NSOURCES
+
+  double precision UTM_X_MIN,UTM_X_MAX,UTM_Y_MIN,UTM_Y_MAX,Z_DEPTH_BLOCK
+  double precision DT,LATITUDE_MIN,LATITUDE_MAX,LONGITUDE_MIN,LONGITUDE_MAX,HDUR_MOVIE
+  double precision THICKNESS_TAPER_BLOCK_HR,THICKNESS_TAPER_BLOCK_MR,VP_MIN_GOCAD,VP_VS_RATIO_GOCAD_TOP,VP_VS_RATIO_GOCAD_BOTTOM
+
+  logical HARVARD_3D_GOCAD_MODEL,TOPOGRAPHY,ATTENUATION,USE_OLSEN_ATTENUATION, &
+          OCEANS,IMPOSE_MINIMUM_VP_GOCAD,HAUKSSON_REGIONAL_MODEL, &
+          BASEMENT_MAP,MOHO_MAP_LUPEI,ABSORBING_CONDITIONS,SAVE_FORWARD
+  logical ANISOTROPY,SAVE_MESH_FILES,PRINT_SOURCE_TIME_FUNCTION
+
+  logical MOVIE_SURFACE,MOVIE_VOLUME,CREATE_SHAKEMAP,SAVE_DISPLACEMENT, &
+          USE_HIGHRES_FOR_MOVIES,SUPPRESS_UTM_PROJECTION,USE_REGULAR_MESH
+  integer NTSTEP_BETWEEN_FRAMES,NTSTEP_BETWEEN_OUTPUT_INFO
+
+  character(len=150) OUTPUT_FILES,LOCAL_PATH,prname,prname_Q,MODEL
+
+! parameters deduced from parameters read from file
+  integer NPROC,NEX_PER_PROC_XI,NEX_PER_PROC_ETA
+  integer NER
+
+  integer NSPEC2D_A_XI,NSPEC2D_B_XI, &
+               NSPEC2D_A_ETA,NSPEC2D_B_ETA, &
+               NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX, &
+               NSPEC2D_BOTTOM,NSPEC2D_TOP, &
+               NPOIN2DMAX_XMIN_XMAX,NPOIN2DMAX_YMIN_YMAX, &
+               NSPEC_AB, NGLOB_AB
+
+! names of the data files for all the processors in MPI
+  character(len=150) outputname
+
+! Stacey conditions put back
+  integer nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax,ispec2D
+  integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_VAL) :: nimin,nimax,nkmin_eta
+  integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_VAL) :: njmin,njmax,nkmin_xi
+  real(kind=CUSTOM_REAL) vx,vy,vz,nx,ny,nz,tx,ty,tz,vn,weight
+
+! to save movie frames
+  integer ipoin, nmovie_points, iloc, iorderi(NGNOD2D), iorderj(NGNOD2D)
+  real(kind=CUSTOM_REAL), dimension(:), allocatable :: &
+      store_val_x,store_val_y,store_val_z, &
+      store_val_ux,store_val_uy,store_val_uz, &
+      store_val_norm_displ,store_val_norm_veloc,store_val_norm_accel
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: &
+      store_val_x_all,store_val_y_all,store_val_z_all, &
+      store_val_ux_all,store_val_uy_all,store_val_uz_all
+
+! to save full 3D snapshot of velocity
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dvxdxl,dvxdyl,dvxdzl,dvydxl,dvydyl,dvydzl,dvzdxl,dvzdyl,dvzdzl
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:),allocatable::  div, curl_x, curl_y, curl_z
+
+! ************** PROGRAM STARTS HERE **************
+
+! sizeprocs returns number of processes started
+! (should be equal to NPROC)
+! myrank is the rank of each process, between 0 and sizeprocs-1.
+! as usual in MPI, process 0 is in charge of coordinating everything
+! and also takes care of the main output
+  call world_size(sizeprocs)
+  call world_rank(myrank)
+
+! read the parameter file
+  call read_parameter_file(LATITUDE_MIN,LATITUDE_MAX,LONGITUDE_MIN,LONGITUDE_MAX, &
+        UTM_X_MIN,UTM_X_MAX,UTM_Y_MIN,UTM_Y_MAX,Z_DEPTH_BLOCK, &
+        NER_SEDIM,NER_BASEMENT_SEDIM,NER_16_BASEMENT,NER_MOHO_16,NER_BOTTOM_MOHO, &
+        NEX_XI,NEX_ETA,NPROC_XI,NPROC_ETA,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,UTM_PROJECTION_ZONE,DT, &
+        ATTENUATION,USE_OLSEN_ATTENUATION,HARVARD_3D_GOCAD_MODEL,TOPOGRAPHY,LOCAL_PATH,NSOURCES, &
+        THICKNESS_TAPER_BLOCK_HR,THICKNESS_TAPER_BLOCK_MR,VP_MIN_GOCAD,VP_VS_RATIO_GOCAD_TOP,VP_VS_RATIO_GOCAD_BOTTOM, &
+        OCEANS,IMPOSE_MINIMUM_VP_GOCAD,HAUKSSON_REGIONAL_MODEL,ANISOTROPY, &
+        BASEMENT_MAP,MOHO_MAP_LUPEI,ABSORBING_CONDITIONS, &
+        MOVIE_SURFACE,MOVIE_VOLUME,CREATE_SHAKEMAP,SAVE_DISPLACEMENT, &
+        NTSTEP_BETWEEN_FRAMES,USE_HIGHRES_FOR_MOVIES,HDUR_MOVIE, &
+        SAVE_MESH_FILES,PRINT_SOURCE_TIME_FUNCTION, &
+        NTSTEP_BETWEEN_OUTPUT_INFO,SUPPRESS_UTM_PROJECTION,MODEL,USE_REGULAR_MESH,SIMULATION_TYPE,SAVE_FORWARD)
+
+  if (sizeprocs == 1 .and. (NPROC_XI /= 1 .or. NPROC_ETA /= 1)) then
+    stop 'must have NPROC_XI = NPROC_ETA = 1 for a serial run'
+  endif
+
+! check simulation type
+  if (SIMULATION_TYPE /= 1 .and. SIMULATION_TYPE /= 2 .and. SIMULATION_TYPE /= 3) &
+        call exit_mpi(myrank,'SIMULATION_TYPE can only be 1, 2, or 3')
+
+! check simulation parameters
+  if (SIMULATION_TYPE /= 1 .and. NSOURCES > 1000) call exit_mpi(myrank, 'for adjoint simulations, NSOURCES <= 1000')
+! LQY -- note: kernel simulations with attenuation turned on has been implemented
+
+! compute other parameters based upon values read
+  call compute_parameters(NER,NEX_XI,NEX_ETA,NPROC_XI,NPROC_ETA, &
+      NPROC,NEX_PER_PROC_XI,NEX_PER_PROC_ETA, &
+      NER_BOTTOM_MOHO,NER_MOHO_16,NER_16_BASEMENT,NER_BASEMENT_SEDIM,NER_SEDIM, &
+      NSPEC_AB,NSPEC2D_A_XI,NSPEC2D_B_XI, &
+      NSPEC2D_A_ETA,NSPEC2D_B_ETA, &
+      NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX,NSPEC2D_BOTTOM,NSPEC2D_TOP, &
+      NPOIN2DMAX_XMIN_XMAX,NPOIN2DMAX_YMIN_YMAX,NGLOB_AB,USE_REGULAR_MESH)
+
+! get the base pathname for output files
+  call get_value_string(OUTPUT_FILES, 'OUTPUT_FILES', 'OUTPUT_FILES')
+
+! open main output file, only written to by process 0
+  if(myrank == 0 .and. IMAIN /= ISTANDARD_OUTPUT) &
+    open(unit=IMAIN,file=trim(OUTPUT_FILES)//'/output_solver.txt',status='unknown')
+
+  if(myrank == 0) then
+
+  write(IMAIN,*)
+  write(IMAIN,*) '**********************************************'
+  write(IMAIN,*) '**** Specfem 3-D Solver - MPI version f90 ****'
+  write(IMAIN,*) '**********************************************'
+  write(IMAIN,*)
+  write(IMAIN,*)
+
+  if(FIX_UNDERFLOW_PROBLEM) write(IMAIN,*) 'Fixing slow underflow trapping problem using small initial field'
+
+  write(IMAIN,*)
+  write(IMAIN,*) 'There are ',sizeprocs,' MPI processes'
+  write(IMAIN,*) 'Processes are numbered from 0 to ',sizeprocs-1
+  write(IMAIN,*)
+
+  write(IMAIN,*) 'There are ',NEX_XI,' elements along xi'
+  write(IMAIN,*) 'There are ',NEX_ETA,' elements along eta'
+  write(IMAIN,*)
+  write(IMAIN,*) 'There are ',NPROC_XI,' slices along xi'
+  write(IMAIN,*) 'There are ',NPROC_ETA,' slices along eta'
+  write(IMAIN,*) 'There is a total of ',NPROC,' slices'
+
+  write(IMAIN,*)
+  write(IMAIN,*) ' NDIM = ',NDIM
+  write(IMAIN,*)
+  write(IMAIN,*) ' NGLLX = ',NGLLX
+  write(IMAIN,*) ' NGLLY = ',NGLLY
+  write(IMAIN,*) ' NGLLZ = ',NGLLZ
+  write(IMAIN,*)
+
+! write information about precision used for floating-point operations
+  if(CUSTOM_REAL == SIZE_REAL) then
+    write(IMAIN,*) 'using single precision for the calculations'
+  else
+    write(IMAIN,*) 'using double precision for the calculations'
+  endif
+  write(IMAIN,*)
+  write(IMAIN,*) 'smallest and largest possible floating-point numbers are: ',tiny(1._CUSTOM_REAL),huge(1._CUSTOM_REAL)
+  write(IMAIN,*)
+
+  endif
+
+! check that the code is running with the requested nb of processes
+  if(sizeprocs /= NPROC) call exit_MPI(myrank,'wrong number of MPI processes')
+
+! check that we have at least one source
+  if(NSOURCES < 1) call exit_MPI(myrank,'need at least one source')
+
+! open file with global slice number addressing
+  if(myrank == 0) then
+    open(unit=IIN,file=trim(OUTPUT_FILES)//'/addressing.txt',status='old',action='read')
+    do iproc = 0,NPROC-1
+      read(IIN,*) iproc_read,iproc_xi,iproc_eta
+      if(iproc_read /= iproc) call exit_MPI(myrank,'incorrect slice number read')
+      addressing(iproc_xi,iproc_eta) = iproc
+      iproc_xi_slice(iproc) = iproc_xi
+      iproc_eta_slice(iproc) = iproc_eta
+    enddo
+    close(IIN)
+  endif
+
+! broadcast the information read on the master to the nodes
+  call bcast_all_i(addressing,NPROC_XI*NPROC_ETA)
+  call bcast_all_i(iproc_xi_slice,NPROC)
+  call bcast_all_i(iproc_eta_slice,NPROC)
+
+! determine local slice coordinates using addressing
+  iproc_xi = iproc_xi_slice(myrank)
+  iproc_eta = iproc_eta_slice(myrank)
+
+! define maximum size for message buffers
+  NPOIN2DMAX_XY = max(NPOIN2DMAX_XMIN_XMAX,NPOIN2DMAX_YMIN_YMAX)
+
+! start reading the databases
+
+! read arrays created by the mesher
+  call read_arrays_solver(myrank,xstore,ystore,zstore, &
+            xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz,jacobian, &
+            flag_sediments,not_fully_in_bedrock,rho_vp,rho_vs,ANISOTROPY, &
+            c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
+            c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
+            c36store,c44store,c45store,c46store,c55store,c56store,c66store, &
+            kappastore,mustore,ibool,idoubling,rmass,rmass_ocean_load,LOCAL_PATH,OCEANS)
+
+! check that the number of points in this slice is correct
+  if(minval(ibool(:,:,:,:)) /= 1 .or. maxval(ibool(:,:,:,:)) /= NGLOB_AB_VAL) &
+      call exit_MPI(myrank,'incorrect global numbering: iboolmax does not equal NGLOB')
+
+! read 2-D addressing for summation between slices with MPI
+  call read_arrays_buffers_solver(myrank,iboolleft_xi, &
+     iboolright_xi,iboolleft_eta,iboolright_eta, &
+     npoin2D_xi,npoin2D_eta, &
+     NPOIN2DMAX_XMIN_XMAX,NPOIN2DMAX_YMIN_YMAX,LOCAL_PATH)
+
+! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
+
+  if(myrank == 0) then
+    write(IMAIN,*) '******************************************'
+    write(IMAIN,*) 'There are ',NEX_XI,' elements along xi'
+    write(IMAIN,*) 'There are ',NEX_ETA,' elements along eta'
+    write(IMAIN,*)
+    write(IMAIN,*) 'There are ',NPROC_XI,' slices along xi'
+    write(IMAIN,*) 'There are ',NPROC_ETA,' slices along eta'
+    write(IMAIN,*) 'There is a total of ',NPROC,' slices'
+    write(IMAIN,*) '******************************************'
+    write(IMAIN,*)
+  endif
+
+! set up GLL points, weights and derivation matrices
+  call define_derivation_matrices(xigll,yigll,zigll,wxgll,wygll,wzgll, &
+         hprime_xx,hprime_yy,hprime_zz, &
+         hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+         wgllwgll_xy,wgllwgll_xz,wgllwgll_yz)
+
+! allocate 1-D Lagrange interpolators and derivatives
+  allocate(hxir(NGLLX))
+  allocate(hpxir(NGLLX))
+  allocate(hetar(NGLLY))
+  allocate(hpetar(NGLLY))
+  allocate(hgammar(NGLLZ))
+  allocate(hpgammar(NGLLZ))
+
+! create name of database
+  call create_name_database(prname,myrank,LOCAL_PATH)
+  if (ATTENUATION .and. ((SIMULATION_TYPE == 1 .and. SAVE_FORWARD) .or. SIMULATION_TYPE == 3)) &
+           call create_name_database(prname_Q,myrank,LOCAL_PATH_Q)
+
+! boundary parameters
+  open(unit=27,file=prname(1:len_trim(prname))//'ibelm.bin',status='old',action='read',form='unformatted')
+  read(27) ibelm_xmin
+  read(27) ibelm_xmax
+  read(27) ibelm_ymin
+  read(27) ibelm_ymax
+  read(27) ibelm_bottom
+  read(27) ibelm_top
+  close(27)
+
+  open(unit=27,file=prname(1:len_trim(prname))//'normal.bin',status='old',action='read',form='unformatted')
+  read(27) normal_xmin
+  read(27) normal_xmax
+  read(27) normal_ymin
+  read(27) normal_ymax
+  read(27) normal_bottom
+  read(27) normal_top
+  close(27)
+
+! moho boundary
+  if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
+
+    moho_kl = ZERO
+
+    open(unit=27,file=prname(1:len_trim(prname))//'ibelm_moho.bin',status='unknown',form='unformatted')
+    read(27) nspec2D_moho
+    read(27) njunk
+    read(27) njunk
+    read(27) ibelm_moho_top
+    read(27) ibelm_moho_bot
+    close(27)
+    if (nspec2D_moho /= NSPEC2D_BOTTOM) call exit_mpi(myrank, "nspec2D_moho /= NSPEC2D_BOTTOM for Moho mesh")
+
+    open(unit=27,file=prname(1:len_trim(prname))//'normal_moho.bin',status='old',form='unformatted')
+    read(27) normal_moho
+    close(27)
+
+    open(unit=27,file=prname(1:len_trim(prname))//'is_moho.bin',status='old',form='unformatted')
+    read(27) is_moho_top
+    read(27) is_moho_bot
+    close(27)
+
+  endif
+
+! Stacey put back
+  open(unit=27,file=prname(1:len_trim(prname))//'nspec2D.bin',status='unknown',form='unformatted')
+  read(27) nspec2D_xmin
+  read(27) nspec2D_xmax
+  read(27) nspec2D_ymin
+  read(27) nspec2D_ymax
+  close(27)
+
+  open(unit=27,file=prname(1:len_trim(prname))//'jacobian2D.bin',status='old',action='read',form='unformatted')
+  read(27) jacobian2D_xmin
+  read(27) jacobian2D_xmax
+  read(27) jacobian2D_ymin
+  read(27) jacobian2D_ymax
+  read(27) jacobian2D_bottom
+  read(27) jacobian2D_top
+  close(27)
+
+! Stacey put back
+! read arrays for Stacey conditions
+
+  if(ABSORBING_CONDITIONS) then
+      open(unit=27,file=prname(1:len_trim(prname))//'nimin.bin',status='unknown',form='unformatted')
+      read(27) nimin
+      close(27)
+
+      open(unit=27,file=prname(1:len_trim(prname))//'nimax.bin',status='unknown',form='unformatted')
+      read(27) nimax
+      close(27)
+
+      open(unit=27,file=prname(1:len_trim(prname))//'njmin.bin',status='unknown',form='unformatted')
+      read(27) njmin
+      close(27)
+
+      open(unit=27,file=prname(1:len_trim(prname))//'njmax.bin',status='unknown',form='unformatted')
+      read(27) njmax
+      close(27)
+
+      open(unit=27,file=prname(1:len_trim(prname))//'nkmin_xi.bin',status='unknown',form='unformatted')
+      read(27) nkmin_xi
+      close(27)
+
+      open(unit=27,file=prname(1:len_trim(prname))//'nkmin_eta.bin',status='unknown',form='unformatted')
+      read(27) nkmin_eta
+      close(27)
+
+! read in absorbing wavefield saved by forward simulations
+      if (nspec2D_xmin > 0 .and. (SIMULATION_TYPE == 3 .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+        allocate(absorb_xmin(NDIM,NGLLY,NGLLZ,nspec2D_xmin))
+        reclen_xmin = CUSTOM_REAL * (NDIM * NGLLY * NGLLZ * nspec2D_xmin)
+        if (SIMULATION_TYPE == 3) then
+          open(unit=31,file=trim(prname)//'absorb_xmin.bin',status='old',action='read',form='unformatted',access='direct', &
+                recl=reclen_xmin+2*4)
+        else
+          open(unit=31,file=trim(prname)//'absorb_xmin.bin',status='unknown',form='unformatted',access='direct',&
+                recl=reclen_xmin+2*4)
+        endif
+      endif
+
+      if (nspec2D_xmax > 0 .and. (SIMULATION_TYPE == 3 .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+        allocate(absorb_xmax(NDIM,NGLLY,NGLLZ,nspec2D_xmax))
+        reclen_xmax = CUSTOM_REAL * (NDIM * NGLLY * NGLLZ * nspec2D_xmax)
+        if (SIMULATION_TYPE == 3) then
+          open(unit=32,file=trim(prname)//'absorb_xmax.bin',status='old',action='read',form='unformatted',access='direct', &
+                recl=reclen_xmax+2*4)
+        else
+          open(unit=32,file=trim(prname)//'absorb_xmax.bin',status='unknown',form='unformatted',access='direct', &
+                recl=reclen_xmax+2*4)
+        endif
+      endif
+
+      if (nspec2D_ymin > 0 .and. (SIMULATION_TYPE == 3 .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+        allocate(absorb_ymin(NDIM,NGLLX,NGLLZ,nspec2D_ymin))
+        reclen_ymin = CUSTOM_REAL * (NDIM * NGLLX * NGLLZ * nspec2D_ymin)
+        if (SIMULATION_TYPE == 3) then
+          open(unit=33,file=trim(prname)//'absorb_ymin.bin',status='old',action='read',form='unformatted',access='direct',&
+                recl=reclen_ymin+2*4)
+        else
+          open(unit=33,file=trim(prname)//'absorb_ymin.bin',status='unknown',form='unformatted',access='direct',&
+                recl=reclen_ymin+2*4)
+        endif
+      endif
+
+      if (nspec2D_ymax > 0 .and. (SIMULATION_TYPE == 3 .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+        allocate(absorb_ymax(NDIM,NGLLX,NGLLZ,nspec2D_ymax))
+        reclen_ymax = CUSTOM_REAL * (NDIM * NGLLX * NGLLZ * nspec2D_ymax)
+        if (SIMULATION_TYPE == 3) then
+          open(unit=34,file=trim(prname)//'absorb_ymax.bin',status='old',action='read',form='unformatted',access='direct',&
+                recl=reclen_ymax+2*4)
+        else
+          open(unit=34,file=trim(prname)//'absorb_ymax.bin',status='unknown',form='unformatted',access='direct',&
+                recl=reclen_ymax+2*4)
+        endif
+      endif
+
+      if (NSPEC2D_BOTTOM > 0 .and. (SIMULATION_TYPE == 3 .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+        allocate(absorb_zmin(NDIM,NGLLX,NGLLY,NSPEC2D_BOTTOM))
+        reclen_zmin = CUSTOM_REAL * (NDIM * NGLLX * NGLLY * NSPEC2D_BOTTOM)
+         if (SIMULATION_TYPE == 3) then
+         open(unit=35,file=trim(prname)//'absorb_zmin.bin',status='old',action='read',form='unformatted',access='direct',&
+                recl=reclen_zmin+2*4)
+        else
+          open(unit=35,file=trim(prname)//'absorb_zmin.bin',status='unknown',form='unformatted',access='direct',&
+                recl=reclen_zmin+2*4)
+        endif
+      endif
+
+  endif
+
+
+! $$$$$$$$$$$$$$$$$$$$$$$$ SOURCES $$$$$$$$$$$$$$$$$
+
+! read topography and bathymetry file
+  if(TOPOGRAPHY .or. OCEANS) then
+
+    NX_TOPO = NX_TOPO_SOCAL
+    NY_TOPO = NY_TOPO_SOCAL
+    ORIG_LAT_TOPO = ORIG_LAT_TOPO_SOCAL
+    ORIG_LONG_TOPO = ORIG_LONG_TOPO_SOCAL
+    DEGREES_PER_CELL_TOPO = DEGREES_PER_CELL_TOPO_SOCAL
+    topo_file = TOPO_FILE_SOCAL
+
+    allocate(itopo_bathy(NX_TOPO,NY_TOPO))
+
+    call read_topo_bathy_file(itopo_bathy,NX_TOPO,NY_TOPO,topo_file)
+
+    if(myrank == 0) then
+      write(IMAIN,*)
+      write(IMAIN,*) 'regional topography file read ranges in m from ', &
+        minval(itopo_bathy),' to ',maxval(itopo_bathy)
+      write(IMAIN,*)
+    endif
+
+  endif
+
+! write source and receiver VTK files for Paraview
+  if (myrank == 0) then
+    open(IOVTK,file=trim(OUTPUT_FILES)//'/sr.vtk',status='unknown')
+    write(IOVTK,'(a)') '# vtk DataFile Version 2.0'
+    write(IOVTK,'(a)') 'Source and Receiver VTK file'
+    write(IOVTK,'(a)') 'ASCII'
+    write(IOVTK,'(a)') 'DATASET POLYDATA'
+    ! LQY -- cannot figure out NSOURCES+nrec at this point
+    write(IOVTK, '(a,i6,a)') 'POINTS ', 2, ' float'
+  endif
+
+! allocate arrays for source
+  allocate(islice_selected_source(NSOURCES))
+  allocate(ispec_selected_source(NSOURCES))
+  allocate(Mxx(NSOURCES))
+  allocate(Myy(NSOURCES))
+  allocate(Mzz(NSOURCES))
+  allocate(Mxy(NSOURCES))
+  allocate(Mxz(NSOURCES))
+  allocate(Myz(NSOURCES))
+  allocate(xi_source(NSOURCES))
+  allocate(eta_source(NSOURCES))
+  allocate(gamma_source(NSOURCES))
+  allocate(t_cmt(NSOURCES))
+  allocate(hdur(NSOURCES))
+  allocate(hdur_gaussian(NSOURCES))
+  allocate(utm_x_source(NSOURCES))
+  allocate(utm_y_source(NSOURCES))
+
+! locate sources in the mesh
+  call locate_source(ibool,NSOURCES,myrank,NSPEC_AB,NGLOB_AB, &
+          xstore,ystore,zstore,xigll,yigll,zigll,NPROC, &
+          sec,t_cmt,yr,jda,ho,mi,utm_x_source,utm_y_source, &
+          NSTEP,DT,hdur,Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
+          islice_selected_source,ispec_selected_source, &
+          xi_source,eta_source,gamma_source, &
+          LATITUDE_MIN,LATITUDE_MAX,LONGITUDE_MIN,LONGITUDE_MAX,Z_DEPTH_BLOCK, &
+          TOPOGRAPHY,itopo_bathy,UTM_PROJECTION_ZONE, &
+          PRINT_SOURCE_TIME_FUNCTION,SUPPRESS_UTM_PROJECTION, &
+          NX_TOPO,NY_TOPO,ORIG_LAT_TOPO,ORIG_LONG_TOPO,DEGREES_PER_CELL_TOPO)
+
+  if(minval(t_cmt) /= 0.) call exit_MPI(myrank,'one t_cmt must be zero, others must be positive')
+
+! filter source time function by Gaussian with hdur = HDUR_MOVIE when outputing movies or shakemaps
+  if (MOVIE_SURFACE .or. MOVIE_VOLUME .or. CREATE_SHAKEMAP) then
+     hdur = sqrt(hdur**2 + HDUR_MOVIE**2)
+     if(myrank == 0) then
+        write(IMAIN,*)
+        write(IMAIN,*) 'Each source is being convolved with HDUR_MOVIE = ',HDUR_MOVIE
+        write(IMAIN,*)
+     endif
+  endif
+! convert the half duration for triangle STF to the one for gaussian STF
+  hdur_gaussian = hdur/SOURCE_DECAY_MIMIC_TRIANGLE
+
+! define t0 as the earliest start time
+  t0 = - 1.5d0 * minval(t_cmt-hdur)
+
+!$$$$$$$$$$$$$$$$$$ RECEIVERS $$$$$$$$$$$$$$$$$$$$$
+
+  if (SIMULATION_TYPE == 1) then
+    call get_value_string(filtered_rec_filename, 'solver.STATIONS_FILTERED', 'DATA/STATIONS_FILTERED')
+
+! get total number of stations
+    open(unit=IIN,file=filtered_rec_filename,iostat=ios,status='old',action='read')
+    nrec = 0
+    do while(ios == 0)
+      read(IIN,"(a)",iostat=ios) dummystring
+      if(ios == 0) nrec = nrec + 1
+    enddo
+    close(IIN)
+
+    if(nrec < 1) call exit_MPI(myrank,'need at least one receiver')
+
+  else
+    call get_value_string(rec_filename, 'solver.STATIONS', 'DATA/STATIONS_ADJOINT')
+    call get_value_string(filtered_rec_filename, 'solver.STATIONS_FILTERED', 'DATA/STATIONS_ADJOINT_FILTERED')
+    call station_filter(myrank,rec_filename,filtered_rec_filename,nrec, &
+           LATITUDE_MIN, LATITUDE_MAX, LONGITUDE_MIN, LONGITUDE_MAX)
+    if (nrec < 1) call exit_MPI(myrank, 'adjoint simulation needs at least one source')
+    call sync_all()
+  endif
+
+  if(myrank == 0) then
+    write(IMAIN,*)
+    if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+      write(IMAIN,*) 'Total number of receivers = ', nrec
+    else
+      write(IMAIN,*) 'Total number of adjoint sources = ', nrec
+    endif
+    write(IMAIN,*)
+  endif
+
+  if(nrec < 1) call exit_MPI(myrank,'need at least one receiver')
+
+! allocate memory for receiver arrays
+  allocate(islice_selected_rec(nrec))
+  allocate(ispec_selected_rec(nrec))
+  allocate(xi_receiver(nrec))
+  allocate(eta_receiver(nrec))
+  allocate(gamma_receiver(nrec))
+  allocate(station_name(nrec))
+  allocate(network_name(nrec))
+  allocate(nu(NDIM,NDIM,nrec))
+
+! locate receivers in the mesh
+  call locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, &
+            xstore,ystore,zstore,xigll,yigll,zigll,filtered_rec_filename, &
+            nrec,islice_selected_rec,ispec_selected_rec, &
+            xi_receiver,eta_receiver,gamma_receiver,station_name,network_name,nu, &
+            NPROC,utm_x_source(1),utm_y_source(1), &
+            TOPOGRAPHY,itopo_bathy,UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION, &
+            NX_TOPO,NY_TOPO,ORIG_LAT_TOPO,ORIG_LONG_TOPO,DEGREES_PER_CELL_TOPO)
+
+
+!###################### SOURCE ARRAYS ################
+
+  if (SIMULATION_TYPE == 1  .or. SIMULATION_TYPE == 3) then
+    allocate(sourcearray(NDIM,NGLLX,NGLLY,NGLLZ))
+    allocate(sourcearrays(NSOURCES,NDIM,NGLLX,NGLLY,NGLLZ))
+
+! compute source arrays
+    do isource = 1,NSOURCES
+
+!   check that the source slice number is okay
+      if(islice_selected_source(isource) < 0 .or. islice_selected_source(isource) > NPROC-1) &
+            call exit_MPI(myrank,'something is wrong with the source slice number')
+
+!   compute source arrays in source slice
+      if(myrank == islice_selected_source(isource)) then
+        call compute_arrays_source(ispec_selected_source(isource), &
+              xi_source(isource),eta_source(isource),gamma_source(isource),sourcearray, &
+              Mxx(isource),Myy(isource),Mzz(isource),Mxy(isource),Mxz(isource),Myz(isource), &
+              xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+              xigll,yigll,zigll,NSPEC_AB)
+        sourcearrays(isource,:,:,:,:) = sourcearray(:,:,:,:)
+      endif
+    enddo
+  endif
+
+  if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
+    nadj_rec_local = 0
+    do irec = 1,nrec
+      if(myrank == islice_selected_rec(irec))then
+!   check that the source slice number is okay
+        if(islice_selected_rec(irec) < 0 .or. islice_selected_rec(irec) > NPROC-1) &
+              call exit_MPI(myrank,'something is wrong with the source slice number in adjoint simulation')
+        nadj_rec_local = nadj_rec_local + 1
+      endif
+    enddo
+    allocate(adj_sourcearray(NSTEP,NDIM,NGLLX,NGLLY,NGLLZ))
+    if (nadj_rec_local > 0) allocate(adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ))
+    irec_local = 0
+    do irec = 1, nrec
+!   compute only adjoint source arrays in the local slice
+      if(myrank == islice_selected_rec(irec)) then
+        irec_local = irec_local + 1
+        adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
+        call compute_arrays_adjoint_source(myrank, adj_source_file, &
+              xi_receiver(irec), eta_receiver(irec), gamma_receiver(irec), &
+              adj_sourcearray, xigll,yigll,zigll,NSTEP)
+
+        adj_sourcearrays(irec_local,:,:,:,:,:) = adj_sourcearray(:,:,:,:,:)
+
+      endif
+    enddo
+  endif
+
+!--- select local receivers
+
+! count number of receivers located in this slice
+  nrec_local = 0
+  if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+    nrec_simulation = nrec
+    do irec = 1,nrec
+      if(myrank == islice_selected_rec(irec)) nrec_local = nrec_local + 1
+    enddo
+  else
+    nrec_simulation = NSOURCES
+    do isource = 1, NSOURCES
+      if(myrank == islice_selected_source(isource)) nrec_local = nrec_local + 1
+    enddo
+  endif
+
+  if (nrec_local > 0) then
+! allocate Lagrange interpolators for receivers
+  allocate(hxir_store(nrec_local,NGLLX))
+  allocate(hetar_store(nrec_local,NGLLY))
+  allocate(hgammar_store(nrec_local,NGLLZ))
+
+! define local to global receiver numbering mapping
+  allocate(number_receiver_global(nrec_local))
+  irec_local = 0
+  if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+    do irec = 1,nrec
+    if(myrank == islice_selected_rec(irec)) then
+      irec_local = irec_local + 1
+      number_receiver_global(irec_local) = irec
+    endif
+    enddo
+  else
+  do isource = 1,NSOURCES
+    if(myrank == islice_selected_source(isource)) then
+      irec_local = irec_local + 1
+      number_receiver_global(irec_local) = isource
+    endif
+  enddo
+  endif
+
+! define and store Lagrange interpolators at all the receivers
+  if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+    do irec_local = 1,nrec_local
+      irec = number_receiver_global(irec_local)
+      call lagrange_any(xi_receiver(irec),NGLLX,xigll,hxir,hpxir)
+      call lagrange_any(eta_receiver(irec),NGLLY,yigll,hetar,hpetar)
+      call lagrange_any(gamma_receiver(irec),NGLLZ,zigll,hgammar,hpgammar)
+      hxir_store(irec_local,:) = hxir(:)
+      hetar_store(irec_local,:) = hetar(:)
+      hgammar_store(irec_local,:) = hgammar(:)
+    enddo
+  else
+    allocate(hpxir_store(nrec_local,NGLLX))
+    allocate(hpetar_store(nrec_local,NGLLY))
+    allocate(hpgammar_store(nrec_local,NGLLZ))
+    do irec_local = 1,nrec_local
+      irec = number_receiver_global(irec_local)
+      call lagrange_any(xi_source(irec),NGLLX,xigll,hxir,hpxir)
+      call lagrange_any(eta_source(irec),NGLLY,yigll,hetar,hpetar)
+      call lagrange_any(gamma_source(irec),NGLLZ,zigll,hgammar,hpgammar)
+      hxir_store(irec_local,:) = hxir(:)
+      hetar_store(irec_local,:) = hetar(:)
+      hgammar_store(irec_local,:) = hgammar(:)
+      hpxir_store(irec_local,:) = hpxir(:)
+      hpetar_store(irec_local,:) = hpetar(:)
+      hpgammar_store(irec_local,:) = hpgammar(:)
+    enddo
+  endif
+  endif ! nrec_local > 0
+
+! check that the sum of the number of receivers in each slice is nrec
+  call sum_all_i(nrec_local,nrec_tot_found)
+  if(myrank == 0) then
+
+    close(IOVTK)
+
+    write(IMAIN,*)
+    write(IMAIN,*) 'Total number of samples for seismograms = ',NSTEP
+    write(IMAIN,*)
+    write(IMAIN,*)
+    write(IMAIN,*) 'found a total of ',nrec_tot_found,' receivers in all the slices'
+    if(nrec_tot_found /= nrec_simulation) then
+      call exit_MPI(myrank,'problem when dispatching the receivers')
+    else
+      write(IMAIN,*) 'this total is okay'
+    endif
+  endif
+
+  if(myrank == 0) then
+
+  if(NSOURCES > 1) write(IMAIN,*) 'Using ',NSOURCES,' point sources'
+
+  write(IMAIN,*)
+  if(TOPOGRAPHY) then
+    write(IMAIN,*) 'incorporating surface topography'
+  else
+    write(IMAIN,*) 'no surface topography'
+  endif
+
+  write(IMAIN,*)
+  if(SUPPRESS_UTM_PROJECTION) then
+    write(IMAIN,*) 'suppressing UTM projection'
+  else
+    write(IMAIN,*) 'using UTM projection in region ',UTM_PROJECTION_ZONE
+  endif
+
+  write(IMAIN,*)
+  if(HARVARD_3D_GOCAD_MODEL) then
+    write(IMAIN,*) 'incorporating 3-D lateral variations'
+  else
+    write(IMAIN,*) 'no 3-D lateral variations'
+  endif
+
+  write(IMAIN,*)
+  if(ATTENUATION) then
+    write(IMAIN,*) 'incorporating attenuation using ',N_SLS,' standard linear solids'
+    if(USE_OLSEN_ATTENUATION) then
+      write(IMAIN,*) 'using Olsen''s attenuation'
+    else
+      write(IMAIN,*) 'not using Olsen''s attenuation'
+    endif
+  else
+    write(IMAIN,*) 'no attenuation'
+  endif
+
+  write(IMAIN,*)
+  if(OCEANS) then
+    write(IMAIN,*) 'incorporating the oceans using equivalent load'
+  else
+    write(IMAIN,*) 'no oceans'
+  endif
+
+  endif
+
+! synchronize all the processes before assembling the mass matrix
+! to make sure all the nodes have finished to read their databases
+  call sync_all()
+
+! the mass matrix needs to be assembled with MPI here once and for all
+  call assemble_MPI_scalar(rmass,iproc_xi,iproc_eta,addressing, &
+            iboolleft_xi,iboolright_xi,iboolleft_eta,iboolright_eta, &
+            buffer_send_faces_scalar,buffer_received_faces_scalar,npoin2D_xi,npoin2D_eta, &
+            NPROC_XI,NPROC_ETA,NPOIN2DMAX_XMIN_XMAX,NPOIN2DMAX_YMIN_YMAX,NPOIN2DMAX_XY)
+
+  if(myrank == 0) write(IMAIN,*) 'end assembling MPI mass matrix'
+
+! check that mass matrix is positive
+  if(minval(rmass(:)) <= 0.) call exit_MPI(myrank,'negative mass matrix term')
+  if(OCEANS .and. minval(rmass_ocean_load(:)) <= 0.) &
+       call exit_MPI(myrank,'negative ocean load mass matrix term')
+
+! for efficiency, invert final mass matrix once and for all in each slice
+  if(OCEANS) rmass_ocean_load(:) = 1. / rmass_ocean_load(:)
+  rmass(:) = 1.0 / rmass(:)
+
+! if attenuation is on, shift PREM to right frequency
+! rescale mu in PREM to average frequency for attenuation
+
+  if(ATTENUATION) then
+
+! get and store PREM attenuation model
+    do iattenuation = 1,NUM_REGIONS_ATTENUATION
+
+      call get_attenuation_model(myrank,iattenuation,tau_mu_dble, &
+        tau_sigma_dble,beta_dble,one_minus_sum_beta_dble,factor_scale_dble)
+
+! distinguish between single and double precision for reals
+      if(CUSTOM_REAL == SIZE_REAL) then
+        tau_mu(iattenuation,:) = sngl(tau_mu_dble(:))
+        tau_sigma(iattenuation,:) = sngl(tau_sigma_dble(:))
+        beta(iattenuation,:) = sngl(beta_dble(:))
+        factor_scale(iattenuation) = sngl(factor_scale_dble)
+        one_minus_sum_beta(iattenuation) = sngl(one_minus_sum_beta_dble)
+      else
+        tau_mu(iattenuation,:) = tau_mu_dble(:)
+        tau_sigma(iattenuation,:) = tau_sigma_dble(:)
+        beta(iattenuation,:) = beta_dble(:)
+        factor_scale(iattenuation) = factor_scale_dble
+        one_minus_sum_beta(iattenuation) = one_minus_sum_beta_dble
+      endif
+    enddo
+
+! rescale shear modulus according to attenuation model
+
+    do ispec = 1,NSPEC_AB
+    if(not_fully_in_bedrock(ispec)) then
+      do k=1,NGLLZ
+        do j=1,NGLLY
+          do i=1,NGLLX
+
+! distinguish attenuation factors
+   if(flag_sediments(i,j,k,ispec)) then
+
+! use constant attenuation of Q = 90
+! or use scaling rule similar to Olsen et al. (2003)
+! We might need to fix the attenuation part for the anisotropy case
+! At this stage, we turn the ATTENUATION flag off always, and still keep mustore
+     if(USE_OLSEN_ATTENUATION) then
+       vs_val = mustore(i,j,k,ispec) / rho_vs(i,j,k,ispec)
+! use rule Q_mu = constant * v_s
+       Q_mu = OLSEN_ATTENUATION_RATIO * vs_val
+       int_Q_mu = 10 * nint(Q_mu / 10.)
+       if(int_Q_mu < 40) int_Q_mu = 40
+       if(int_Q_mu > 150) int_Q_mu = 150
+
+       if(int_Q_mu == 40) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_40
+       else if(int_Q_mu == 50) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_50
+       else if(int_Q_mu == 60) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_60
+       else if(int_Q_mu == 70) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_70
+       else if(int_Q_mu == 80) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_80
+       else if(int_Q_mu == 90) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_90
+       else if(int_Q_mu == 100) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_100
+       else if(int_Q_mu == 110) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_110
+       else if(int_Q_mu == 120) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_120
+       else if(int_Q_mu == 130) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_130
+       else if(int_Q_mu == 140) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_140
+       else if(int_Q_mu == 150) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_150
+       else
+         stop 'incorrect attenuation coefficient'
+       endif
+
+     else
+       iattenuation_sediments = IATTENUATION_SEDIMENTS_90
+     endif
+
+     scale_factor = factor_scale(iattenuation_sediments)
+   else
+     scale_factor = factor_scale(IATTENUATION_BEDROCK)
+   endif
+
+      mustore(i,j,k,ispec) = mustore(i,j,k,ispec) * scale_factor
+
+          enddo
+        enddo
+      enddo
+    endif
+    enddo
+
+  endif
+
+! allocate seismogram array
+  if (nrec_local > 0) then
+  allocate(seismograms_d(NDIM,nrec_local,NSTEP))
+  allocate(seismograms_v(NDIM,nrec_local,NSTEP))
+  allocate(seismograms_a(NDIM,nrec_local,NSTEP))
+! initialize seismograms
+  seismograms_d(:,:,:) = 0._CUSTOM_REAL
+  seismograms_v(:,:,:) = 0._CUSTOM_REAL
+  seismograms_a(:,:,:) = 0._CUSTOM_REAL
+  if (SIMULATION_TYPE == 2) then
+    ! allocate Frechet derivatives array
+    allocate(Mxx_der(nrec_local),Myy_der(nrec_local),Mzz_der(nrec_local),Mxy_der(nrec_local), &
+               Mxz_der(nrec_local),Myz_der(nrec_local), sloc_der(NDIM,nrec_local))
+    Mxx_der = 0._CUSTOM_REAL
+    Myy_der = 0._CUSTOM_REAL
+    Mzz_der = 0._CUSTOM_REAL
+    Mxy_der = 0._CUSTOM_REAL
+    Mxz_der = 0._CUSTOM_REAL
+    Myz_der = 0._CUSTOM_REAL
+    sloc_der = 0._CUSTOM_REAL
+    allocate(seismograms_eps(NDIM,NDIM,nrec_local,NSTEP))
+    seismograms_eps(:,:,:,:) = 0._CUSTOM_REAL
+  endif
+  endif
+
+! initialize arrays to zero
+  displ(:,:) = 0._CUSTOM_REAL
+  veloc(:,:) = 0._CUSTOM_REAL
+  accel(:,:) = 0._CUSTOM_REAL
+
+! put negligible initial value to avoid very slow underflow trapping
+  if(FIX_UNDERFLOW_PROBLEM) displ(:,:) = VERYSMALLVAL
+
+  if (SIMULATION_TYPE == 3)  then ! kernel calculation, read in last frame
+
+  open(unit=27,file=trim(prname)//'save_forward_arrays.bin',status='old',action='read',form='unformatted')
+  read(27) b_displ
+  read(27) b_veloc
+  read(27) b_accel
+
+  rho_kl(:,:,:,:) = 0._CUSTOM_REAL
+  mu_kl(:,:,:,:) = 0._CUSTOM_REAL
+  kappa_kl(:,:,:,:) = 0._CUSTOM_REAL
+
+  endif
+
+! allocate files to save movies and shaking map
+  if(MOVIE_SURFACE .or. CREATE_SHAKEMAP) then
+    if (USE_HIGHRES_FOR_MOVIES) then
+      nmovie_points = NGLLX * NGLLY * NSPEC2D_TOP
+    else
+      nmovie_points = NGNOD2D * NSPEC2D_TOP
+      iorderi(1) = 1
+      iorderi(2) = NGLLX
+      iorderi(3) = NGLLX
+      iorderi(4) = 1
+      iorderj(1) = 1
+      iorderj(2) = 1
+      iorderj(3) = NGLLY
+      iorderj(4) = NGLLY
+    endif
+    allocate(store_val_x(nmovie_points))
+    allocate(store_val_y(nmovie_points))
+    allocate(store_val_z(nmovie_points))
+    allocate(store_val_ux(nmovie_points))
+    allocate(store_val_uy(nmovie_points))
+    allocate(store_val_uz(nmovie_points))
+    allocate(store_val_norm_displ(nmovie_points))
+    allocate(store_val_norm_veloc(nmovie_points))
+    allocate(store_val_norm_accel(nmovie_points))
+
+    allocate(store_val_x_all(nmovie_points,0:NPROC-1))
+    allocate(store_val_y_all(nmovie_points,0:NPROC-1))
+    allocate(store_val_z_all(nmovie_points,0:NPROC-1))
+    allocate(store_val_ux_all(nmovie_points,0:NPROC-1))
+    allocate(store_val_uy_all(nmovie_points,0:NPROC-1))
+    allocate(store_val_uz_all(nmovie_points,0:NPROC-1))
+
+! to compute max of norm for shaking map
+    store_val_norm_displ(:) = -1.
+    store_val_norm_veloc(:) = -1.
+    store_val_norm_accel(:) = -1.
+  else if (MOVIE_VOLUME) then
+    allocate(div(NGLLX,NGLLY,NGLLZ,NSPEC_AB))
+    allocate(curl_x(NGLLX,NGLLY,NGLLZ,NSPEC_AB))
+    allocate(curl_y(NGLLX,NGLLY,NGLLZ,NSPEC_AB))
+    allocate(curl_z(NGLLX,NGLLY,NGLLZ,NSPEC_AB))
+  endif
+
+  if(myrank == 0) then
+    write(IMAIN,*)
+    write(IMAIN,*) '           time step: ',sngl(DT),' s'
+    write(IMAIN,*) 'number of time steps: ',NSTEP
+    write(IMAIN,*) 'total simulated time: ',sngl(NSTEP*DT),' seconds'
+    write(IMAIN,*)
+  endif
+
+! distinguish between single and double precision for reals
+  if(CUSTOM_REAL == SIZE_REAL) then
+    deltat = sngl(DT)
+  else
+    deltat = DT
+  endif
+  deltatover2 = deltat/2.
+  deltatsqover2 = deltat*deltat/2.
+  if (SIMULATION_TYPE == 3) then
+    if(CUSTOM_REAL == SIZE_REAL) then
+      b_deltat = - sngl(DT)
+    else
+      b_deltat = - DT
+    endif
+    b_deltatover2 = b_deltat/2.
+    b_deltatsqover2 = b_deltat*b_deltat/2.
+  endif
+
+! precompute Runge-Kutta coefficients if attenuation
+  if(ATTENUATION) then
+    tauinv(:,:) = - 1. / tau_sigma(:,:)
+    factor_common(:,:) = 2. * beta(:,:) * tauinv(:,:)
+    alphaval(:,:) = 1 + deltat*tauinv(:,:) + deltat**2*tauinv(:,:)**2 / 2. + &
+      deltat**3*tauinv(:,:)**3 / 6. + deltat**4*tauinv(:,:)**4 / 24.
+    betaval(:,:) = deltat / 2. + deltat**2*tauinv(:,:) / 3. + deltat**3*tauinv(:,:)**2 / 8. + deltat**4*tauinv(:,:)**3 / 24.
+    gammaval(:,:) = deltat / 2. + deltat**2*tauinv(:,:) / 6. + deltat**3*tauinv(:,:)**2 / 24.
+    if (SIMULATION_TYPE == 3) then
+      b_alphaval(:,:) = 1 + b_deltat*tauinv(:,:) + b_deltat**2*tauinv(:,:)**2 / 2. + &
+            b_deltat**3*tauinv(:,:)**3 / 6. + b_deltat**4*tauinv(:,:)**4 / 24.
+      b_betaval(:,:) = b_deltat / 2. + b_deltat**2*tauinv(:,:) / 3. + &
+            b_deltat**3*tauinv(:,:)**2 / 8. + b_deltat**4*tauinv(:,:)**3 / 24.
+      b_gammaval(:,:) = b_deltat / 2. + b_deltat**2*tauinv(:,:) / 6. + &
+            b_deltat**3*tauinv(:,:)**2 / 24.
+    endif
+  endif
+
+! clear memory variables if attenuation
+  if(ATTENUATION) then
+
+   ! initialize memory variables for attenuation
+    epsilondev_xx(:,:,:,:) = 0._CUSTOM_REAL
+    epsilondev_yy(:,:,:,:) = 0._CUSTOM_REAL
+    epsilondev_xy(:,:,:,:) = 0._CUSTOM_REAL
+    epsilondev_xz(:,:,:,:) = 0._CUSTOM_REAL
+    epsilondev_yz(:,:,:,:) = 0._CUSTOM_REAL
+
+    R_xx(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_yy(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_xy(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_xz(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_yz(:,:,:,:,:) = 0._CUSTOM_REAL
+
+    if(FIX_UNDERFLOW_PROBLEM) then
+      R_xx(:,:,:,:,:) = VERYSMALLVAL
+      R_yy(:,:,:,:,:) = VERYSMALLVAL
+      R_xy(:,:,:,:,:) = VERYSMALLVAL
+      R_xz(:,:,:,:,:) = VERYSMALLVAL
+      R_yz(:,:,:,:,:) = VERYSMALLVAL
+    endif
+
+    if (SIMULATION_TYPE == 3) then
+      read(27) b_R_xx
+      read(27) b_R_yy
+      read(27) b_R_xy
+      read(27) b_R_xz
+      read(27) b_R_yz
+      read(27) b_epsilondev_xx
+      read(27) b_epsilondev_yy
+      read(27) b_epsilondev_xy
+      read(27) b_epsilondev_xz
+      read(27) b_epsilondev_yz
+    endif
+
+  endif
+  close(27)
+
+!
+!   s t a r t   t i m e   i t e r a t i o n s
+!
+
+! synchronize all processes to make sure everybody is ready to start time loop
+  call sync_all()
+  if(myrank == 0) write(IMAIN,*) 'All processes are synchronized before time loop'
+
+  if(myrank == 0) then
+    write(IMAIN,*)
+    write(IMAIN,*) 'Starting time iteration loop...'
+    write(IMAIN,*)
+  endif
+
+! create an empty file to monitor the start of the simulation
+  if(myrank == 0) then
+    open(unit=IOUT,file=trim(OUTPUT_FILES)//'/starttimeloop.txt',status='unknown')
+    write(IOUT,*) 'starting time loop'
+    close(IOUT)
+  endif
+
+! initialize Moho boundary index
+  if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
+    ispec2D_moho_top = 0
+    ispec2D_moho_bot = 0
+    k_top = 1
+    k_bot = NGLLZ
+  endif
+
+! get MPI starting time
+  time_start = wtime()
+
+! *********************************************************
+! ************* MAIN LOOP OVER THE TIME STEPS *************
+! *********************************************************
+
+  do it = 1,NSTEP
+
+! compute the maximum of the norm of the displacement
+! in all the slices using an MPI reduction
+! and output timestamp file to check that simulation is running fine
+  if(mod(it,NTSTEP_BETWEEN_OUTPUT_INFO) == 0 .or. it == 5) then
+
+! compute maximum of norm of displacement in each slice
+    Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))
+
+! compute the maximum of the maxima for all the slices using an MPI reduction
+    call max_all_cr(Usolidnorm,Usolidnorm_all)
+
+    if (SIMULATION_TYPE == 3) then
+      b_Usolidnorm = maxval(sqrt(b_displ(1,:)**2 + b_displ(2,:)**2 + b_displ(3,:)**2))
+      call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
+    endif
+    if(myrank == 0) then
+
+      write(IMAIN,*) 'Time step # ',it
+      write(IMAIN,*) 'Time: ',sngl((it-1)*DT-t0),' seconds'
+
+! elapsed time since beginning of the simulation
+      tCPU = wtime() - time_start
+      int_tCPU = int(tCPU)
+      ihours = int_tCPU / 3600
+      iminutes = (int_tCPU - 3600*ihours) / 60
+      iseconds = int_tCPU - 3600*ihours - 60*iminutes
+      write(IMAIN,*) 'Elapsed time in seconds = ',tCPU
+      write(IMAIN,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
+      write(IMAIN,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it)
+      write(IMAIN,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
+      if (SIMULATION_TYPE == 3) write(IMAIN,*) &
+            'Max norm displacement vector U (backward) in all slices (m) = ',b_Usolidnorm_all
+      write(IMAIN,*)
+
+! write time stamp file to give information about progression of simulation
+      write(outputname,"('/timestamp',i6.6)") it
+      open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown')
+      write(IOUT,*) 'Time step # ',it
+      write(IOUT,*) 'Time: ',sngl((it-1)*DT-t0),' seconds'
+      write(IOUT,*) 'Elapsed time in seconds = ',tCPU
+      write(IOUT,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
+      write(IOUT,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it)
+      write(IOUT,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
+      if (SIMULATION_TYPE == 3) write(IOUT,*) &
+            'Max norm displacement vector U (backward) in all slices (m) = ',b_Usolidnorm_all
+      close(IOUT)
+
+! check stability of the code, exit if unstable
+! negative values can occur with some compilers when the unstable value is greater
+! than the greatest possible floating-point number of the machine
+      if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0) &
+        call exit_MPI(myrank,'forward simulation became unstable and blew up')
+      if(SIMULATION_TYPE == 3 .and. (b_Usolidnorm_all > STABILITY_THRESHOLD .or. b_Usolidnorm_all < 0)) &
+        call exit_MPI(myrank,'backward simulation became unstable and blew up')
+
+    endif
+  endif
+
+! update displacement using finite difference time scheme
+  do i=1,NGLOB_AB
+    displ(:,i) = displ(:,i) + deltat*veloc(:,i) + deltatsqover2*accel(:,i)
+    veloc(:,i) = veloc(:,i) + deltatover2*accel(:,i)
+    accel(:,i) = 0._CUSTOM_REAL
+  enddo
+
+  if (SIMULATION_TYPE == 3) then
+    do i=1,NGLOB_AB
+      b_displ(:,i) = b_displ(:,i) + b_deltat*b_veloc(:,i) + b_deltatsqover2*b_accel(:,i)
+      b_veloc(:,i) = b_veloc(:,i) + b_deltatover2*b_accel(:,i)
+      b_accel(:,i) = 0._CUSTOM_REAL
+    enddo
+  endif
+
+  if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
+    ispec2D_moho_top = 0
+    ispec2D_moho_bot = 0
+  endif
+
+  do ispec = 1,NSPEC_AB
+
+    if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
+      if (is_moho_top(ispec)) then
+        ispec2D_moho_top = ispec2D_moho_top + 1
+      else if (is_moho_bot(ispec)) then
+        ispec2D_moho_bot = ispec2D_moho_bot + 1
+      endif
+    endif
+
+!---------------------------------------------------------------------------------------------------
+! beginning of nested loops on i,j,k to perform the forward calculations in a given element (ispec)
+!---------------------------------------------------------------------------------------------------
+
+    tempx1l(:,:,:) = 0.
+    tempx2l(:,:,:) = 0.
+    tempx3l(:,:,:) = 0.
+    tempy1l(:,:,:) = 0.
+    tempy2l(:,:,:) = 0.
+    tempy3l(:,:,:) = 0.
+    tempz1l(:,:,:) = 0.
+    tempz2l(:,:,:) = 0.
+    tempz3l(:,:,:) = 0.
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          do l=1,NGLLX
+            hp1 = hprime_xx(i,l)
+            iglob = ibool(l,j,k,ispec)
+            tempx1l(i,j,k) = tempx1l(i,j,k) + displ(1,iglob)*hp1
+            tempy1l(i,j,k) = tempy1l(i,j,k) + displ(2,iglob)*hp1
+            tempz1l(i,j,k) = tempz1l(i,j,k) + displ(3,iglob)*hp1
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            hp2 = hprime_yy(j,l)
+            iglob = ibool(i,l,k,ispec)
+            tempx2l(i,j,k) = tempx2l(i,j,k) + displ(1,iglob)*hp2
+            tempy2l(i,j,k) = tempy2l(i,j,k) + displ(2,iglob)*hp2
+            tempz2l(i,j,k) = tempz2l(i,j,k) + displ(3,iglob)*hp2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            hp3 = hprime_zz(k,l)
+            iglob = ibool(i,j,l,ispec)
+            tempx3l(i,j,k) = tempx3l(i,j,k) + displ(1,iglob)*hp3
+            tempy3l(i,j,k) = tempy3l(i,j,k) + displ(2,iglob)*hp3
+            tempz3l(i,j,k) = tempz3l(i,j,k) + displ(3,iglob)*hp3
+          enddo
+
+!         get derivatives of ux, uy and uz with respect to x, y and z
+          xixl = xix(i,j,k,ispec)
+          xiyl = xiy(i,j,k,ispec)
+          xizl = xiz(i,j,k,ispec)
+          etaxl = etax(i,j,k,ispec)
+          etayl = etay(i,j,k,ispec)
+          etazl = etaz(i,j,k,ispec)
+          gammaxl = gammax(i,j,k,ispec)
+          gammayl = gammay(i,j,k,ispec)
+          gammazl = gammaz(i,j,k,ispec)
+          jacobianl = jacobian(i,j,k,ispec)
+
+          duxdxl = xixl*tempx1l(i,j,k) + etaxl*tempx2l(i,j,k) + gammaxl*tempx3l(i,j,k)
+          duxdyl = xiyl*tempx1l(i,j,k) + etayl*tempx2l(i,j,k) + gammayl*tempx3l(i,j,k)
+          duxdzl = xizl*tempx1l(i,j,k) + etazl*tempx2l(i,j,k) + gammazl*tempx3l(i,j,k)
+
+          duydxl = xixl*tempy1l(i,j,k) + etaxl*tempy2l(i,j,k) + gammaxl*tempy3l(i,j,k)
+          duydyl = xiyl*tempy1l(i,j,k) + etayl*tempy2l(i,j,k) + gammayl*tempy3l(i,j,k)
+          duydzl = xizl*tempy1l(i,j,k) + etazl*tempy2l(i,j,k) + gammazl*tempy3l(i,j,k)
+
+          duzdxl = xixl*tempz1l(i,j,k) + etaxl*tempz2l(i,j,k) + gammaxl*tempz3l(i,j,k)
+          duzdyl = xiyl*tempz1l(i,j,k) + etayl*tempz2l(i,j,k) + gammayl*tempz3l(i,j,k)
+          duzdzl = xizl*tempz1l(i,j,k) + etazl*tempz2l(i,j,k) + gammazl*tempz3l(i,j,k)
+
+! precompute some sums to save CPU time
+          duxdxl_plus_duydyl = duxdxl + duydyl
+          duxdxl_plus_duzdzl = duxdxl + duzdzl
+          duydyl_plus_duzdzl = duydyl + duzdzl
+          duxdyl_plus_duydxl = duxdyl + duydxl
+          duzdxl_plus_duxdzl = duzdxl + duxdzl
+          duzdyl_plus_duydzl = duzdyl + duydzl
+
+! precompute terms for attenuation if needed
+  if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+
+! compute deviatoric strain
+    epsilon_trace_over_3 = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+    epsilondev_xx_loc(i,j,k) = duxdxl - epsilon_trace_over_3
+    epsilondev_yy_loc(i,j,k) = duydyl - epsilon_trace_over_3
+    epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+    epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+    epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+
+! distinguish attenuation factors
+    if(flag_sediments(i,j,k,ispec)) then
+
+! use constant attenuation of Q = 90
+! or use scaling rule similar to Olsen et al. (2003)
+     if(USE_OLSEN_ATTENUATION) then
+       vs_val = mustore(i,j,k,ispec) / rho_vs(i,j,k,ispec)
+! use rule Q_mu = constant * v_s
+       Q_mu = OLSEN_ATTENUATION_RATIO * vs_val
+       int_Q_mu = 10 * nint(Q_mu / 10.)
+       if(int_Q_mu < 40) int_Q_mu = 40
+       if(int_Q_mu > 150) int_Q_mu = 150
+
+       if(int_Q_mu == 40) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_40
+       else if(int_Q_mu == 50) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_50
+       else if(int_Q_mu == 60) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_60
+       else if(int_Q_mu == 70) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_70
+       else if(int_Q_mu == 80) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_80
+       else if(int_Q_mu == 90) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_90
+       else if(int_Q_mu == 100) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_100
+       else if(int_Q_mu == 110) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_110
+       else if(int_Q_mu == 120) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_120
+       else if(int_Q_mu == 130) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_130
+       else if(int_Q_mu == 140) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_140
+       else if(int_Q_mu == 150) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_150
+       else
+         stop 'incorrect attenuation coefficient'
+       endif
+
+     else
+       iattenuation_sediments = IATTENUATION_SEDIMENTS_90
+     endif
+
+      iselected = iattenuation_sediments
+    else
+      iselected = IATTENUATION_BEDROCK
+    endif
+
+    one_minus_sum_beta_use = one_minus_sum_beta(iselected)
+    minus_sum_beta =  one_minus_sum_beta_use - 1.
+
+  endif
+
+    kappal = kappastore(i,j,k,ispec)
+    mul = mustore(i,j,k,ispec)
+
+! For fully anisotropic case
+    if(ANISOTROPY_VAL) then
+       c11 = c11store(i,j,k,ispec)
+       c12 = c12store(i,j,k,ispec)
+       c13 = c13store(i,j,k,ispec)
+       c14 = c14store(i,j,k,ispec)
+       c15 = c15store(i,j,k,ispec)
+       c16 = c16store(i,j,k,ispec)
+       c22 = c22store(i,j,k,ispec)
+       c23 = c23store(i,j,k,ispec)
+       c24 = c24store(i,j,k,ispec)
+       c25 = c25store(i,j,k,ispec)
+       c26 = c26store(i,j,k,ispec)
+       c33 = c33store(i,j,k,ispec)
+       c34 = c34store(i,j,k,ispec)
+       c35 = c35store(i,j,k,ispec)
+       c36 = c36store(i,j,k,ispec)
+       c44 = c44store(i,j,k,ispec)
+       c45 = c45store(i,j,k,ispec)
+       c46 = c46store(i,j,k,ispec)
+       c55 = c55store(i,j,k,ispec)
+       c56 = c56store(i,j,k,ispec)
+       c66 = c66store(i,j,k,ispec)
+       !if(ATTENUATION_VAL.and. not_fully_in_bedrock(ispec)) then
+       !   mul = c44
+       !   c11 = c11 + FOUR_THIRDS * minus_sum_beta * mul
+       !   c12 = c12 - TWO_THIRDS * minus_sum_beta * mul
+       !   c13 = c13 - TWO_THIRDS * minus_sum_beta * mul
+       !   c22 = c22 + FOUR_THIRDS * minus_sum_beta * mul
+       !   c23 = c23 - TWO_THIRDS * minus_sum_beta * mul
+       !   c33 = c33 + FOUR_THIRDS * minus_sum_beta * mul
+       !   c44 = c44 + minus_sum_beta * mul
+       !   c55 = c55 + minus_sum_beta * mul
+       !   c66 = c66 + minus_sum_beta * mul
+       !endif
+
+       sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+            c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+
+       sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+            c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+
+       sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+            c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+
+       sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+            c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+
+       sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+            c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+
+       sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+            c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+    else
+
+! For isotropic case
+! use unrelaxed parameters if attenuation
+       if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) mul = mul * one_minus_sum_beta_use
+
+       lambdalplus2mul = kappal + FOUR_THIRDS * mul
+       lambdal = lambdalplus2mul - 2.*mul
+
+! compute stress sigma
+       sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+       sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+       sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+       sigma_xy = mul*duxdyl_plus_duydxl
+       sigma_xz = mul*duzdxl_plus_duxdzl
+       sigma_yz = mul*duzdyl_plus_duydzl
+
+    endif
+
+! subtract memory variables if attenuation
+    if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+      do i_sls = 1,N_SLS
+        R_xx_val = R_xx(i,j,k,ispec,i_sls)
+        R_yy_val = R_yy(i,j,k,ispec,i_sls)
+        sigma_xx = sigma_xx - R_xx_val
+        sigma_yy = sigma_yy - R_yy_val
+        sigma_zz = sigma_zz + R_xx_val + R_yy_val
+        sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+        sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+        sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+      enddo
+    endif
+
+! form dot product with test vector, symmetric form
+      tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl)
+      tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl)
+      tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
+
+      tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl)
+      tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl)
+      tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl)
+
+      tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl)
+      tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl)
+      tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl)
+
+        enddo
+      enddo
+    enddo
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+!! second part
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+    tempx1lbis(:,:,:) = 0.
+    tempx2lbis(:,:,:) = 0.
+    tempx3lbis(:,:,:) = 0.
+    tempy1lbis(:,:,:) = 0.
+    tempy2lbis(:,:,:) = 0.
+    tempy3lbis(:,:,:) = 0.
+    tempz1lbis(:,:,:) = 0.
+    tempz2lbis(:,:,:) = 0.
+    tempz3lbis(:,:,:) = 0.
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          do l=1,NGLLX
+            fac1 = hprimewgll_xx(l,i)
+            tempx1lbis(i,j,k) = tempx1lbis(i,j,k) + tempx1(l,j,k)*fac1
+            tempy1lbis(i,j,k) = tempy1lbis(i,j,k) + tempy1(l,j,k)*fac1
+            tempz1lbis(i,j,k) = tempz1lbis(i,j,k) + tempz1(l,j,k)*fac1
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            fac2 = hprimewgll_yy(l,j)
+            tempx2lbis(i,j,k) = tempx2lbis(i,j,k) + tempx2(i,l,k)*fac2
+            tempy2lbis(i,j,k) = tempy2lbis(i,j,k) + tempy2(i,l,k)*fac2
+            tempz2lbis(i,j,k) = tempz2lbis(i,j,k) + tempz2(i,l,k)*fac2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            fac3 = hprimewgll_zz(l,k)
+            tempx3lbis(i,j,k) = tempx3lbis(i,j,k) + tempx3(i,j,l)*fac3
+            tempy3lbis(i,j,k) = tempy3lbis(i,j,k) + tempy3(i,j,l)*fac3
+            tempz3lbis(i,j,k) = tempz3lbis(i,j,k) + tempz3(i,j,l)*fac3
+          enddo
+
+          fac1 = wgllwgll_yz(j,k)
+          fac2 = wgllwgll_xz(i,k)
+          fac3 = wgllwgll_xy(i,j)
+
+! sum contributions from each element to the global mesh
+  iglob = ibool(i,j,k,ispec)
+  accel(1,iglob) = accel(1,iglob) - (fac1*tempx1lbis(i,j,k) + fac2*tempx2lbis(i,j,k) + fac3*tempx3lbis(i,j,k))
+  accel(2,iglob) = accel(2,iglob) - (fac1*tempy1lbis(i,j,k) + fac2*tempy2lbis(i,j,k) + fac3*tempy3lbis(i,j,k))
+  accel(3,iglob) = accel(3,iglob) - (fac1*tempz1lbis(i,j,k) + fac2*tempz2lbis(i,j,k) + fac3*tempz3lbis(i,j,k))
+
+! update memory variables based upon the Runge-Kutta scheme
+
+  if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+
+! use Runge-Kutta scheme to march in time
+  do i_sls = 1,N_SLS
+
+! get coefficients for that standard linear solid
+
+  factor_loc = mustore(i,j,k,ispec) * factor_common(iselected,i_sls)
+  alphaval_loc = alphaval(iselected,i_sls)
+  betaval_loc = betaval(iselected,i_sls)
+  gammaval_loc = gammaval(iselected,i_sls)
+
+! term in xx
+  Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+  Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+  R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
+
+! term in yy
+  Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+  Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+  R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
+
+! term in zz not computed since zero trace
+! This is because we only implement Q_\mu attenuation and not Q_\kappa.
+! Note that this does *NOT* imply that there is no attenuation for P waves
+! because for Q_\kappa = infinity one gets (see for instance Dahlen and Tromp (1998)
+! equation (9.59) page 350): Q_\alpha = Q_\mu * 3 * (V_p/V_s)^2 / 4
+! therefore Q_\alpha is not zero; for instance for V_p / V_s = sqrt(3)
+! we get Q_\alpha = (9 / 4) * Q_\mu = 2.25 * Q_\mu
+
+! term in xy
+  Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+  Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+  R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
+
+! term in xz
+  Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+  Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+  R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
+
+! term in yz
+  Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+  Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+  R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + betaval_loc * Sn + gammaval_loc * Snp1
+
+    enddo   ! end of loop on memory variables
+
+  endif  !  end attenuation
+
+        enddo
+      enddo
+    enddo
+
+
+!----------------------------------------------------------------------------------------------------
+! beginning of nested loops on i,j,k to perform the backward calculations in a given element (ispec)
+!----------------------------------------------------------------------------------------------------
+
+    if (SIMULATION_TYPE == 3) then
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+            b_tempx1l = 0.
+            b_tempx2l = 0.
+            b_tempx3l = 0.
+
+            b_tempy1l = 0.
+            b_tempy2l = 0.
+            b_tempy3l = 0.
+
+            b_tempz1l = 0.
+            b_tempz2l = 0.
+            b_tempz3l = 0.
+
+          do l=1,NGLLX
+            hp1 = hprime_xx(i,l)
+            iglob = ibool(l,j,k,ispec)
+            b_tempx1l = b_tempx1l + b_displ(1,iglob)*hp1
+            b_tempy1l = b_tempy1l + b_displ(2,iglob)*hp1
+            b_tempz1l = b_tempz1l + b_displ(3,iglob)*hp1
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            hp2 = hprime_yy(j,l)
+            iglob = ibool(i,l,k,ispec)
+            b_tempx2l = b_tempx2l + b_displ(1,iglob)*hp2
+            b_tempy2l = b_tempy2l + b_displ(2,iglob)*hp2
+            b_tempz2l = b_tempz2l + b_displ(3,iglob)*hp2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            hp3 = hprime_zz(k,l)
+            iglob = ibool(i,j,l,ispec)
+            b_tempx3l = b_tempx3l + b_displ(1,iglob)*hp3
+            b_tempy3l = b_tempy3l + b_displ(2,iglob)*hp3
+            b_tempz3l = b_tempz3l + b_displ(3,iglob)*hp3
+          enddo
+
+!         get derivatives of ux, uy and uz with respect to x, y and z
+          xixl = xix(i,j,k,ispec)
+          xiyl = xiy(i,j,k,ispec)
+          xizl = xiz(i,j,k,ispec)
+          etaxl = etax(i,j,k,ispec)
+          etayl = etay(i,j,k,ispec)
+          etazl = etaz(i,j,k,ispec)
+          gammaxl = gammax(i,j,k,ispec)
+          gammayl = gammay(i,j,k,ispec)
+          gammazl = gammaz(i,j,k,ispec)
+          jacobianl = jacobian(i,j,k,ispec)
+
+          duxdxl = xixl*tempx1l(i,j,k) + etaxl*tempx2l(i,j,k) + gammaxl*tempx3l(i,j,k)
+          duxdyl = xiyl*tempx1l(i,j,k) + etayl*tempx2l(i,j,k) + gammayl*tempx3l(i,j,k)
+          duxdzl = xizl*tempx1l(i,j,k) + etazl*tempx2l(i,j,k) + gammazl*tempx3l(i,j,k)
+
+          duydxl = xixl*tempy1l(i,j,k) + etaxl*tempy2l(i,j,k) + gammaxl*tempy3l(i,j,k)
+          duydyl = xiyl*tempy1l(i,j,k) + etayl*tempy2l(i,j,k) + gammayl*tempy3l(i,j,k)
+          duydzl = xizl*tempy1l(i,j,k) + etazl*tempy2l(i,j,k) + gammazl*tempy3l(i,j,k)
+
+          duzdxl = xixl*tempz1l(i,j,k) + etaxl*tempz2l(i,j,k) + gammaxl*tempz3l(i,j,k)
+          duzdyl = xiyl*tempz1l(i,j,k) + etayl*tempz2l(i,j,k) + gammayl*tempz3l(i,j,k)
+          duzdzl = xizl*tempz1l(i,j,k) + etazl*tempz2l(i,j,k) + gammazl*tempz3l(i,j,k)
+
+! save strain on the Moho boundary
+          if (SAVE_MOHO_MESH) then
+            if (is_moho_top(ispec)) then
+              dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+              dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+              dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+              dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+              dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+              dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+              dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+              dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+              dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+            else if (is_moho_bot(ispec)) then
+              dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+              dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+              dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+              dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+              dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+              dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+              dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+              dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+              dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+            endif
+          endif
+
+! precompute some sums to save CPU time
+          duxdxl_plus_duydyl = duxdxl + duydyl
+          duxdxl_plus_duzdzl = duxdxl + duzdzl
+          duydyl_plus_duzdzl = duydyl + duzdzl
+          duxdyl_plus_duydxl = duxdyl + duydxl
+          duzdxl_plus_duxdzl = duzdxl + duxdzl
+          duzdyl_plus_duydzl = duzdyl + duydzl
+
+            dsxx = duxdxl
+            dsxy = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+            dsxz = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+            dsyy = duydyl
+            dsyz = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+            dszz = duzdzl
+
+            b_duxdxl = xixl*b_tempx1l + etaxl*b_tempx2l + gammaxl*b_tempx3l
+            b_duxdyl = xiyl*b_tempx1l + etayl*b_tempx2l + gammayl*b_tempx3l
+            b_duxdzl = xizl*b_tempx1l + etazl*b_tempx2l + gammazl*b_tempx3l
+
+            b_duydxl = xixl*b_tempy1l + etaxl*b_tempy2l + gammaxl*b_tempy3l
+            b_duydyl = xiyl*b_tempy1l + etayl*b_tempy2l + gammayl*b_tempy3l
+            b_duydzl = xizl*b_tempy1l + etazl*b_tempy2l + gammazl*b_tempy3l
+
+            b_duzdxl = xixl*b_tempz1l + etaxl*b_tempz2l + gammaxl*b_tempz3l
+            b_duzdyl = xiyl*b_tempz1l + etayl*b_tempz2l + gammayl*b_tempz3l
+            b_duzdzl = xizl*b_tempz1l + etazl*b_tempz2l + gammazl*b_tempz3l
+
+            b_duxdxl_plus_duydyl = b_duxdxl + b_duydyl
+            b_duxdxl_plus_duzdzl = b_duxdxl + b_duzdzl
+            b_duydyl_plus_duzdzl = b_duydyl + b_duzdzl
+            b_duxdyl_plus_duydxl = b_duxdyl + b_duydxl
+            b_duzdxl_plus_duxdzl = b_duzdxl + b_duxdzl
+            b_duzdyl_plus_duydzl = b_duzdyl + b_duydzl
+
+            b_dsxx =  b_duxdxl
+            b_dsxy = 0.5_CUSTOM_REAL * b_duxdyl_plus_duydxl
+            b_dsxz = 0.5_CUSTOM_REAL * b_duzdxl_plus_duxdzl
+            b_dsyy =  b_duydyl
+            b_dsyz = 0.5_CUSTOM_REAL * b_duzdyl_plus_duydzl
+            b_dszz =  b_duzdzl
+
+            kappa_k = (duxdxl + duydyl + duzdzl) *  (b_duxdxl + b_duydyl + b_duzdzl)
+            mu_k = dsxx * b_dsxx + dsyy * b_dsyy + dszz * b_dszz + &
+                  2 * (dsxy * b_dsxy + dsxz * b_dsxz + dsyz * b_dsyz) - ONE_THIRD * kappa_k
+            kappa_kl(i,j,k,ispec) = kappa_kl(i,j,k,ispec) + deltat * kappa_k
+            mu_kl(i,j,k,ispec) = mu_kl(i,j,k,ispec) + 2 * deltat * mu_k
+
+            if (SAVE_MOHO_MESH) then
+              if (is_moho_top(ispec)) then
+                b_dsdx_top(1,1,i,j,k,ispec2D_moho_top) = b_duxdxl
+                b_dsdx_top(1,2,i,j,k,ispec2D_moho_top) = b_duxdyl
+                b_dsdx_top(1,3,i,j,k,ispec2D_moho_top) = b_duxdzl
+                b_dsdx_top(2,1,i,j,k,ispec2D_moho_top) = b_duydxl
+                b_dsdx_top(2,2,i,j,k,ispec2D_moho_top) = b_duydyl
+                b_dsdx_top(2,3,i,j,k,ispec2D_moho_top) = b_duydzl
+                b_dsdx_top(3,1,i,j,k,ispec2D_moho_top) = b_duzdxl
+                b_dsdx_top(3,2,i,j,k,ispec2D_moho_top) = b_duzdyl
+                b_dsdx_top(3,3,i,j,k,ispec2D_moho_top) = b_duzdzl
+              else if (is_moho_bot(ispec)) then
+                b_dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = b_duxdxl
+                b_dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = b_duxdyl
+                b_dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = b_duxdzl
+                b_dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = b_duydxl
+                b_dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = b_duydyl
+                b_dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = b_duydzl
+                b_dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = b_duzdxl
+                b_dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = b_duzdyl
+                b_dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = b_duzdzl
+              endif
+            endif
+
+! precompute terms for attenuation if needed
+  if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+
+! compute deviatoric strain
+    b_epsilon_trace_over_3 = ONE_THIRD * (b_duxdxl + b_duydyl + b_duzdzl)
+    b_epsilondev_xx_loc(i,j,k) = b_duxdxl - b_epsilon_trace_over_3
+    b_epsilondev_yy_loc(i,j,k) = b_duydyl - b_epsilon_trace_over_3
+    b_epsilondev_xy_loc(i,j,k) = 0.5 * b_duxdyl_plus_duydxl
+    b_epsilondev_xz_loc(i,j,k) = 0.5 * b_duzdxl_plus_duxdzl
+    b_epsilondev_yz_loc(i,j,k) = 0.5 * b_duzdyl_plus_duydzl
+
+! distinguish attenuation factors
+    if(flag_sediments(i,j,k,ispec)) then
+
+! use constant attenuation of Q = 90
+! or use scaling rule similar to Olsen et al. (2003)
+     if(USE_OLSEN_ATTENUATION) then
+       vs_val = mustore(i,j,k,ispec) / rho_vs(i,j,k,ispec)
+! use rule Q_mu = constant * v_s
+       Q_mu = OLSEN_ATTENUATION_RATIO * vs_val
+       int_Q_mu = 10 * nint(Q_mu / 10.)
+       if(int_Q_mu < 40) int_Q_mu = 40
+       if(int_Q_mu > 150) int_Q_mu = 150
+
+       if(int_Q_mu == 40) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_40
+       else if(int_Q_mu == 50) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_50
+       else if(int_Q_mu == 60) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_60
+       else if(int_Q_mu == 70) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_70
+       else if(int_Q_mu == 80) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_80
+       else if(int_Q_mu == 90) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_90
+       else if(int_Q_mu == 100) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_100
+       else if(int_Q_mu == 110) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_110
+       else if(int_Q_mu == 120) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_120
+       else if(int_Q_mu == 130) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_130
+       else if(int_Q_mu == 140) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_140
+       else if(int_Q_mu == 150) then
+         iattenuation_sediments = IATTENUATION_SEDIMENTS_150
+       else
+         stop 'incorrect attenuation coefficient'
+       endif
+
+     else
+       iattenuation_sediments = IATTENUATION_SEDIMENTS_90
+     endif
+
+      iselected = iattenuation_sediments
+    else
+      iselected = IATTENUATION_BEDROCK
+    endif
+
+    one_minus_sum_beta_use = one_minus_sum_beta(iselected)
+    minus_sum_beta =  one_minus_sum_beta_use - 1.
+
+  endif
+
+    kappal = kappastore(i,j,k,ispec)
+    mul = mustore(i,j,k,ispec)
+
+! For fully anisotropic case
+    if(ANISOTROPY_VAL) then
+       c11 = c11store(i,j,k,ispec)
+       c12 = c12store(i,j,k,ispec)
+       c13 = c13store(i,j,k,ispec)
+       c14 = c14store(i,j,k,ispec)
+       c15 = c15store(i,j,k,ispec)
+       c16 = c16store(i,j,k,ispec)
+       c22 = c22store(i,j,k,ispec)
+       c23 = c23store(i,j,k,ispec)
+       c24 = c24store(i,j,k,ispec)
+       c25 = c25store(i,j,k,ispec)
+       c26 = c26store(i,j,k,ispec)
+       c33 = c33store(i,j,k,ispec)
+       c34 = c34store(i,j,k,ispec)
+       c35 = c35store(i,j,k,ispec)
+       c36 = c36store(i,j,k,ispec)
+       c44 = c44store(i,j,k,ispec)
+       c45 = c45store(i,j,k,ispec)
+       c46 = c46store(i,j,k,ispec)
+       c55 = c55store(i,j,k,ispec)
+       c56 = c56store(i,j,k,ispec)
+       c66 = c66store(i,j,k,ispec)
+       !if(ATTENUATION_VAL.and. not_fully_in_bedrock(ispec)) then
+       !   mul = c44
+       !   c11 = c11 + FOUR_THIRDS * minus_sum_beta * mul
+       !   c12 = c12 - TWO_THIRDS * minus_sum_beta * mul
+       !   c13 = c13 - TWO_THIRDS * minus_sum_beta * mul
+       !   c22 = c22 + FOUR_THIRDS * minus_sum_beta * mul
+       !   c23 = c23 - TWO_THIRDS * minus_sum_beta * mul
+       !   c33 = c33 + FOUR_THIRDS * minus_sum_beta * mul
+       !   c44 = c44 + minus_sum_beta * mul
+       !   c55 = c55 + minus_sum_beta * mul
+       !   c66 = c66 + minus_sum_beta * mul
+       !endif
+
+       b_sigma_xx = c11*b_duxdxl + c16*b_duxdyl_plus_duydxl + c12*b_duydyl + &
+             c15*b_duzdxl_plus_duxdzl + c14*b_duzdyl_plus_duydzl + c13*b_duzdzl
+
+       b_sigma_yy = c12*b_duxdxl + c26*b_duxdyl_plus_duydxl + c22*b_duydyl + &
+             c25*b_duzdxl_plus_duxdzl + c24*b_duzdyl_plus_duydzl + c23*b_duzdzl
+
+       b_sigma_zz = c13*b_duxdxl + c36*b_duxdyl_plus_duydxl + c23*b_duydyl + &
+             c35*b_duzdxl_plus_duxdzl + c34*b_duzdyl_plus_duydzl + c33*b_duzdzl
+
+       b_sigma_xy = c16*b_duxdxl + c66*b_duxdyl_plus_duydxl + c26*b_duydyl + &
+             c56*b_duzdxl_plus_duxdzl + c46*b_duzdyl_plus_duydzl + c36*b_duzdzl
+
+       b_sigma_xz = c15*b_duxdxl + c56*b_duxdyl_plus_duydxl + c25*b_duydyl + &
+             c55*b_duzdxl_plus_duxdzl + c45*b_duzdyl_plus_duydzl + c35*b_duzdzl
+
+       b_sigma_yz = c14*b_duxdxl + c46*b_duxdyl_plus_duydxl + c24*b_duydyl + &
+             c45*b_duzdxl_plus_duxdzl + c44*b_duzdyl_plus_duydzl + c34*b_duzdzl
+
+    else
+! For isotropic case
+! use unrelaxed parameters if attenuation
+       if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) mul = mul * one_minus_sum_beta_use
+
+       lambdalplus2mul = kappal + FOUR_THIRDS * mul
+       lambdal = lambdalplus2mul - 2.*mul
+
+! compute stress sigma
+       b_sigma_xx = lambdalplus2mul*b_duxdxl + lambdal*b_duydyl_plus_duzdzl
+       b_sigma_yy = lambdalplus2mul*b_duydyl + lambdal*b_duxdxl_plus_duzdzl
+       b_sigma_zz = lambdalplus2mul*b_duzdzl + lambdal*b_duxdxl_plus_duydyl
+
+       b_sigma_xy = mul*b_duxdyl_plus_duydxl
+       b_sigma_xz = mul*b_duzdxl_plus_duxdzl
+       b_sigma_yz = mul*b_duzdyl_plus_duydzl
+
+    endif
+
+! subtract memory variables if attenuation
+    if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+      do i_sls = 1,N_SLS
+        b_R_xx_val = b_R_xx(i,j,k,ispec,i_sls)
+        b_R_yy_val = b_R_yy(i,j,k,ispec,i_sls)
+        b_sigma_xx = b_sigma_xx - b_R_xx_val
+        b_sigma_yy = b_sigma_yy - b_R_yy_val
+        b_sigma_zz = b_sigma_zz + b_R_xx_val + b_R_yy_val
+        b_sigma_xy = b_sigma_xy - b_R_xy(i,j,k,ispec,i_sls)
+        b_sigma_xz = b_sigma_xz - b_R_xz(i,j,k,ispec,i_sls)
+        b_sigma_yz = b_sigma_yz - b_R_yz(i,j,k,ispec,i_sls)
+      enddo
+    endif
+
+! form dot product with test vector, symmetric form
+        b_tempx1(i,j,k) = jacobianl * (b_sigma_xx*xixl + b_sigma_xy*xiyl + b_sigma_xz*xizl)
+        b_tempy1(i,j,k) = jacobianl * (b_sigma_xy*xixl + b_sigma_yy*xiyl + b_sigma_yz*xizl)
+        b_tempz1(i,j,k) = jacobianl * (b_sigma_xz*xixl + b_sigma_yz*xiyl + b_sigma_zz*xizl)
+
+        b_tempx2(i,j,k) = jacobianl * (b_sigma_xx*etaxl + b_sigma_xy*etayl + b_sigma_xz*etazl)
+        b_tempy2(i,j,k) = jacobianl * (b_sigma_xy*etaxl + b_sigma_yy*etayl + b_sigma_yz*etazl)
+        b_tempz2(i,j,k) = jacobianl * (b_sigma_xz*etaxl + b_sigma_yz*etayl + b_sigma_zz*etazl)
+
+        b_tempx3(i,j,k) = jacobianl * (b_sigma_xx*gammaxl + b_sigma_xy*gammayl + b_sigma_xz*gammazl)
+        b_tempy3(i,j,k) = jacobianl * (b_sigma_xy*gammaxl + b_sigma_yy*gammayl + b_sigma_yz*gammazl)
+        b_tempz3(i,j,k) = jacobianl * (b_sigma_xz*gammaxl + b_sigma_yz*gammayl + b_sigma_zz*gammazl)
+
+        enddo
+      enddo
+    enddo
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+!! second part
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+        b_tempx1l = 0.
+        b_tempy1l = 0.
+        b_tempz1l = 0.
+
+        b_tempx2l = 0.
+        b_tempy2l = 0.
+        b_tempz2l = 0.
+
+        b_tempx3l = 0.
+        b_tempy3l = 0.
+        b_tempz3l = 0.
+
+        do l=1,NGLLX
+            fac1 = hprimewgll_xx(l,i)
+            b_tempx1l = b_tempx1l + b_tempx1(l,j,k)*fac1
+            b_tempy1l = b_tempy1l + b_tempy1(l,j,k)*fac1
+            b_tempz1l = b_tempz1l + b_tempz1(l,j,k)*fac1
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            fac2 = hprimewgll_yy(l,j)
+            b_tempx2l = b_tempx2l + b_tempx2(i,l,k)*fac2
+            b_tempy2l = b_tempy2l + b_tempy2(i,l,k)*fac2
+            b_tempz2l = b_tempz2l + b_tempz2(i,l,k)*fac2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            fac3 = hprimewgll_zz(l,k)
+            b_tempx3l = b_tempx3l + b_tempx3(i,j,l)*fac3
+            b_tempy3l = b_tempy3l + b_tempy3(i,j,l)*fac3
+            b_tempz3l = b_tempz3l + b_tempz3(i,j,l)*fac3
+         enddo
+
+         fac1 = wgllwgll_yz(j,k)
+         fac2 = wgllwgll_xz(i,k)
+         fac3 = wgllwgll_xy(i,j)
+
+! sum contributions from each element to the global mesh
+         iglob = ibool(i,j,k,ispec)
+         b_accel(1,iglob) = b_accel(1,iglob) - (fac1*b_tempx1l + fac2*b_tempx2l + fac3*b_tempx3l)
+         b_accel(2,iglob) = b_accel(2,iglob) - (fac1*b_tempy1l + fac2*b_tempy2l + fac3*b_tempy3l)
+         b_accel(3,iglob) = b_accel(3,iglob) - (fac1*b_tempz1l + fac2*b_tempz2l + fac3*b_tempz3l)
+
+! update memory variables based upon the Runge-Kutta scheme
+
+  if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+
+! use Runge-Kutta scheme to march in time
+  do i_sls = 1,N_SLS
+
+! get coefficients for that standard linear solid
+    factor_loc = mustore(i,j,k,ispec) * factor_common(iselected,i_sls)
+    b_alphaval_loc = b_alphaval(iselected,i_sls)
+    b_betaval_loc = b_betaval(iselected,i_sls)
+    b_gammaval_loc = b_gammaval(iselected,i_sls)
+
+! term in xx
+    b_Sn   = factor_loc * b_epsilondev_xx(i,j,k,ispec)
+    b_Snp1   = factor_loc * b_epsilondev_xx_loc(i,j,k)
+    b_R_xx(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_xx(i,j,k,ispec,i_sls) + b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
+
+! term in yy
+    b_Sn   = factor_loc * b_epsilondev_yy(i,j,k,ispec)
+    b_Snp1   = factor_loc * b_epsilondev_yy_loc(i,j,k)
+    b_R_yy(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_yy(i,j,k,ispec,i_sls) + b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
+
+! term in zz not computed since zero trace
+
+! term in xy
+    b_Sn   = factor_loc * b_epsilondev_xy(i,j,k,ispec)
+    b_Snp1   = factor_loc * b_epsilondev_xy_loc(i,j,k)
+    b_R_xy(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_xy(i,j,k,ispec,i_sls) + b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
+
+! term in xz
+    b_Sn   = factor_loc * b_epsilondev_xz(i,j,k,ispec)
+    b_Snp1   = factor_loc * b_epsilondev_xz_loc(i,j,k)
+    b_R_xz(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_xz(i,j,k,ispec,i_sls) + b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
+
+! term in yz
+    b_Sn   = factor_loc * b_epsilondev_yz(i,j,k,ispec)
+    b_Snp1   = factor_loc * b_epsilondev_yz_loc(i,j,k)
+    b_R_yz(i,j,k,ispec,i_sls) = b_alphaval_loc * b_R_yz(i,j,k,ispec,i_sls) + b_betaval_loc * b_Sn + b_gammaval_loc * b_Snp1
+
+    enddo   ! end of loop on memory variables
+
+  endif  !  end attenuation
+
+        enddo
+      enddo
+    enddo
+
+    endif ! of test if SIMULATION_TYPE == 3
+
+!----------------------------------------------------------------------------------------------
+! end of nested loops on i,j,k to perform the backward calculations in a given element (ispec)
+!----------------------------------------------------------------------------------------------
+
+! save deviatoric strain for Runge-Kutta scheme
+  if(ATTENUATION_VAL .and. not_fully_in_bedrock(ispec)) then
+    epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+    epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+    epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+    epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+    epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+    if (SIMULATION_TYPE == 3) then
+      b_epsilondev_xx(:,:,:,ispec) = b_epsilondev_xx_loc(:,:,:)
+      b_epsilondev_yy(:,:,:,ispec) = b_epsilondev_yy_loc(:,:,:)
+      b_epsilondev_xy(:,:,:,ispec) = b_epsilondev_xy_loc(:,:,:)
+      b_epsilondev_xz(:,:,:,ispec) = b_epsilondev_xz_loc(:,:,:)
+      b_epsilondev_yz(:,:,:,ispec) = b_epsilondev_yz_loc(:,:,:)
+    endif
+  endif
+
+  enddo   ! of the spectral element loop
+
+! add Stacey conditions
+
+  if(ABSORBING_CONDITIONS) then
+
+!   xmin
+    if (SIMULATION_TYPE == 3 .and. nspec2D_xmin > 0)  then
+      read(31,rec=NSTEP-it+1) reclen1,absorb_xmin,reclen2
+      if (reclen1 /= reclen_xmin .or. reclen1 /= reclen2)  call exit_mpi(myrank,'Error reading absorbing contribution absorb_xmin')
+    endif
+    do ispec2D=1,nspec2D_xmin
+
+      ispec=ibelm_xmin(ispec2D)
+
+! exclude elements that are not on absorbing edges
+    if(nkmin_xi(1,ispec2D) == 0 .or. njmin(1,ispec2D) == 0) cycle
+
+      i=1
+      do k=nkmin_xi(1,ispec2D),NGLLZ
+        do j=njmin(1,ispec2D),njmax(1,ispec2D)
+          iglob=ibool(i,j,k,ispec)
+
+          vx=veloc(1,iglob)
+          vy=veloc(2,iglob)
+          vz=veloc(3,iglob)
+
+          nx=normal_xmin(1,j,k,ispec2D)
+          ny=normal_xmin(2,j,k,ispec2D)
+          nz=normal_xmin(3,j,k,ispec2D)
+
+          vn=vx*nx+vy*ny+vz*nz
+
+          tx=rho_vp(i,j,k,ispec)*vn*nx+rho_vs(i,j,k,ispec)*(vx-vn*nx)
+          ty=rho_vp(i,j,k,ispec)*vn*ny+rho_vs(i,j,k,ispec)*(vy-vn*ny)
+          tz=rho_vp(i,j,k,ispec)*vn*nz+rho_vs(i,j,k,ispec)*(vz-vn*nz)
+
+          weight=jacobian2D_xmin(j,k,ispec2D)*wgllwgll_yz(j,k)
+
+          accel(1,iglob)=accel(1,iglob) - tx*weight
+          accel(2,iglob)=accel(2,iglob) - ty*weight
+          accel(3,iglob)=accel(3,iglob) - tz*weight
+
+          if (SIMULATION_TYPE == 3) then
+            b_accel(1,iglob)=b_accel(1,iglob) - absorb_xmin(1,j,k,ispec2D)
+            b_accel(2,iglob)=b_accel(2,iglob) - absorb_xmin(2,j,k,ispec2D)
+            b_accel(3,iglob)=b_accel(3,iglob) - absorb_xmin(3,j,k,ispec2D)
+          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            absorb_xmin(1,j,k,ispec2D) = tx*weight
+            absorb_xmin(2,j,k,ispec2D) = ty*weight
+            absorb_xmin(3,j,k,ispec2D) = tz*weight
+          endif
+
+        enddo
+      enddo
+    enddo
+
+    if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmin > 0 ) write(31,rec=it) reclen_xmin,absorb_xmin,reclen_xmin
+
+!   xmax
+    if (SIMULATION_TYPE == 3 .and. nspec2D_xmax > 0)  then
+      read(32,rec=NSTEP-it+1) reclen1,absorb_xmax,reclen2
+      if (reclen1 /= reclen_xmax .or. reclen1 /= reclen2)  call exit_mpi(myrank,'Error reading absorbing contribution absorb_xmax')
+    endif
+    do ispec2D=1,nspec2D_xmax
+      ispec=ibelm_xmax(ispec2D)
+
+! exclude elements that are not on absorbing edges
+    if(nkmin_xi(2,ispec2D) == 0 .or. njmin(2,ispec2D) == 0) cycle
+
+      i=NGLLX
+      do k=nkmin_xi(2,ispec2D),NGLLZ
+        do j=njmin(2,ispec2D),njmax(2,ispec2D)
+          iglob=ibool(i,j,k,ispec)
+
+          vx=veloc(1,iglob)
+          vy=veloc(2,iglob)
+          vz=veloc(3,iglob)
+
+          nx=normal_xmax(1,j,k,ispec2D)
+          ny=normal_xmax(2,j,k,ispec2D)
+          nz=normal_xmax(3,j,k,ispec2D)
+
+          vn=vx*nx+vy*ny+vz*nz
+
+          tx=rho_vp(i,j,k,ispec)*vn*nx+rho_vs(i,j,k,ispec)*(vx-vn*nx)
+          ty=rho_vp(i,j,k,ispec)*vn*ny+rho_vs(i,j,k,ispec)*(vy-vn*ny)
+          tz=rho_vp(i,j,k,ispec)*vn*nz+rho_vs(i,j,k,ispec)*(vz-vn*nz)
+
+          weight=jacobian2D_xmax(j,k,ispec2D)*wgllwgll_yz(j,k)
+
+          accel(1,iglob)=accel(1,iglob) - tx*weight
+          accel(2,iglob)=accel(2,iglob) - ty*weight
+          accel(3,iglob)=accel(3,iglob) - tz*weight
+
+          if (SIMULATION_TYPE == 3) then
+            b_accel(1,iglob)=b_accel(1,iglob) - absorb_xmax(1,j,k,ispec2D)
+            b_accel(2,iglob)=b_accel(2,iglob) - absorb_xmax(2,j,k,ispec2D)
+            b_accel(3,iglob)=b_accel(3,iglob) - absorb_xmax(3,j,k,ispec2D)
+          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            absorb_xmax(1,j,k,ispec2D) = tx*weight
+            absorb_xmax(2,j,k,ispec2D) = ty*weight
+            absorb_xmax(3,j,k,ispec2D) = tz*weight
+          endif
+
+        enddo
+      enddo
+    enddo
+
+    if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmax > 0 ) write(32,rec=it) reclen_xmax,absorb_xmax,reclen_xmax
+
+!   ymin
+    if (SIMULATION_TYPE == 3 .and. nspec2D_ymin > 0)  then
+      read(33,rec=NSTEP-it+1) reclen1,absorb_ymin,reclen2
+      if (reclen1 /= reclen_ymin .or. reclen1 /= reclen2)  call exit_mpi(myrank,'Error reading absorbing contribution absorb_ymin')
+    endif
+    do ispec2D=1,nspec2D_ymin
+
+      ispec=ibelm_ymin(ispec2D)
+
+! exclude elements that are not on absorbing edges
+    if(nkmin_eta(1,ispec2D) == 0 .or. nimin(1,ispec2D) == 0) cycle
+
+      j=1
+      do k=nkmin_eta(1,ispec2D),NGLLZ
+        do i=nimin(1,ispec2D),nimax(1,ispec2D)
+          iglob=ibool(i,j,k,ispec)
+
+          vx=veloc(1,iglob)
+          vy=veloc(2,iglob)
+          vz=veloc(3,iglob)
+
+          nx=normal_ymin(1,i,k,ispec2D)
+          ny=normal_ymin(2,i,k,ispec2D)
+          nz=normal_ymin(3,i,k,ispec2D)
+
+          vn=vx*nx+vy*ny+vz*nz
+
+          tx=rho_vp(i,j,k,ispec)*vn*nx+rho_vs(i,j,k,ispec)*(vx-vn*nx)
+          ty=rho_vp(i,j,k,ispec)*vn*ny+rho_vs(i,j,k,ispec)*(vy-vn*ny)
+          tz=rho_vp(i,j,k,ispec)*vn*nz+rho_vs(i,j,k,ispec)*(vz-vn*nz)
+
+          weight=jacobian2D_ymin(i,k,ispec2D)*wgllwgll_xz(i,k)
+
+          accel(1,iglob)=accel(1,iglob) - tx*weight
+          accel(2,iglob)=accel(2,iglob) - ty*weight
+          accel(3,iglob)=accel(3,iglob) - tz*weight
+
+          if (SIMULATION_TYPE == 3) then
+            b_accel(1,iglob)=b_accel(1,iglob) - absorb_ymin(1,i,k,ispec2D)
+            b_accel(2,iglob)=b_accel(2,iglob) - absorb_ymin(2,i,k,ispec2D)
+            b_accel(3,iglob)=b_accel(3,iglob) - absorb_ymin(3,i,k,ispec2D)
+          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            absorb_ymin(1,i,k,ispec2D) = tx*weight
+            absorb_ymin(2,i,k,ispec2D) = ty*weight
+            absorb_ymin(3,i,k,ispec2D) = tz*weight
+          endif
+
+        enddo
+      enddo
+    enddo
+
+   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymin > 0 ) write(33,rec=it) reclen_ymin,absorb_ymin,reclen_ymin
+
+!   ymax
+    if (SIMULATION_TYPE == 3 .and. nspec2D_ymax > 0)  then
+      read(34,rec=NSTEP-it+1) reclen1,absorb_ymax,reclen2
+      if (reclen1 /= reclen_ymax .or. reclen1 /= reclen2)  call exit_mpi(myrank,'Error reading absorbing contribution absorb_ymax')
+    endif
+    do ispec2D=1,nspec2D_ymax
+
+      ispec=ibelm_ymax(ispec2D)
+
+! exclude elements that are not on absorbing edges
+    if(nkmin_eta(2,ispec2D) == 0 .or. nimin(2,ispec2D) == 0) cycle
+
+      j=NGLLY
+      do k=nkmin_eta(2,ispec2D),NGLLZ
+        do i=nimin(2,ispec2D),nimax(2,ispec2D)
+          iglob=ibool(i,j,k,ispec)
+
+          vx=veloc(1,iglob)
+          vy=veloc(2,iglob)
+          vz=veloc(3,iglob)
+
+          nx=normal_ymax(1,i,k,ispec2D)
+          ny=normal_ymax(2,i,k,ispec2D)
+          nz=normal_ymax(3,i,k,ispec2D)
+
+          vn=vx*nx+vy*ny+vz*nz
+
+          tx=rho_vp(i,j,k,ispec)*vn*nx+rho_vs(i,j,k,ispec)*(vx-vn*nx)
+          ty=rho_vp(i,j,k,ispec)*vn*ny+rho_vs(i,j,k,ispec)*(vy-vn*ny)
+          tz=rho_vp(i,j,k,ispec)*vn*nz+rho_vs(i,j,k,ispec)*(vz-vn*nz)
+
+          weight=jacobian2D_ymax(i,k,ispec2D)*wgllwgll_xz(i,k)
+
+          accel(1,iglob)=accel(1,iglob) - tx*weight
+          accel(2,iglob)=accel(2,iglob) - ty*weight
+          accel(3,iglob)=accel(3,iglob) - tz*weight
+
+          if (SIMULATION_TYPE == 3) then
+            b_accel(1,iglob)=b_accel(1,iglob) - absorb_ymax(1,i,k,ispec2D)
+            b_accel(2,iglob)=b_accel(2,iglob) - absorb_ymax(2,i,k,ispec2D)
+            b_accel(3,iglob)=b_accel(3,iglob) - absorb_ymax(3,i,k,ispec2D)
+          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            absorb_ymax(1,i,k,ispec2D) = tx*weight
+            absorb_ymax(2,i,k,ispec2D) = ty*weight
+            absorb_ymax(3,i,k,ispec2D) = tz*weight
+          endif
+
+        enddo
+      enddo
+    enddo
+
+    if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymax > 0) write(34,rec=it) reclen_ymax,absorb_ymax,reclen_ymax
+
+!   bottom (zmin)
+    if (SIMULATION_TYPE == 3 .and. NSPEC2D_BOTTOM > 0)  then
+      read(35,rec=NSTEP-it+1) reclen1,absorb_zmin,reclen2
+      if (reclen1 /= reclen_zmin .or. reclen1 /= reclen2)  call exit_mpi(myrank,'Error reading absorbing contribution absorb_zmin')
+    endif
+    do ispec2D=1,NSPEC2D_BOTTOM
+
+      ispec=ibelm_bottom(ispec2D)
+
+      k=1
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          iglob=ibool(i,j,k,ispec)
+
+          vx=veloc(1,iglob)
+          vy=veloc(2,iglob)
+          vz=veloc(3,iglob)
+
+          nx=normal_bottom(1,i,j,ispec2D)
+          ny=normal_bottom(2,i,j,ispec2D)
+          nz=normal_bottom(3,i,j,ispec2D)
+
+          vn=vx*nx+vy*ny+vz*nz
+
+          tx=rho_vp(i,j,k,ispec)*vn*nx+rho_vs(i,j,k,ispec)*(vx-vn*nx)
+          ty=rho_vp(i,j,k,ispec)*vn*ny+rho_vs(i,j,k,ispec)*(vy-vn*ny)
+          tz=rho_vp(i,j,k,ispec)*vn*nz+rho_vs(i,j,k,ispec)*(vz-vn*nz)
+
+          weight=jacobian2D_bottom(i,j,ispec2D)*wgllwgll_xy(i,j)
+
+          accel(1,iglob)=accel(1,iglob) - tx*weight
+          accel(2,iglob)=accel(2,iglob) - ty*weight
+          accel(3,iglob)=accel(3,iglob) - tz*weight
+
+          if (SIMULATION_TYPE == 3) then
+            b_accel(1,iglob)=b_accel(1,iglob) - absorb_zmin(1,i,j,ispec2D)
+            b_accel(2,iglob)=b_accel(2,iglob) - absorb_zmin(2,i,j,ispec2D)
+            b_accel(3,iglob)=b_accel(3,iglob) - absorb_zmin(3,i,j,ispec2D)
+          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            absorb_zmin(1,i,j,ispec2D) = tx*weight
+            absorb_zmin(2,i,j,ispec2D) = ty*weight
+            absorb_zmin(3,i,j,ispec2D) = tz*weight
+          endif
+
+        enddo
+      enddo
+    enddo
+
+    if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. NSPEC2D_BOTTOM > 0) write(35,rec=it) reclen_zmin,absorb_zmin,reclen_zmin
+
+  endif  ! end of Stacey conditions
+
+
+  if (SIMULATION_TYPE == 1) then
+
+  do isource = 1,NSOURCES
+
+!   add the source (only if this proc carries the source)
+    if(myrank == islice_selected_source(isource)) then
+
+      stf = comp_source_time_function(dble(it-1)*DT-t0-t_cmt(isource),hdur_gaussian(isource))
+
+!     distinguish between single and double precision for reals
+      if(CUSTOM_REAL == SIZE_REAL) then
+        stf_used = sngl(stf)
+      else
+        stf_used = stf
+      endif
+
+!     add source array
+      do k=1,NGLLZ
+        do j=1,NGLLY
+          do i=1,NGLLX
+            iglob = ibool(i,j,k,ispec_selected_source(isource))
+            accel(:,iglob) = accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
+          enddo
+        enddo
+      enddo
+
+    endif
+
+  enddo
+
+  endif
+
+  if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
+
+    irec_local = 0
+    do irec = 1,nrec
+
+!   add the source (only if this proc carries the source)
+      if(myrank == islice_selected_rec(irec)) then
+
+        irec_local = irec_local + 1
+!     add source array
+        do k = 1,NGLLZ
+        do j=1,NGLLY
+          do i=1,NGLLX
+            iglob = ibool(i,j,k,ispec_selected_rec(irec))
+            accel(:,iglob) = accel(:,iglob) + adj_sourcearrays(irec_local,NSTEP-it+1,:,i,j,k)
+          enddo
+        enddo
+        enddo
+
+      endif
+
+    enddo
+
+  endif
+
+  if (SIMULATION_TYPE == 3) then
+    do isource = 1,NSOURCES
+
+!   add the source (only if this proc carries the source)
+    if(myrank == islice_selected_source(isource)) then
+
+      stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-t_cmt(isource),hdur_gaussian(isource))
+
+!     distinguish between single and double precision for reals
+      if(CUSTOM_REAL == SIZE_REAL) then
+        stf_used = sngl(stf)
+      else
+        stf_used = stf
+      endif
+
+!     add source array
+      do k=1,NGLLZ
+        do j=1,NGLLY
+          do i=1,NGLLX
+            iglob = ibool(i,j,k,ispec_selected_source(isource))
+            b_accel(:,iglob) = b_accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
+          enddo
+        enddo
+      enddo
+
+    endif
+
+  enddo
+
+  endif
+
+
+! assemble all the contributions between slices using MPI
+  call assemble_MPI_vector(accel,iproc_xi,iproc_eta,addressing, &
+            iboolleft_xi,iboolright_xi,iboolleft_eta,iboolright_eta, &
+            buffer_send_faces_vector,buffer_received_faces_vector,npoin2D_xi,npoin2D_eta, &
+            NPROC_XI,NPROC_ETA,NPOIN2DMAX_XMIN_XMAX,NPOIN2DMAX_YMIN_YMAX,NPOIN2DMAX_XY)
+  if (SIMULATION_TYPE == 3) call assemble_MPI_vector(b_accel,iproc_xi,iproc_eta,addressing, &
+          iboolleft_xi,iboolright_xi,iboolleft_eta,iboolright_eta, &
+          buffer_send_faces_vector,buffer_received_faces_vector,npoin2D_xi,npoin2D_eta, &
+          NPROC_XI,NPROC_ETA,NPOIN2DMAX_XMIN_XMAX,NPOIN2DMAX_YMIN_YMAX,NPOIN2DMAX_XY)
+
+  do i=1,NGLOB_AB
+    accel(1,i) = accel(1,i)*rmass(i)
+    accel(2,i) = accel(2,i)*rmass(i)
+    accel(3,i) = accel(3,i)*rmass(i)
+  enddo
+  if (SIMULATION_TYPE == 3) then
+    do i=1,NGLOB_AB
+      b_accel(1,i) = b_accel(1,i)*rmass(i)
+      b_accel(2,i) = b_accel(2,i)*rmass(i)
+      b_accel(3,i) = b_accel(3,i)*rmass(i)
+    enddo
+  endif
+
+  if(OCEANS) then
+
+!   initialize the updates
+    updated_dof_ocean_load(:) = .false.
+
+! for surface elements exactly at the top of the model (ocean bottom)
+    do ispec2D = 1,NSPEC2D_TOP
+
+      ispec = ibelm_top(ispec2D)
+
+! only for DOFs exactly at the top of the model (ocean bottom)
+      k = NGLLZ
+
+      do j = 1,NGLLY
+        do i = 1,NGLLX
+
+! get global point number
+          iglob = ibool(i,j,k,ispec)
+
+! only update once
+          if(.not. updated_dof_ocean_load(iglob)) then
+
+! get normal
+            nx = normal_top(1,i,j,ispec2D)
+            ny = normal_top(2,i,j,ispec2D)
+            nz = normal_top(3,i,j,ispec2D)
+
+! make updated component of right-hand side
+! we divide by rmass() which is 1 / M
+! we use the total force which includes the Coriolis term above
+            force_normal_comp = (accel(1,iglob)*nx + &
+                 accel(2,iglob)*ny + accel(3,iglob)*nz) / rmass(iglob)
+
+            additional_term = (rmass_ocean_load(iglob) - rmass(iglob)) * force_normal_comp
+
+            accel(1,iglob) = accel(1,iglob) + additional_term * nx
+            accel(2,iglob) = accel(2,iglob) + additional_term * ny
+            accel(3,iglob) = accel(3,iglob) + additional_term * nz
+
+            if (SIMULATION_TYPE == 3) then
+              b_force_normal_comp = (b_accel(1,iglob)*nx + &
+                    b_accel(2,iglob)*ny + b_accel(3,iglob)*nz) / rmass(iglob)
+
+              b_additional_term = (rmass_ocean_load(iglob) - rmass(iglob)) * b_force_normal_comp
+
+              b_accel(1,iglob) = b_accel(1,iglob) + b_additional_term * nx
+              b_accel(2,iglob) = b_accel(2,iglob) + b_additional_term * ny
+              b_accel(3,iglob) = b_accel(3,iglob) + b_additional_term * nz
+            endif
+
+!           done with this point
+            updated_dof_ocean_load(iglob) = .true.
+
+          endif
+
+        enddo
+      enddo
+    enddo
+  endif
+
+  do i=1,NGLOB_AB
+    veloc(:,i) = veloc(:,i) + deltatover2*accel(:,i)
+  enddo
+  if (SIMULATION_TYPE == 3) then
+    do i=1,NGLOB_AB
+      b_veloc(:,i) = b_veloc(:,i) + b_deltatover2*b_accel(:,i)
+    enddo
+  endif
+
+! write the seismograms with time shift
+  if (nrec_local > 0) then
+  do irec_local = 1,nrec_local
+
+! get global number of that receiver
+    irec = number_receiver_global(irec_local)
+
+! perform the general interpolation using Lagrange polynomials
+    dxd = ZERO
+    dyd = ZERO
+    dzd = ZERO
+
+    vxd = ZERO
+    vyd = ZERO
+    vzd = ZERO
+
+    axd = ZERO
+    ayd = ZERO
+    azd = ZERO
+    if (SIMULATION_TYPE == 1)  then
+
+      do k = 1,NGLLZ
+        do j = 1,NGLLY
+          do i = 1,NGLLX
+
+! receivers are always located at the surface of the mesh
+            iglob = ibool(i,j,k,ispec_selected_rec(irec))
+
+            hlagrange = hxir_store(irec_local,i)*hetar_store(irec_local,j)*hgammar_store(irec_local,k)
+
+
+! save displacement
+            dxd = dxd + dble(displ(1,iglob))*hlagrange
+            dyd = dyd + dble(displ(2,iglob))*hlagrange
+            dzd = dzd + dble(displ(3,iglob))*hlagrange
+
+! save velocity
+            vxd = vxd + dble(veloc(1,iglob))*hlagrange
+            vyd = vyd + dble(veloc(2,iglob))*hlagrange
+            vzd = vzd + dble(veloc(3,iglob))*hlagrange
+
+! save acceleration
+            axd = axd + dble(accel(1,iglob))*hlagrange
+            ayd = ayd + dble(accel(2,iglob))*hlagrange
+            azd = azd + dble(accel(3,iglob))*hlagrange
+
+          enddo
+        enddo
+      enddo
+
+    else if (SIMULATION_TYPE == 2) then
+
+      do k = 1,NGLLZ
+        do j = 1,NGLLY
+          do i = 1,NGLLX
+
+            iglob = ibool(i,j,k,ispec_selected_source(irec))
+
+            hlagrange = hxir_store(irec_local,i)*hetar_store(irec_local,j)*hgammar_store(irec_local,k)
+
+            dxd = dxd + dble(displ(1,iglob))*hlagrange
+            dyd = dyd + dble(displ(2,iglob))*hlagrange
+            dzd = dzd + dble(displ(3,iglob))*hlagrange
+            vxd = vxd + dble(veloc(1,iglob))*hlagrange
+            vyd = vyd + dble(veloc(2,iglob))*hlagrange
+            vzd = vzd + dble(veloc(3,iglob))*hlagrange
+            axd = axd + dble(accel(1,iglob))*hlagrange
+            ayd = ayd + dble(accel(2,iglob))*hlagrange
+            azd = azd + dble(accel(3,iglob))*hlagrange
+
+            displ_s(:,i,j,k) = displ(:,iglob)
+
+          enddo
+        enddo
+      enddo
+
+      ispec = ispec_selected_source(irec)
+
+      call compute_adj_source_frechet(displ_s,Mxx(irec),Myy(irec),Mzz(irec),Mxy(irec),Mxz(irec),Myz(irec),eps_s,eps_m_s, &
+           hxir_store(irec_local,:),hetar_store(irec_local,:),hgammar_store(irec_local,:), &
+           hpxir_store(irec_local,:),hpetar_store(irec_local,:),hpgammar_store(irec_local,:),hprime_xx,hprime_yy,hprime_zz, &
+           xix(:,:,:,ispec),xiy(:,:,:,ispec),xiz(:,:,:,ispec),etax(:,:,:,ispec),etay(:,:,:,ispec),etaz(:,:,:,ispec), &
+           gammax(:,:,:,ispec),gammay(:,:,:,ispec),gammaz(:,:,:,ispec))
+
+      stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-t_cmt(irec),hdur_gaussian(irec))
+      stf_deltat = stf * deltat
+      Mxx_der(irec_local) = Mxx_der(irec_local) + eps_s(1,1) * stf_deltat
+      Myy_der(irec_local) = Myy_der(irec_local) + eps_s(2,2) * stf_deltat
+      Mzz_der(irec_local) = Mzz_der(irec_local) + eps_s(3,3) * stf_deltat
+      Mxy_der(irec_local) = Mxy_der(irec_local) + 2 * eps_s(1,2) * stf_deltat
+      Mxz_der(irec_local) = Mxz_der(irec_local) + 2 * eps_s(1,3) * stf_deltat
+      Myz_der(irec_local) = Myz_der(irec_local) + 2 * eps_s(2,3) * stf_deltat
+
+      sloc_der(:,irec_local) = sloc_der(:,irec_local) + eps_m_s(:) * stf_deltat
+
+    else if (SIMULATION_TYPE == 3) then
+
+      do k = 1,NGLLZ
+      do j = 1,NGLLY
+        do i = 1,NGLLX
+
+          iglob = ibool(i,j,k,ispec_selected_rec(irec))
+
+          hlagrange = hxir_store(irec_local,i)*hetar_store(irec_local,j)*hgammar_store(irec_local,k)
+
+          dxd = dxd + dble(b_displ(1,iglob))*hlagrange
+          dyd = dyd + dble(b_displ(2,iglob))*hlagrange
+          dzd = dzd + dble(b_displ(3,iglob))*hlagrange
+          vxd = vxd + dble(b_veloc(1,iglob))*hlagrange
+          vyd = vyd + dble(b_veloc(2,iglob))*hlagrange
+          vzd = vzd + dble(b_veloc(3,iglob))*hlagrange
+          axd = axd + dble(b_accel(1,iglob))*hlagrange
+          ayd = ayd + dble(b_accel(2,iglob))*hlagrange
+          azd = azd + dble(b_accel(3,iglob))*hlagrange
+        enddo
+      enddo
+      enddo
+    endif
+
+
+
+! store North, East and Vertical components
+
+! distinguish between single and double precision for reals
+      if(CUSTOM_REAL == SIZE_REAL) then
+        seismograms_d(:,irec_local,it) = sngl((nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd))
+        seismograms_v(:,irec_local,it) = sngl((nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd))
+        seismograms_a(:,irec_local,it) = sngl((nu(:,1,irec)*axd + nu(:,2,irec)*ayd + nu(:,3,irec)*azd))
+      else
+        seismograms_d(:,irec_local,it) = (nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd)
+        seismograms_v(:,irec_local,it) = (nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd)
+        seismograms_a(:,irec_local,it) = (nu(:,1,irec)*axd + nu(:,2,irec)*ayd + nu(:,3,irec)*azd)
+      endif
+
+      if (SIMULATION_TYPE == 2) then
+        seismograms_eps(:,:,irec_local,it) = eps_s(:,:)
+      endif
+
+
+  enddo
+
+! write the current or final seismograms
+  if(mod(it,NTSTEP_BETWEEN_OUTPUT_SEISMOS) == 0 .or. it == NSTEP) then
+    if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+      call write_seismograms(myrank,seismograms_d,number_receiver_global,station_name, &
+            network_name,nrec,nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,1)
+      call write_seismograms(myrank,seismograms_v,number_receiver_global,station_name, &
+            network_name,nrec,nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,2)
+      call write_seismograms(myrank,seismograms_a,number_receiver_global,station_name, &
+            network_name,nrec,nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,3)
+    else
+      call write_adj_seismograms(myrank,seismograms_d,number_receiver_global, &
+            nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,1)
+    endif
+  endif
+
+  endif ! nrec_local
+
+! resetting d/v/a/R/eps for the backward reconstruction with attenuation
+  if (ATTENUATION .and. it > 1 .and. it < NSTEP) then
+  if (SIMULATION_TYPE == 3 .and. mod(NSTEP-it,NSTEP_Q_SAVE) == 0) then
+    write(outputname,"('save_Q_arrays_',i6.6,'.bin')") NSTEP-it
+    open(unit=27,file=trim(prname_Q)//trim(outputname),status='old',action='read',form='unformatted')
+    read(27) b_displ
+    read(27) b_veloc
+    read(27) b_accel
+    read(27) b_R_xx
+    read(27) b_R_yy
+    read(27) b_R_xy
+    read(27) b_R_xz
+    read(27) b_R_yz
+    read(27) b_epsilondev_xx
+    read(27) b_epsilondev_yy
+    read(27) b_epsilondev_xy
+    read(27) b_epsilondev_xz
+    read(27) b_epsilondev_yz
+    close(27)
+  else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. mod(it,NSTEP_Q_SAVE) == 0) then
+    write(outputname,"('save_Q_arrays_',i6.6,'.bin')") it
+    open(unit=27,file=trim(prname_Q)//trim(outputname),status='unknown',action='write',form='unformatted')
+    write(27) displ
+    write(27) veloc
+    write(27) accel
+    write(27) R_xx
+    write(27) R_yy
+    write(27) R_xy
+    write(27) R_xz
+    write(27) R_yz
+    write(27) epsilondev_xx
+    write(27) epsilondev_yy
+    write(27) epsilondev_xy
+    write(27) epsilondev_xz
+    write(27) epsilondev_yz
+    close(27)
+  endif
+  endif
+
+! kernel calculations
+  if (SIMULATION_TYPE == 3) then
+    do ispec = 1, NSPEC_AB
+      do k = 1, NGLLZ
+        do j = 1, NGLLY
+          do i = 1, NGLLX
+            iglob = ibool(i,j,k,ispec)
+            rho_kl(i,j,k,ispec) =  rho_kl(i,j,k,ispec) + deltat * dot_product(accel(:,iglob), b_displ(:,iglob))
+          enddo
+        enddo
+      enddo
+    enddo
+
+    if (SAVE_MOHO_MESH) then
+      do ispec2D = 1, nspec2D_moho
+        ispec_top = ibelm_moho_top(ispec2D)
+        ispec_bot = ibelm_moho_bot(ispec2D)
+        do j = 1, NGLLY
+          do i = 1, NGLLX
+            iglob_top = ibool(i,j,k_top,ispec_top)
+
+            call compute_boundary_kernel(kernel_moho_top, &
+                       mustore(i,j,k_top,ispec_top), kappastore(i,j,k_top,ispec_top), rho_vs(i,j,k_top,ispec_top), &
+                       accel(:,iglob_top),b_displ(:,iglob_top),dsdx_top(:,:,i,j,k_top,ispec2D), b_dsdx_top(:,:,i,j,k_top,ispec2D), &
+                       normal_moho(:,i,j,ispec2D))
+
+            iglob_bot = ibool(i,j,k_bot,ispec_bot)
+            ! iglob_top == iglob_bot!
+
+            call compute_boundary_kernel(kernel_moho_bot, &
+                       mustore(i,j,k_bot,ispec_bot), kappastore(i,j,k_bot,ispec_bot), rho_vs(i,j,k_bot,ispec_bot), &
+                       accel(:,iglob_bot),b_displ(:,iglob_bot),dsdx_bot(:,:,i,j,k_bot,ispec2D), b_dsdx_bot(:,:,i,j,k_bot,ispec2D), &
+                       normal_moho(:,i,j,ispec2D))
+
+            moho_kl(i,j,ispec2D) = moho_kl(i,j,ispec2D) + (kernel_moho_top - kernel_moho_bot) * deltat
+
+          enddo
+        enddo
+      enddo
+    endif
+  endif
+
+! save MOVIE on the SURFACE
+  if(MOVIE_SURFACE .and. mod(it,NTSTEP_BETWEEN_FRAMES) == 0) then
+
+! get coordinates of surface mesh and surface displacement
+    ipoin = 0
+
+   k = NGLLZ
+   if (USE_HIGHRES_FOR_MOVIES) then
+     do ispec2D = 1,NSPEC2D_TOP
+       ispec = ibelm_top(ispec2D)
+       do j = 1,NGLLY
+         do i = 1,NGLLX
+           ipoin = ipoin + 1
+           iglob = ibool(i,j,k,ispec)
+           store_val_x(ipoin) = xstore(iglob)
+           store_val_y(ipoin) = ystore(iglob)
+           store_val_z(ipoin) = zstore(iglob)
+           if(SAVE_DISPLACEMENT) then
+             store_val_ux(ipoin) = displ(1,iglob)
+             store_val_uy(ipoin) = displ(2,iglob)
+             store_val_uz(ipoin) = displ(3,iglob)
+           else
+             store_val_ux(ipoin) = veloc(1,iglob)
+             store_val_uy(ipoin) = veloc(2,iglob)
+             store_val_uz(ipoin) = veloc(3,iglob)
+           endif
+         enddo
+       enddo
+     enddo ! ispec_top
+   else
+     do ispec2D = 1,NSPEC2D_TOP
+       ispec = ibelm_top(ispec2D)
+       do iloc = 1, NGNOD2D
+         ipoin = ipoin + 1
+         iglob = ibool(iorderi(iloc),iorderj(iloc),k,ispec)
+         store_val_x(ipoin) = xstore(iglob)
+         store_val_y(ipoin) = ystore(iglob)
+         store_val_z(ipoin) = zstore(iglob)
+         if(SAVE_DISPLACEMENT) then
+           store_val_ux(ipoin) = displ(1,iglob)
+           store_val_uy(ipoin) = displ(2,iglob)
+           store_val_uz(ipoin) = displ(3,iglob)
+         else
+           store_val_ux(ipoin) = veloc(1,iglob)
+           store_val_uy(ipoin) = veloc(2,iglob)
+           store_val_uz(ipoin) = veloc(3,iglob)
+         endif
+       enddo
+     enddo ! ispec_top
+   endif
+
+    ispec = nmovie_points
+
+    call gather_all_cr(store_val_x,ispec,store_val_x_all,ispec,NPROC)
+    call gather_all_cr(store_val_y,ispec,store_val_y_all,ispec,NPROC)
+    call gather_all_cr(store_val_z,ispec,store_val_z_all,ispec,NPROC)
+    call gather_all_cr(store_val_ux,ispec,store_val_ux_all,ispec,NPROC)
+    call gather_all_cr(store_val_uy,ispec,store_val_uy_all,ispec,NPROC)
+    call gather_all_cr(store_val_uz,ispec,store_val_uz_all,ispec,NPROC)
+
+! save movie data to disk in home directory
+    if(myrank == 0) then
+      write(outputname,"('/moviedata',i6.6)") it
+      open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown',form='unformatted')
+      write(IOUT) store_val_x_all
+      write(IOUT) store_val_y_all
+      write(IOUT) store_val_z_all
+      write(IOUT) store_val_ux_all
+      write(IOUT) store_val_uy_all
+      write(IOUT) store_val_uz_all
+      close(IOUT)
+    endif
+
+  endif
+
+! compute SHAKING INTENSITY MAP
+ if(CREATE_SHAKEMAP) then
+
+    ipoin = 0
+    k = NGLLZ
+
+! save all points for high resolution, or only four corners for low resolution
+    if(USE_HIGHRES_FOR_MOVIES) then
+
+    do ispec2D = 1,NSPEC2D_TOP
+      ispec = ibelm_top(ispec2D)
+
+! loop on all the points inside the element
+      do j = 1,NGLLY
+        do i = 1,NGLLX
+          ipoin = ipoin + 1
+          iglob = ibool(i,j,k,ispec)
+          store_val_x(ipoin) = xstore(iglob)
+          store_val_y(ipoin) = ystore(iglob)
+          store_val_z(ipoin) = zstore(iglob)
+          store_val_norm_displ(ipoin) = max(store_val_norm_displ(ipoin),abs(displ(1,iglob)),abs(displ(2,iglob)))
+          store_val_norm_veloc(ipoin) = max(store_val_norm_veloc(ipoin),abs(veloc(1,iglob)),abs(veloc(2,iglob)))
+          store_val_norm_accel(ipoin) = max(store_val_norm_accel(ipoin),abs(accel(1,iglob)),abs(accel(2,iglob)))
+        enddo
+      enddo
+    enddo
+
+    else
+      do ispec2D = 1,NSPEC2D_TOP
+        ispec = ibelm_top(ispec2D)
+        do iloc = 1, NGNOD2D
+          ipoin = ipoin + 1
+          iglob = ibool(iorderi(iloc),iorderj(iloc),k,ispec)
+          store_val_x(ipoin) = xstore(iglob)
+          store_val_y(ipoin) = ystore(iglob)
+          store_val_z(ipoin) = zstore(iglob)
+          store_val_norm_displ(ipoin) = max(store_val_norm_displ(ipoin),abs(displ(1,iglob)),abs(displ(2,iglob)))
+          store_val_norm_veloc(ipoin) = max(store_val_norm_veloc(ipoin),abs(veloc(1,iglob)),abs(veloc(2,iglob)))
+          store_val_norm_accel(ipoin) = max(store_val_norm_accel(ipoin),abs(accel(1,iglob)),abs(accel(2,iglob)))
+        enddo
+      enddo
+    endif
+
+! save shakemap only at the end of the simulation
+    if(it == NSTEP) then
+    ispec = nmovie_points
+    call gather_all_cr(store_val_x,ispec,store_val_x_all,ispec,NPROC)
+    call gather_all_cr(store_val_y,ispec,store_val_y_all,ispec,NPROC)
+    call gather_all_cr(store_val_z,ispec,store_val_z_all,ispec,NPROC)
+    call gather_all_cr(store_val_norm_displ,ispec,store_val_ux_all,ispec,NPROC)
+    call gather_all_cr(store_val_norm_veloc,ispec,store_val_uy_all,ispec,NPROC)
+    call gather_all_cr(store_val_norm_accel,ispec,store_val_uz_all,ispec,NPROC)
+
+! save movie data to disk in home directory
+    if(myrank == 0) then
+      open(unit=IOUT,file=trim(OUTPUT_FILES)//'/shakingdata',status='unknown',form='unformatted')
+      write(IOUT) store_val_x_all
+      write(IOUT) store_val_y_all
+      write(IOUT) store_val_z_all
+! this saves norm of displacement, velocity and acceleration
+! but we use the same ux, uy, uz arrays as for the movies to save memory
+      write(IOUT) store_val_ux_all
+      write(IOUT) store_val_uy_all
+      write(IOUT) store_val_uz_all
+      close(IOUT)
+    endif
+
+    endif
+  endif
+
+! save MOVIE in full 3D MESH
+  if(MOVIE_VOLUME .and. mod(it,NTSTEP_BETWEEN_FRAMES) == 0) then
+
+! save velocity here to avoid static offset on displacement for movies
+
+! save full snapshot data to local disk
+
+! calculate strain div and curl
+    do ispec=1,NSPEC_AB
+
+       tempx1l(:,:,:) = 0._CUSTOM_REAL
+       tempx2l(:,:,:) = 0._CUSTOM_REAL
+       tempx3l(:,:,:) = 0._CUSTOM_REAL
+       
+       tempy1l(:,:,:) = 0._CUSTOM_REAL
+       tempy2l(:,:,:) = 0._CUSTOM_REAL
+       tempy3l(:,:,:) = 0._CUSTOM_REAL
+
+       tempz1l(:,:,:) = 0._CUSTOM_REAL
+       tempz2l(:,:,:) = 0._CUSTOM_REAL
+       tempz3l(:,:,:) = 0._CUSTOM_REAL   
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          do l=1,NGLLX
+            hp1 = hprime_xx(i,l)
+            iglob = ibool(l,j,k,ispec)
+            tempx1l(i,j,k) = tempx1l(i,j,k) + veloc(1,iglob)*hp1
+            tempy1l(i,j,k) = tempy1l(i,j,k) + veloc(2,iglob)*hp1
+            tempz1l(i,j,k) = tempz1l(i,j,k) + veloc(3,iglob)*hp1
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            hp2 = hprime_yy(j,l)
+            iglob = ibool(i,l,k,ispec)
+            tempx2l(i,j,k) = tempx2l(i,j,k) + veloc(1,iglob)*hp2
+            tempy2l(i,j,k) = tempy2l(i,j,k) + veloc(2,iglob)*hp2
+            tempz2l (i,j,k)= tempz2l(i,j,k) + veloc(3,iglob)*hp2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            hp3 = hprime_zz(k,l)
+            iglob = ibool(i,j,l,ispec)
+            tempx3l(i,j,k) = tempx3l(i,j,k) + veloc(1,iglob)*hp3
+            tempy3l(i,j,k) = tempy3l(i,j,k) + veloc(2,iglob)*hp3
+            tempz3l(i,j,k) = tempz3l(i,j,k) + veloc(3,iglob)*hp3
+          enddo
+
+!         get derivatives of ux, uy and uz with respect to x, y and z
+
+          xixl = xix(i,j,k,ispec)
+          xiyl = xiy(i,j,k,ispec)
+          xizl = xiz(i,j,k,ispec)
+          etaxl = etax(i,j,k,ispec)
+          etayl = etay(i,j,k,ispec)
+          etazl = etaz(i,j,k,ispec)
+          gammaxl = gammax(i,j,k,ispec)
+          gammayl = gammay(i,j,k,ispec)
+          gammazl = gammaz(i,j,k,ispec)
+
+          dvxdxl(i,j,k) = xixl*tempx1l(i,j,k) + etaxl*tempx2l(i,j,k) + gammaxl*tempx3l(i,j,k)
+          dvxdyl(i,j,k) = xiyl*tempx1l(i,j,k) + etayl*tempx2l(i,j,k) + gammayl*tempx3l(i,j,k)
+          dvxdzl(i,j,k) = xizl*tempx1l(i,j,k) + etazl*tempx2l(i,j,k) + gammazl*tempx3l(i,j,k)
+
+          dvydxl(i,j,k) = xixl*tempy1l(i,j,k) + etaxl*tempy2l(i,j,k) + gammaxl*tempy3l(i,j,k)
+          dvydyl(i,j,k) = xiyl*tempy1l(i,j,k) + etayl*tempy2l(i,j,k) + gammayl*tempy3l(i,j,k)
+          dvydzl(i,j,k) = xizl*tempy1l(i,j,k) + etazl*tempy2l(i,j,k) + gammazl*tempy3l(i,j,k)
+
+          dvzdxl(i,j,k) = xixl*tempz1l(i,j,k) + etaxl*tempz2l(i,j,k) + gammaxl*tempz3l(i,j,k)
+          dvzdyl(i,j,k) = xiyl*tempz1l(i,j,k) + etayl*tempz2l(i,j,k) + gammayl*tempz3l(i,j,k)
+          dvzdzl(i,j,k) = xizl*tempz1l(i,j,k) + etazl*tempz2l(i,j,k) + gammazl*tempz3l(i,j,k)
+
+        enddo
+      enddo
+    enddo
+
+      do k = 1,NGLLZ
+        do j = 1,NGLLY
+          do i = 1,NGLLX
+            div(i,j,k,ispec) = dvxdxl(i,j,k) + dvydyl(i,j,k) + dvzdzl(i,j,k)
+            curl_x(i,j,k,ispec) = dvzdyl(i,j,k) - dvydzl(i,j,k)
+            curl_y(i,j,k,ispec) = dvxdzl(i,j,k) - dvzdxl(i,j,k)
+            curl_z(i,j,k,ispec) = dvydxl(i,j,k) - dvxdyl(i,j,k)
+          enddo
+        enddo
+      enddo
+    enddo
+
+    write(outputname,"('div_proc',i6.6,'_it',i6.6,'.bin')") myrank,it
+    open(unit=27,file=trim(LOCAL_PATH)//trim(outputname),status='unknown',form='unformatted')
+    write(27) div
+    close(27)
+    write(outputname,"('curl_x_proc',i6.6,'_it',i6.6,'.bin')") myrank,it
+    open(unit=27,file=trim(LOCAL_PATH)//trim(outputname),status='unknown',form='unformatted')
+    write(27) curl_x
+    close(27)
+    write(outputname,"('curl_y_proc',i6.6,'_it',i6.6,'.bin')") myrank,it
+    open(unit=27,file=trim(LOCAL_PATH)//trim(outputname),status='unknown',form='unformatted')
+    write(27) curl_y
+    close(27)
+    write(outputname,"('curl_z_proc',i6.6,'_it',i6.6,'.bin')") myrank,it
+    open(unit=27,file=trim(LOCAL_PATH)//trim(outputname),status='unknown',form='unformatted')
+    write(27) curl_z
+    close(27)
+    write(outputname,"('veloc_proc',i6.6,'_it',i6.6,'.bin')") myrank,it
+    open(unit=27,file=trim(LOCAL_PATH)//trim(outputname),status='unknown',form='unformatted')
+    write(27) veloc
+    close(27)
+
+  endif
+
+!
+!---- end of time iteration loop
+!
+  enddo   ! end of main time loop
+
+! save last frame
+
+  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+    open(unit=27,file=prname(1:len_trim(prname))//'save_forward_arrays.bin',status='unknown',form='unformatted')
+    write(27) displ
+    write(27) veloc
+    write(27) accel
+    if (ATTENUATION) then
+      write(27) R_xx
+      write(27) R_yy
+      write(27) R_xy
+      write(27) R_xz
+      write(27) R_yz
+      write(27) epsilondev_xx
+      write(27) epsilondev_yy
+      write(27) epsilondev_xy
+      write(27) epsilondev_xz
+      write(27) epsilondev_yz
+    endif
+    close(27)
+
+  else if (SIMULATION_TYPE == 3) then
+
+    ! rhop, beta, alpha kernels
+    do ispec = 1, NSPEC_AB
+      do k = 1, NGLLZ
+        do j = 1, NGLLY
+          do i = 1, NGLLX
+            iglob = ibool(i,j,k,ispec)
+            rho_kl(i,j,k,ispec) = - rho_vs(i,j,k,ispec) **2  * rho_kl(i,j,k,ispec) / mustore(i,j,k,ispec)
+            mu_kl(i,j,k,ispec) = - mustore(i,j,k,ispec) * mu_kl(i,j,k,ispec)
+            kappa_kl(i,j,k,ispec) = - kappastore(i,j,k,ispec) * kappa_kl(i,j,k,ispec)
+            rhop_kl(i,j,k,ispec) = rho_kl(i,j,k,ispec) + kappa_kl(i,j,k,ispec) + mu_kl(i,j,k,ispec)
+            beta_kl(i,j,k,ispec) = 2._CUSTOM_REAL * (mu_kl(i,j,k,ispec) - 4._CUSTOM_REAL * mustore(i,j,k,ispec) &
+                  / (3._CUSTOM_REAL * kappastore(i,j,k,ispec)) * kappa_kl(i,j,k,ispec))
+            alpha_kl(i,j,k,ispec) = 2._CUSTOM_REAL * (1._CUSTOM_REAL + &
+                  4._CUSTOM_REAL * mustore(i,j,k,ispec)/ (3._CUSTOM_REAL * kappastore(i,j,k,ispec))) &
+                  * kappa_kl(i,j,k,ispec)
+          enddo
+        enddo
+      enddo
+    enddo
+
+! save kernels to binary files
+    open(unit=27,file=prname(1:len_trim(prname))//'rho_kernel.bin',status='unknown',form='unformatted')
+    write(27) rho_kl
+    close(27)
+    open(unit=27,file=prname(1:len_trim(prname))//'mu_kernel.bin',status='unknown',form='unformatted')
+    write(27) mu_kl
+    close(27)
+    open(unit=27,file=prname(1:len_trim(prname))//'kappa_kernel.bin',status='unknown',form='unformatted')
+    write(27) kappa_kl
+    close(27)
+    open(unit=27,file=prname(1:len_trim(prname))//'rhop_kernel.bin',status='unknown',form='unformatted')
+    write(27) rhop_kl
+    close(27)
+    open(unit=27,file=prname(1:len_trim(prname))//'beta_kernel.bin',status='unknown',form='unformatted')
+    write(27) beta_kl
+    close(27)
+    open(unit=27,file=prname(1:len_trim(prname))//'alpha_kernel.bin',status='unknown',form='unformatted')
+    write(27) alpha_kl
+    close(27)
+    if (SAVE_MOHO_MESH) then
+      open(unit=27,file=prname(1:len_trim(prname))//'moho_kernel.bin',status='unknown',form='unformatted')
+      write(27) moho_kl
+      close(27)
+    endif
+
+  endif
+
+  if(ABSORBING_CONDITIONS .and. (SIMULATION_TYPE == 3 .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+    if (nspec2D_xmin > 0) close(31)
+    if (nspec2D_xmax > 0) close(32)
+    if (nspec2D_ymin > 0) close(33)
+    if (nspec2D_ymax > 0) close(34)
+    if (NSPEC2D_BOTTOM > 0) close(35)
+  endif
+
+  if (nrec_local > 0) then
+    if (.not. (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3)) then
+!      call write_adj_seismograms(myrank,seismograms_d,number_receiver_global, &
+!          nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,1)
+      call write_adj_seismograms2(myrank,seismograms_eps,number_receiver_global, &
+            nrec_local,it,DT,NSTEP,t0,LOCAL_PATH)
+      do irec_local = 1, nrec_local
+        write(outputname,'(a,i5.5)') 'OUTPUT_FILES/src_frechet.',number_receiver_global(irec_local)
+        open(unit=27,file=trim(outputname),status='unknown')
+!
+! r -> z, theta -> -y, phi -> x
+!
+!  Mrr =  Mzz
+!  Mtt =  Myy
+!  Mpp =  Mxx
+!  Mrt = -Myz
+!  Mrp =  Mxz
+!  Mtp = -Mxy
+
+        write(27,*) Mzz_der(irec_local)
+        write(27,*) Myy_der(irec_local)
+        write(27,*) Mxx_der(irec_local)
+        write(27,*) -Myz_der(irec_local)
+        write(27,*) Mxz_der(irec_local)
+        write(27,*) -Mxy_der(irec_local)
+        write(27,*) sloc_der(1,irec_local)
+        write(27,*) sloc_der(2,irec_local)
+        write(27,*) sloc_der(3,irec_local)
+        close(27)
+      enddo
+    endif
+  endif
+
+! close the main output file
+  if(myrank == 0) then
+    write(IMAIN,*)
+    write(IMAIN,*) 'End of the simulation'
+    write(IMAIN,*)
+    close(IMAIN)
+  endif
+
+! synchronize all the processes to make sure everybody has finished
+  call sync_all()
+
+  end subroutine specfem3D
+