[cig-commits] r22609 - seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D

Sun Jul 14 09:21:08 PDT 2013

Author: dkomati1
Date: 2013-07-14 09:21:07 -0700 (Sun, 14 Jul 2013)
New Revision: 22609

Modified:
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90
Log:
done vectorizing compute_element.F90


Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90
===================================================================

--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90	2013-07-14 15:42:48 UTC (rev 22608)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90	2013-07-14 16:21:07 UTC (rev 22609)
@@ -110,14 +110,229 @@
   double precision Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl
 
   real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
-  integer :: i_SLS
 
-  integer :: i,j,k
   integer :: int_radius
   integer :: iglob
 
+#ifdef FORCE_VECTORIZATION
+  integer :: ijk
+#else
+  integer :: i,j,k
+  integer :: i_SLS
+#endif
+
   ! isotropic element
 
+#ifdef FORCE_VECTORIZATION
+! in this vectorized version we have to assume that N_SLS == 3 in order to be able to unroll and thus suppress
+! an inner loop that would otherwise prevent vectorization; this is safe in practice in all cases because N_SLS == 3
+! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop otherwise
+      do ijk=1,NGLLCUBE
+        ! get derivatives of ux, uy and uz with respect to x, y and z
+        xixl = xix(ijk,1,1,ispec)
+        xiyl = xiy(ijk,1,1,ispec)
+        xizl = xiz(ijk,1,1,ispec)
+        etaxl = etax(ijk,1,1,ispec)
+        etayl = etay(ijk,1,1,ispec)
+        etazl = etaz(ijk,1,1,ispec)
+        gammaxl = gammax(ijk,1,1,ispec)
+        gammayl = gammay(ijk,1,1,ispec)
+        gammazl = gammaz(ijk,1,1,ispec)
+
+        ! compute the jacobian
+        jacobianl = 1.0_CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
+                                     - xiyl*(etaxl*gammazl-etazl*gammaxl) &
+                                     + xizl*(etaxl*gammayl-etayl*gammaxl))
+
+        duxdxl = xixl*tempx1(ijk,1,1) + etaxl*tempx2(ijk,1,1) + gammaxl*tempx3(ijk,1,1)
+        duxdyl = xiyl*tempx1(ijk,1,1) + etayl*tempx2(ijk,1,1) + gammayl*tempx3(ijk,1,1)
+        duxdzl = xizl*tempx1(ijk,1,1) + etazl*tempx2(ijk,1,1) + gammazl*tempx3(ijk,1,1)
+
+        duydxl = xixl*tempy1(ijk,1,1) + etaxl*tempy2(ijk,1,1) + gammaxl*tempy3(ijk,1,1)
+        duydyl = xiyl*tempy1(ijk,1,1) + etayl*tempy2(ijk,1,1) + gammayl*tempy3(ijk,1,1)
+        duydzl = xizl*tempy1(ijk,1,1) + etazl*tempy2(ijk,1,1) + gammazl*tempy3(ijk,1,1)
+
+        duzdxl = xixl*tempz1(ijk,1,1) + etaxl*tempz2(ijk,1,1) + gammaxl*tempz3(ijk,1,1)
+        duzdyl = xiyl*tempz1(ijk,1,1) + etayl*tempz2(ijk,1,1) + gammayl*tempz3(ijk,1,1)
+        duzdzl = xizl*tempz1(ijk,1,1) + etazl*tempz2(ijk,1,1) + gammazl*tempz3(ijk,1,1)
+
+        ! precompute some sums to save CPU time
+        duxdxl_plus_duydyl = duxdxl + duydyl
+        duxdxl_plus_duzdzl = duxdxl + duzdzl
+        duydyl_plus_duzdzl = duydyl + duzdzl
+        duxdyl_plus_duydxl = duxdyl + duydxl
+        duzdxl_plus_duxdzl = duzdxl + duxdzl
+        duzdyl_plus_duydzl = duzdyl + duydzl
+
+        ! compute deviatoric strain
+!ZN beware, here the expression differs from the strain used in memory variable equation (6) in D. Komatitsch and J. Tromp 1999,
+!ZN here Brian Savage uses the engineering strain which are epsilon = 1/2*(grad U + (grad U)^T),
+!ZN where U is the displacement vector and grad the gradient operator, i.e. there is a 1/2 factor difference between the two.
+!ZN Both expressions are fine, but we need to keep in mind that if we put the 1/2 factor here there we need to remove it
+!ZN from the expression in which we use the strain later in the code.
+        if (COMPUTE_AND_STORE_STRAIN_VAL) then
+          templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+          eps_trace_over_3_loc(ijk,1,1) = templ
+          epsilondev_loc(1,ijk,1,1) = duxdxl - templ
+          epsilondev_loc(2,ijk,1,1) = duydyl - templ
+          epsilondev_loc(3,ijk,1,1) = 0.5 * duxdyl_plus_duydxl
+          epsilondev_loc(4,ijk,1,1) = 0.5 * duzdxl_plus_duxdzl
+          epsilondev_loc(5,ijk,1,1) = 0.5 * duzdyl_plus_duydzl
+        endif
+
+       ! precompute terms for attenuation if needed
+        if(ATTENUATION_3D_VAL) then
+          one_minus_sum_beta_use = one_minus_sum_beta(ijk,1,1,ispec)
+        else if(ATTENUATION_VAL) then
+          one_minus_sum_beta_use = one_minus_sum_beta(1,1,1,ispec)
+        endif
+
+        !
+        ! compute  isotropic  elements
+        !
+
+        ! layer with no transverse isotropy, use kappav and muv
+        kappal = kappavstore(ijk,1,1,ispec)
+        mul = muvstore(ijk,1,1,ispec)
+
+        ! use unrelaxed parameters if attenuation
+        if(ATTENUATION_VAL) mul = mul * one_minus_sum_beta_use
+
+        lambdalplus2mul = kappal + FOUR_THIRDS * mul
+        lambdal = lambdalplus2mul - 2.0_CUSTOM_REAL*mul
+
+        ! compute stress sigma
+        sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+        sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+        sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+        sigma_xy = mul*duxdyl_plus_duydxl
+        sigma_xz = mul*duzdxl_plus_duxdzl
+        sigma_yz = mul*duzdyl_plus_duydzl
+
+        ! subtract memory variables if attenuation
+        if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL) then
+! do NOT put this is a subroutine, otherwise the call to the subroutine prevents compilers from vectorizing the outer loop
+! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop in order to vectorize the outer loop
+          R_xx_val = R_memory(1,1,ijk,1,1,ispec)
+          R_yy_val = R_memory(2,1,ijk,1,1,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_memory(3,1,ijk,1,1,ispec)
+          sigma_xz = sigma_xz - R_memory(4,1,ijk,1,1,ispec)
+          sigma_yz = sigma_yz - R_memory(5,1,ijk,1,1,ispec)
+
+          R_xx_val = R_memory(1,2,ijk,1,1,ispec)
+          R_yy_val = R_memory(2,2,ijk,1,1,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_memory(3,2,ijk,1,1,ispec)
+          sigma_xz = sigma_xz - R_memory(4,2,ijk,1,1,ispec)
+          sigma_yz = sigma_yz - R_memory(5,2,ijk,1,1,ispec)
+
+          R_xx_val = R_memory(1,3,ijk,1,1,ispec)
+          R_yy_val = R_memory(2,3,ijk,1,1,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_memory(3,3,ijk,1,1,ispec)
+          sigma_xz = sigma_xz - R_memory(4,3,ijk,1,1,ispec)
+          sigma_yz = sigma_yz - R_memory(5,3,ijk,1,1,ispec)
+        endif ! ATTENUATION_VAL
+
+        ! define symmetric components of sigma for gravity
+        sigma_yx = sigma_xy
+        sigma_zx = sigma_xz
+        sigma_zy = sigma_yz
+
+        ! compute non-symmetric terms for gravity
+        if(GRAVITY_VAL) then
+            ! use mesh coordinates to get theta and phi
+            ! x y and z contain r theta and phi
+            iglob = ibool(ijk,1,1,ispec)
+
+            dtheta = dble(ystore(iglob))
+            dphi = dble(zstore(iglob))
+
+            cos_theta = dcos(dtheta)
+            sin_theta = dsin(dtheta)
+            cos_phi = dcos(dphi)
+            sin_phi = dsin(dphi)
+
+            cos_theta_sq = cos_theta*cos_theta
+            sin_theta_sq = sin_theta*sin_theta
+            cos_phi_sq = cos_phi*cos_phi
+            sin_phi_sq = sin_phi*sin_phi
+
+            ! get g, rho and dg/dr=dg
+            ! spherical components of the gravitational acceleration
+            ! for efficiency replace with lookup table every 100 m in radial direction
+            radius = dble(xstore(iglob))
+
+            int_radius = nint(10.d0 * radius * R_EARTH_KM )
+            minus_g = minus_gravity_table(int_radius)
+            minus_dg = minus_deriv_gravity_table(int_radius)
+            rho = density_table(int_radius)
+
+            ! Cartesian components of the gravitational acceleration
+            gxl = minus_g*sin_theta*cos_phi
+            gyl = minus_g*sin_theta*sin_phi
+            gzl = minus_g*cos_theta
+
+            ! Cartesian components of gradient of gravitational acceleration
+            ! obtained from spherical components
+            minus_g_over_radius = minus_g / radius
+            minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius
+
+            Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq
+            Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq
+            Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq
+            Hxyl = cos_phi*minus_dg_plus_g_over_radius*sin_phi*sin_theta_sq
+            Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta
+            Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta
+
+            ! get displacement and multiply by density to compute G tensor
+            sx_l = rho * dummyx_loc(ijk,1,1)
+            sy_l = rho * dummyy_loc(ijk,1,1)
+            sz_l = rho * dummyz_loc(ijk,1,1)
+
+            ! compute G tensor from s . g and add to sigma (not symmetric)
+            sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
+            sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
+            sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
+
+            sigma_xy = sigma_xy - sx_l * gyl
+            sigma_yx = sigma_yx - sy_l * gxl
+
+            sigma_xz = sigma_xz - sx_l * gzl
+            sigma_zx = sigma_zx - sz_l * gxl
+
+            sigma_yz = sigma_yz - sy_l * gzl
+            sigma_zy = sigma_zy - sz_l * gyl
+
+            ! precompute vector
+            factor = jacobianl * wgll_cube(ijk,1,1)
+            rho_s_H(1,ijk,1,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+            rho_s_H(2,ijk,1,1) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+            rho_s_H(3,ijk,1,1) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+        endif  ! end of section with gravity terms
+
+        ! form dot product with test vector, non-symmetric form
+        tempx1(ijk,1,1) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+        tempy1(ijk,1,1) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+        tempz1(ijk,1,1) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+        tempx2(ijk,1,1) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+        tempy2(ijk,1,1) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+        tempz2(ijk,1,1) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+        tempx3(ijk,1,1) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+        tempy3(ijk,1,1) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+        tempz3(ijk,1,1) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+      enddo
+#else
   do k=1,NGLLZ
     do j=1,NGLLY
       do i=1,NGLLX
@@ -311,6 +526,7 @@
       enddo ! NGLLX
     enddo ! NGLLY
   enddo ! NGLLZ
+#endif
 
   end subroutine compute_element_iso
 
@@ -418,14 +634,415 @@
   real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
 
   real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
-  integer :: i_SLS
 
-  integer :: i,j,k
   integer :: int_radius
   integer :: iglob
 
+#ifdef FORCE_VECTORIZATION
+  integer :: ijk
+#else
+  integer :: i,j,k
+  integer :: i_SLS
+#endif
+
   ! transverse isotropic element
 
+#ifdef FORCE_VECTORIZATION
+! in this vectorized version we have to assume that N_SLS == 3 in order to be able to unroll and thus suppress
+! an inner loop that would otherwise prevent vectorization; this is safe in practice in all cases because N_SLS == 3
+! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop otherwise
+      do ijk=1,NGLLCUBE
+        ! get derivatives of ux, uy and uz with respect to x, y and z
+        xixl = xix(ijk,1,1,ispec)
+        xiyl = xiy(ijk,1,1,ispec)
+        xizl = xiz(ijk,1,1,ispec)
+        etaxl = etax(ijk,1,1,ispec)
+        etayl = etay(ijk,1,1,ispec)
+        etazl = etaz(ijk,1,1,ispec)
+        gammaxl = gammax(ijk,1,1,ispec)
+        gammayl = gammay(ijk,1,1,ispec)
+        gammazl = gammaz(ijk,1,1,ispec)
+
+        ! compute the jacobian
+        jacobianl = 1.0_CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
+                      - xiyl*(etaxl*gammazl-etazl*gammaxl) &
+                      + xizl*(etaxl*gammayl-etayl*gammaxl))
+
+        duxdxl = xixl*tempx1(ijk,1,1) + etaxl*tempx2(ijk,1,1) + gammaxl*tempx3(ijk,1,1)
+        duxdyl = xiyl*tempx1(ijk,1,1) + etayl*tempx2(ijk,1,1) + gammayl*tempx3(ijk,1,1)
+        duxdzl = xizl*tempx1(ijk,1,1) + etazl*tempx2(ijk,1,1) + gammazl*tempx3(ijk,1,1)
+
+        duydxl = xixl*tempy1(ijk,1,1) + etaxl*tempy2(ijk,1,1) + gammaxl*tempy3(ijk,1,1)
+        duydyl = xiyl*tempy1(ijk,1,1) + etayl*tempy2(ijk,1,1) + gammayl*tempy3(ijk,1,1)
+        duydzl = xizl*tempy1(ijk,1,1) + etazl*tempy2(ijk,1,1) + gammazl*tempy3(ijk,1,1)
+
+        duzdxl = xixl*tempz1(ijk,1,1) + etaxl*tempz2(ijk,1,1) + gammaxl*tempz3(ijk,1,1)
+        duzdyl = xiyl*tempz1(ijk,1,1) + etayl*tempz2(ijk,1,1) + gammayl*tempz3(ijk,1,1)
+        duzdzl = xizl*tempz1(ijk,1,1) + etazl*tempz2(ijk,1,1) + gammazl*tempz3(ijk,1,1)
+
+        ! precompute some sums to save CPU time
+        duxdxl_plus_duydyl = duxdxl + duydyl
+        duxdxl_plus_duzdzl = duxdxl + duzdzl
+        duydyl_plus_duzdzl = duydyl + duzdzl
+        duxdyl_plus_duydxl = duxdyl + duydxl
+        duzdxl_plus_duxdzl = duzdxl + duxdzl
+        duzdyl_plus_duydzl = duzdyl + duydzl
+
+        ! compute deviatoric strain
+!ZN beware, here the expression differs from the strain used in memory variable equation (6) in D. Komatitsch and J. Tromp 1999,
+!ZN here Brian Savage uses the engineering strain which are epsilon = 1/2*(grad U + (grad U)^T),
+!ZN where U is the displacement vector and grad the gradient operator, i.e. there is a 1/2 factor difference between the two.
+!ZN Both expressions are fine, but we need to keep in mind that if we put the 1/2 factor here there we need to remove it
+!ZN from the expression in which we use the strain later in the code.
+        if (COMPUTE_AND_STORE_STRAIN_VAL) then
+          templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+          eps_trace_over_3_loc(ijk,1,1) = templ
+          epsilondev_loc(1,ijk,1,1) = duxdxl - templ
+          epsilondev_loc(2,ijk,1,1) = duydyl - templ
+          epsilondev_loc(3,ijk,1,1) = 0.5 * duxdyl_plus_duydxl
+          epsilondev_loc(4,ijk,1,1) = 0.5 * duzdxl_plus_duxdzl
+          epsilondev_loc(5,ijk,1,1) = 0.5 * duzdyl_plus_duydzl
+        endif
+
+        ! precompute terms for attenuation if needed
+        if(ATTENUATION_3D_VAL) then
+          one_minus_sum_beta_use = one_minus_sum_beta(ijk,1,1,ispec)
+        else if(ATTENUATION_VAL) then
+          one_minus_sum_beta_use = one_minus_sum_beta(1,1,1,ispec)
+        endif
+
+        !
+        ! compute either isotropic or anisotropic elements
+        !
+
+! note: the mesh is built such that anisotropic elements are created first in anisotropic layers,
+!           thus they are listed first ( see in create_regions_mesh.f90: perm_layer() ordering )
+!           this is therefore still in bounds of 1:NSPECMAX_TISO_MANTLE even if NSPECMAX_TISO is less than NSPEC
+
+        ! use kappa and mu from transversely isotropic model
+        kappavl = kappavstore(ijk,1,1,ispec)
+        muvl = muvstore(ijk,1,1,ispec)
+
+        kappahl = kappahstore(ijk,1,1,ispec)
+        muhl = muhstore(ijk,1,1,ispec)
+
+        ! use unrelaxed parameters if attenuation
+        ! eta does not need to be shifted since it is a ratio
+        if(ATTENUATION_VAL) then
+          muvl = muvl * one_minus_sum_beta_use
+          muhl = muhl * one_minus_sum_beta_use
+        endif
+
+        rhovpvsq = kappavl + FOUR_THIRDS * muvl  !!! that is C
+        rhovphsq = kappahl + FOUR_THIRDS * muhl  !!! that is A
+
+        rhovsvsq = muvl  !!! that is L
+        rhovshsq = muhl  !!! that is N
+
+        eta_aniso = eta_anisostore(ijk,1,1,ispec)  !!! that is  F / (A - 2 L)
+
+        ! use mesh coordinates to get theta and phi
+        ! ystore and zstore contain theta and phi
+        iglob = ibool(ijk,1,1,ispec)
+
+        theta = ystore(iglob)
+        phi = zstore(iglob)
+
+         ! precompute some products to reduce the CPU time
+
+        costheta = cos(theta)
+        sintheta = sin(theta)
+        cosphi = cos(phi)
+        sinphi = sin(phi)
+
+        costhetasq = costheta * costheta
+        sinthetasq = sintheta * sintheta
+        cosphisq = cosphi * cosphi
+        sinphisq = sinphi * sinphi
+
+        costhetafour = costhetasq * costhetasq
+        sinthetafour = sinthetasq * sinthetasq
+        cosphifour = cosphisq * cosphisq
+        sinphifour = sinphisq * sinphisq
+
+        costwotheta = cos(2.0_CUSTOM_REAL*theta)
+        sintwotheta = sin(2.0_CUSTOM_REAL*theta)
+        costwophi = cos(2.0_CUSTOM_REAL*phi)
+        sintwophi = sin(2.0_CUSTOM_REAL*phi)
+
+        cosfourtheta = cos(4.0_CUSTOM_REAL*theta)
+        cosfourphi = cos(4.0_CUSTOM_REAL*phi)
+
+        costwothetasq = costwotheta * costwotheta
+
+        costwophisq = costwophi * costwophi
+        sintwophisq = sintwophi * sintwophi
+
+        etaminone = eta_aniso - 1.0_CUSTOM_REAL
+        twoetaminone = 2.0_CUSTOM_REAL * eta_aniso - 1.0_CUSTOM_REAL
+
+        ! precompute some products to reduce the CPU time
+        two_eta_aniso = 2.0_CUSTOM_REAL*eta_aniso
+        four_eta_aniso = 4.0_CUSTOM_REAL*eta_aniso
+        six_eta_aniso = 6.0_CUSTOM_REAL*eta_aniso
+
+        two_rhovsvsq = 2.0_CUSTOM_REAL*rhovsvsq
+        two_rhovshsq = 2.0_CUSTOM_REAL*rhovshsq
+        four_rhovsvsq = 4.0_CUSTOM_REAL*rhovsvsq
+        four_rhovshsq = 4.0_CUSTOM_REAL*rhovshsq
+
+        ! pre-compute temporary values
+        templ1 = four_rhovsvsq - rhovpvsq + twoetaminone*rhovphsq - four_eta_aniso*rhovsvsq
+        templ1_cos = rhovphsq - rhovpvsq + costwotheta*templ1
+        templ2 = four_rhovsvsq - rhovpvsq - rhovphsq + two_eta_aniso*rhovphsq - four_eta_aniso*rhovsvsq
+        templ2_cos = rhovpvsq - rhovphsq + costwotheta*templ2
+        templ3 = rhovphsq + rhovpvsq - two_eta_aniso*rhovphsq + four_eta_aniso*rhovsvsq
+        templ3_two = templ3 - two_rhovshsq - two_rhovsvsq
+        templ3_cos = templ3_two + costwotheta*templ2
+
+        ! reordering operations to facilitate compilation, avoiding divisions, using locality for temporary values
+        c11 = rhovphsq*sinphifour &
+              + 2.0_CUSTOM_REAL*cosphisq*sinphisq* &
+              ( rhovphsq*costhetasq + sinthetasq*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq) ) &
+              + cosphifour*(rhovphsq*costhetafour &
+                + 2.0_CUSTOM_REAL*costhetasq*sinthetasq*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq) &
+                + rhovpvsq*sinthetafour)
+
+        c12 = 0.25_CUSTOM_REAL*costhetasq*(rhovphsq - two_rhovshsq)*(3.0_CUSTOM_REAL + cosfourphi) &
+              - four_rhovshsq*cosphisq*costhetasq*sinphisq &
+              + 0.03125_CUSTOM_REAL*rhovphsq*sintwophisq*(11.0_CUSTOM_REAL + cosfourtheta + 4.0*costwotheta) &
+              + eta_aniso*sinthetasq*(rhovphsq - two_rhovsvsq) &
+                         *(cosphifour + sinphifour + 2.0_CUSTOM_REAL*cosphisq*costhetasq*sinphisq) &
+              + rhovpvsq*cosphisq*sinphisq*sinthetafour &
+              - rhovsvsq*sintwophisq*sinthetafour
+
+        c13 = 0.125_CUSTOM_REAL*cosphisq*(rhovphsq + six_eta_aniso*rhovphsq + rhovpvsq - four_rhovsvsq &
+                    - 12.0_CUSTOM_REAL*eta_aniso*rhovsvsq + cosfourtheta*templ1) &
+              + sinphisq*(eta_aniso*costhetasq*(rhovphsq - two_rhovsvsq) + sinthetasq*(rhovphsq - two_rhovshsq))
+
+        ! uses temporary templ1 from c13
+        c15 = cosphi*costheta*sintheta* &
+              ( 0.5_CUSTOM_REAL*cosphisq* (rhovpvsq - rhovphsq + costwotheta*templ1) &
+                + etaminone*sinphisq*(rhovphsq - two_rhovsvsq))
+
+        c14 = costheta*sinphi*sintheta* &
+              ( 0.5_CUSTOM_REAL*cosphisq*(templ2_cos + four_rhovshsq - four_rhovsvsq) &
+                + sinphisq*(etaminone*rhovphsq + 2.0_CUSTOM_REAL*(rhovshsq - eta_aniso*rhovsvsq)) )
+
+        ! uses temporary templ2_cos from c14
+        c16 = 0.5_CUSTOM_REAL*cosphi*sinphi*sinthetasq* &
+              ( cosphisq*templ2_cos &
+                + 2.0_CUSTOM_REAL*etaminone*sinphisq*(rhovphsq - two_rhovsvsq) )
+
+        c22 = rhovphsq*cosphifour + 2.0_CUSTOM_REAL*cosphisq*sinphisq* &
+              (rhovphsq*costhetasq + sinthetasq*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)) &
+              + sinphifour* &
+              (rhovphsq*costhetafour + 2.0_CUSTOM_REAL*costhetasq*sinthetasq*(eta_aniso*rhovphsq  &
+                    + two_rhovsvsq - two_eta_aniso*rhovsvsq) + rhovpvsq*sinthetafour)
+
+        ! uses temporary templ1 from c13
+        c23 = 0.125_CUSTOM_REAL*sinphisq*(rhovphsq + six_eta_aniso*rhovphsq &
+                + rhovpvsq - four_rhovsvsq - 12.0_CUSTOM_REAL*eta_aniso*rhovsvsq + cosfourtheta*templ1) &
+              + cosphisq*(eta_aniso*costhetasq*(rhovphsq - two_rhovsvsq) + sinthetasq*(rhovphsq - two_rhovshsq))
+
+        ! uses temporary templ1 from c13
+        c24 = costheta*sinphi*sintheta* &
+              ( etaminone*cosphisq*(rhovphsq - two_rhovsvsq) &
+                + 0.5_CUSTOM_REAL*sinphisq*(rhovpvsq - rhovphsq + costwotheta*templ1) )
+
+        ! uses temporary templ2_cos from c14
+        c25 = cosphi*costheta*sintheta* &
+              ( cosphisq*(etaminone*rhovphsq + 2.0_CUSTOM_REAL*(rhovshsq - eta_aniso*rhovsvsq)) &
+                + 0.5_CUSTOM_REAL*sinphisq*(templ2_cos + four_rhovshsq - four_rhovsvsq) )
+
+        ! uses temporary templ2_cos from c14
+        c26 = 0.5_CUSTOM_REAL*cosphi*sinphi*sinthetasq* &
+              ( 2.0_CUSTOM_REAL*etaminone*cosphisq*(rhovphsq - two_rhovsvsq) &
+                + sinphisq*templ2_cos )
+
+        c33 = rhovpvsq*costhetafour &
+              + 2.0_CUSTOM_REAL*costhetasq*sinthetasq*(two_rhovsvsq + eta_aniso*(rhovphsq - two_rhovsvsq)) &
+              + rhovphsq*sinthetafour
+
+        ! uses temporary templ1_cos from c13
+        c34 = - 0.25_CUSTOM_REAL*sinphi*sintwotheta*templ1_cos
+
+        ! uses temporary templ1_cos from c34
+        c35 = - 0.25_CUSTOM_REAL*cosphi*sintwotheta*templ1_cos
+
+        ! uses temporary templ1_cos from c34
+        c36 = - 0.25_CUSTOM_REAL*sintwophi*sinthetasq*(templ1_cos - four_rhovshsq + four_rhovsvsq)
+
+        c44 = cosphisq*(rhovsvsq*costhetasq + rhovshsq*sinthetasq) &
+              + sinphisq*(rhovsvsq*costwothetasq + costhetasq*sinthetasq*templ3)
+
+        ! uses temporary templ3 from c44
+        c46 = - cosphi*costheta*sintheta* &
+                ( cosphisq*(rhovshsq - rhovsvsq) - 0.5_CUSTOM_REAL*sinphisq*templ3_cos  )
+
+        ! uses templ3 from c46
+        c45 = 0.25_CUSTOM_REAL*sintwophi*sinthetasq* &
+              (templ3_two + costwotheta*(rhovphsq + rhovpvsq - two_eta_aniso*rhovphsq + 4.0_CUSTOM_REAL*etaminone*rhovsvsq))
+
+        c55 = sinphisq*(rhovsvsq*costhetasq + rhovshsq*sinthetasq) &
+              + cosphisq*(rhovsvsq*costwothetasq &
+                  + costhetasq*sinthetasq*(rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq + four_eta_aniso*rhovsvsq) )
+
+        ! uses temporary templ3_cos from c46
+        c56 = costheta*sinphi*sintheta* &
+              ( 0.5_CUSTOM_REAL*cosphisq*templ3_cos + sinphisq*(rhovsvsq - rhovshsq) )
+
+        c66 = rhovshsq*costwophisq*costhetasq &
+              - 2.0_CUSTOM_REAL*cosphisq*costhetasq*sinphisq*(rhovphsq - two_rhovshsq) &
+              + 0.03125_CUSTOM_REAL*rhovphsq*sintwophisq*(11.0_CUSTOM_REAL + 4.0_CUSTOM_REAL*costwotheta + cosfourtheta) &
+              - 0.125_CUSTOM_REAL*rhovsvsq*sinthetasq* &
+              ( -6.0_CUSTOM_REAL - 2.0_CUSTOM_REAL*costwotheta - 2.0_CUSTOM_REAL*cosfourphi &
+                        + cos(4.0_CUSTOM_REAL*phi - 2.0_CUSTOM_REAL*theta) &
+                        + cos(2.0_CUSTOM_REAL*(2.0_CUSTOM_REAL*phi + theta)) ) &
+              + rhovpvsq*cosphisq*sinphisq*sinthetafour &
+              - 0.5_CUSTOM_REAL*eta_aniso*sintwophisq*sinthetafour*(rhovphsq - two_rhovsvsq)
+
+        ! general expression of stress tensor for full Cijkl with 21 coefficients
+        sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                 c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+
+        sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                 c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+
+        sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                 c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+
+        sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                 c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+
+        sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                 c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+
+        sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                 c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+        ! subtract memory variables if attenuation
+        if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL) then
+! do NOT put this is a subroutine, otherwise the call to the subroutine prevents compilers from vectorizing the outer loop
+! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop in order to vectorize the outer loop
+          R_xx_val = R_memory(1,1,ijk,1,1,ispec)
+          R_yy_val = R_memory(2,1,ijk,1,1,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_memory(3,1,ijk,1,1,ispec)
+          sigma_xz = sigma_xz - R_memory(4,1,ijk,1,1,ispec)
+          sigma_yz = sigma_yz - R_memory(5,1,ijk,1,1,ispec)
+
+          R_xx_val = R_memory(1,2,ijk,1,1,ispec)
+          R_yy_val = R_memory(2,2,ijk,1,1,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_memory(3,2,ijk,1,1,ispec)
+          sigma_xz = sigma_xz - R_memory(4,2,ijk,1,1,ispec)
+          sigma_yz = sigma_yz - R_memory(5,2,ijk,1,1,ispec)
+
+          R_xx_val = R_memory(1,3,ijk,1,1,ispec)
+          R_yy_val = R_memory(2,3,ijk,1,1,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_memory(3,3,ijk,1,1,ispec)
+          sigma_xz = sigma_xz - R_memory(4,3,ijk,1,1,ispec)
+          sigma_yz = sigma_yz - R_memory(5,3,ijk,1,1,ispec)
+        endif ! ATTENUATION_VAL
+
+        ! define symmetric components of sigma for gravity
+        sigma_yx = sigma_xy
+        sigma_zx = sigma_xz
+        sigma_zy = sigma_yz
+
+        ! compute non-symmetric terms for gravity
+        if(GRAVITY_VAL) then
+            ! use mesh coordinates to get theta and phi
+            ! x y and z contain r theta and phi
+            iglob = ibool(ijk,1,1,ispec)
+
+            dtheta = dble(ystore(iglob))
+            dphi = dble(zstore(iglob))
+            radius = dble(xstore(iglob))
+
+            cos_theta = dcos(dtheta)
+            sin_theta = dsin(dtheta)
+            cos_phi = dcos(dphi)
+            sin_phi = dsin(dphi)
+
+            cos_theta_sq = cos_theta*cos_theta
+            sin_theta_sq = sin_theta*sin_theta
+            cos_phi_sq = cos_phi*cos_phi
+            sin_phi_sq = sin_phi*sin_phi
+
+            ! get g, rho and dg/dr=dg
+            ! spherical components of the gravitational acceleration
+            ! for efficiency replace with lookup table every 100 m in radial direction
+            int_radius = nint(10.d0 * radius * R_EARTH_KM )
+            minus_g = minus_gravity_table(int_radius)
+            minus_dg = minus_deriv_gravity_table(int_radius)
+            rho = density_table(int_radius)
+
+            ! Cartesian components of the gravitational acceleration
+            gxl = minus_g*sin_theta*cos_phi
+            gyl = minus_g*sin_theta*sin_phi
+            gzl = minus_g*cos_theta
+
+            ! Cartesian components of gradient of gravitational acceleration
+            ! obtained from spherical components
+            minus_g_over_radius = minus_g / radius
+            minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius
+
+            Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq
+            Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq
+            Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq
+            Hxyl = cos_phi*minus_dg_plus_g_over_radius*sin_phi*sin_theta_sq
+            Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta
+            Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta
+
+            ! get displacement and multiply by density to compute G tensor
+            sx_l = rho * dummyx_loc(ijk,1,1)
+            sy_l = rho * dummyy_loc(ijk,1,1)
+            sz_l = rho * dummyz_loc(ijk,1,1)
+
+            ! compute G tensor from s . g and add to sigma (not symmetric)
+            sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
+            sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
+            sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
+
+            sigma_xy = sigma_xy - sx_l * gyl
+            sigma_yx = sigma_yx - sy_l * gxl
+
+            sigma_xz = sigma_xz - sx_l * gzl
+            sigma_zx = sigma_zx - sz_l * gxl
+
+            sigma_yz = sigma_yz - sy_l * gzl
+            sigma_zy = sigma_zy - sz_l * gyl
+
+            ! precompute vector
+            factor = jacobianl * wgll_cube(ijk,1,1)
+            rho_s_H(1,ijk,1,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+            rho_s_H(2,ijk,1,1) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+            rho_s_H(3,ijk,1,1) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+        endif  ! end of section with gravity terms
+
+        ! form dot product with test vector, non-symmetric form
+        tempx1(ijk,1,1) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+        tempy1(ijk,1,1) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+        tempz1(ijk,1,1) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+        tempx2(ijk,1,1) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+        tempy2(ijk,1,1) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+        tempz2(ijk,1,1) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+        tempx3(ijk,1,1) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+        tempy3(ijk,1,1) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+        tempz3(ijk,1,1) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+      enddo
+#else
   do k=1,NGLLZ
     do j=1,NGLLY
       do i=1,NGLLX
@@ -805,6 +1422,7 @@
       enddo ! NGLLX
     enddo ! NGLLY
   enddo ! NGLLZ
+#endif
 
   end subroutine compute_element_tiso
 
@@ -903,14 +1521,265 @@
   real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
 
   real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
-  integer :: i_SLS
 
-  integer :: i,j,k
   integer :: int_radius
   integer :: iglob
 
+#ifdef FORCE_VECTORIZATION
+  integer :: ijk
+#else
+  integer :: i,j,k
+  integer :: i_SLS
+#endif
+
   ! anisotropic elements
 
+#ifdef FORCE_VECTORIZATION
+! in this vectorized version we have to assume that N_SLS == 3 in order to be able to unroll and thus suppress
+! an inner loop that would otherwise prevent vectorization; this is safe in practice in all cases because N_SLS == 3
+! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop otherwise
+      do ijk=1,NGLLCUBE
+        ! get derivatives of ux, uy and uz with respect to x, y and z
+        xixl = xix(ijk,1,1,ispec)
+        xiyl = xiy(ijk,1,1,ispec)
+        xizl = xiz(ijk,1,1,ispec)
+        etaxl = etax(ijk,1,1,ispec)
+        etayl = etay(ijk,1,1,ispec)
+        etazl = etaz(ijk,1,1,ispec)
+        gammaxl = gammax(ijk,1,1,ispec)
+        gammayl = gammay(ijk,1,1,ispec)
+        gammazl = gammaz(ijk,1,1,ispec)
+
+        ! compute the jacobian
+        jacobianl = 1.0_CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
+                      - xiyl*(etaxl*gammazl-etazl*gammaxl) &
+                      + xizl*(etaxl*gammayl-etayl*gammaxl))
+
+        duxdxl = xixl*tempx1(ijk,1,1) + etaxl*tempx2(ijk,1,1) + gammaxl*tempx3(ijk,1,1)
+        duxdyl = xiyl*tempx1(ijk,1,1) + etayl*tempx2(ijk,1,1) + gammayl*tempx3(ijk,1,1)
+        duxdzl = xizl*tempx1(ijk,1,1) + etazl*tempx2(ijk,1,1) + gammazl*tempx3(ijk,1,1)
+
+        duydxl = xixl*tempy1(ijk,1,1) + etaxl*tempy2(ijk,1,1) + gammaxl*tempy3(ijk,1,1)
+        duydyl = xiyl*tempy1(ijk,1,1) + etayl*tempy2(ijk,1,1) + gammayl*tempy3(ijk,1,1)
+        duydzl = xizl*tempy1(ijk,1,1) + etazl*tempy2(ijk,1,1) + gammazl*tempy3(ijk,1,1)
+
+        duzdxl = xixl*tempz1(ijk,1,1) + etaxl*tempz2(ijk,1,1) + gammaxl*tempz3(ijk,1,1)
+        duzdyl = xiyl*tempz1(ijk,1,1) + etayl*tempz2(ijk,1,1) + gammayl*tempz3(ijk,1,1)
+        duzdzl = xizl*tempz1(ijk,1,1) + etazl*tempz2(ijk,1,1) + gammazl*tempz3(ijk,1,1)
+
+        ! precompute some sums to save CPU time
+        duxdxl_plus_duydyl = duxdxl + duydyl
+        duxdxl_plus_duzdzl = duxdxl + duzdzl
+        duydyl_plus_duzdzl = duydyl + duzdzl
+        duxdyl_plus_duydxl = duxdyl + duydxl
+        duzdxl_plus_duxdzl = duzdxl + duxdzl
+        duzdyl_plus_duydzl = duzdyl + duydzl
+
+        ! compute deviatoric strain
+!ZN beware, here the expression differs from the strain used in memory variable equation (6) in D. Komatitsch and J. Tromp 1999,
+!ZN here Brian Savage uses the engineering strain which are epsilon = 1/2*(grad U + (grad U)^T),
+!ZN where U is the displacement vector and grad the gradient operator, i.e. there is a 1/2 factor difference between the two.
+!ZN Both expressions are fine, but we need to keep in mind that if we put the 1/2 factor here there we need to remove it
+!ZN from the expression in which we use the strain later in the code.
+        if (COMPUTE_AND_STORE_STRAIN_VAL) then
+          templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+          eps_trace_over_3_loc(ijk,1,1) = templ
+          epsilondev_loc(1,ijk,1,1) = duxdxl - templ
+          epsilondev_loc(2,ijk,1,1) = duydyl - templ
+          epsilondev_loc(3,ijk,1,1) = 0.5 * duxdyl_plus_duydxl
+          epsilondev_loc(4,ijk,1,1) = 0.5 * duzdxl_plus_duxdzl
+          epsilondev_loc(5,ijk,1,1) = 0.5 * duzdyl_plus_duydzl
+        endif
+
+        ! precompute terms for attenuation if needed
+        if(ATTENUATION_3D_VAL) then
+          one_minus_sum_beta_use = one_minus_sum_beta(ijk,1,1,ispec)
+          minus_sum_beta =  one_minus_sum_beta_use - 1.0
+        else if(ATTENUATION_VAL) then
+          one_minus_sum_beta_use = one_minus_sum_beta(1,1,1,ispec)
+          minus_sum_beta =  one_minus_sum_beta_use - 1.0
+        endif
+
+        !
+        ! compute anisotropic elements
+        !
+
+        c11 = c11store(ijk,1,1,ispec)
+        c12 = c12store(ijk,1,1,ispec)
+        c13 = c13store(ijk,1,1,ispec)
+        c14 = c14store(ijk,1,1,ispec)
+        c15 = c15store(ijk,1,1,ispec)
+        c16 = c16store(ijk,1,1,ispec)
+        c22 = c22store(ijk,1,1,ispec)
+        c23 = c23store(ijk,1,1,ispec)
+        c24 = c24store(ijk,1,1,ispec)
+        c25 = c25store(ijk,1,1,ispec)
+        c26 = c26store(ijk,1,1,ispec)
+        c33 = c33store(ijk,1,1,ispec)
+        c34 = c34store(ijk,1,1,ispec)
+        c35 = c35store(ijk,1,1,ispec)
+        c36 = c36store(ijk,1,1,ispec)
+        c44 = c44store(ijk,1,1,ispec)
+        c45 = c45store(ijk,1,1,ispec)
+        c46 = c46store(ijk,1,1,ispec)
+        c55 = c55store(ijk,1,1,ispec)
+        c56 = c56store(ijk,1,1,ispec)
+        c66 = c66store(ijk,1,1,ispec)
+
+        if(ATTENUATION_VAL) then
+          ! mul = c44
+          mul = c44 * minus_sum_beta
+          c11 = c11 + FOUR_THIRDS * mul ! * minus_sum_beta * mul
+          c12 = c12 - TWO_THIRDS * mul
+          c13 = c13 - TWO_THIRDS * mul
+          c22 = c22 + FOUR_THIRDS * mul
+          c23 = c23 - TWO_THIRDS * mul
+          c33 = c33 + FOUR_THIRDS * mul
+          c44 = c44 + mul
+          c55 = c55 + mul
+          c66 = c66 + mul
+        endif
+
+        sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                 c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+
+        sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                 c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+
+        sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                 c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+
+        sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                 c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+
+        sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                 c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+
+        sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                 c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+        ! subtract memory variables if attenuation
+        if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL) then
+! do NOT put this is a subroutine, otherwise the call to the subroutine prevents compilers from vectorizing the outer loop
+! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop in order to vectorize the outer loop
+          R_xx_val = R_memory(1,1,ijk,1,1,ispec)
+          R_yy_val = R_memory(2,1,ijk,1,1,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_memory(3,1,ijk,1,1,ispec)
+          sigma_xz = sigma_xz - R_memory(4,1,ijk,1,1,ispec)
+          sigma_yz = sigma_yz - R_memory(5,1,ijk,1,1,ispec)
+
+          R_xx_val = R_memory(1,2,ijk,1,1,ispec)
+          R_yy_val = R_memory(2,2,ijk,1,1,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_memory(3,2,ijk,1,1,ispec)
+          sigma_xz = sigma_xz - R_memory(4,2,ijk,1,1,ispec)
+          sigma_yz = sigma_yz - R_memory(5,2,ijk,1,1,ispec)
+
+          R_xx_val = R_memory(1,3,ijk,1,1,ispec)
+          R_yy_val = R_memory(2,3,ijk,1,1,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_memory(3,3,ijk,1,1,ispec)
+          sigma_xz = sigma_xz - R_memory(4,3,ijk,1,1,ispec)
+          sigma_yz = sigma_yz - R_memory(5,3,ijk,1,1,ispec)
+        endif ! ATTENUATION_VAL
+
+        ! define symmetric components of sigma for gravity
+        sigma_yx = sigma_xy
+        sigma_zx = sigma_xz
+        sigma_zy = sigma_yz
+
+        ! compute non-symmetric terms for gravity
+        if(GRAVITY_VAL) then
+            ! use mesh coordinates to get theta and phi
+            ! x y and z contain r theta and phi
+            iglob = ibool(ijk,1,1,ispec)
+
+            dtheta = dble(ystore(iglob))
+            dphi = dble(zstore(iglob))
+            radius = dble(xstore(iglob))
+
+            cos_theta = dcos(dtheta)
+            sin_theta = dsin(dtheta)
+            cos_phi = dcos(dphi)
+            sin_phi = dsin(dphi)
+
+            cos_theta_sq = cos_theta*cos_theta
+            sin_theta_sq = sin_theta*sin_theta
+            cos_phi_sq = cos_phi*cos_phi
+            sin_phi_sq = sin_phi*sin_phi
+
+            ! get g, rho and dg/dr=dg
+            ! spherical components of the gravitational acceleration
+            ! for efficiency replace with lookup table every 100 m in radial direction
+            int_radius = nint(10.d0 * radius * R_EARTH_KM )
+            minus_g = minus_gravity_table(int_radius)
+            minus_dg = minus_deriv_gravity_table(int_radius)
+            rho = density_table(int_radius)
+
+            ! Cartesian components of the gravitational acceleration
+            gxl = minus_g*sin_theta*cos_phi
+            gyl = minus_g*sin_theta*sin_phi
+            gzl = minus_g*cos_theta
+
+            ! Cartesian components of gradient of gravitational acceleration
+            ! obtained from spherical components
+            minus_g_over_radius = minus_g / radius
+            minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius
+
+            Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq
+            Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq
+            Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq
+            Hxyl = cos_phi*minus_dg_plus_g_over_radius*sin_phi*sin_theta_sq
+            Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta
+            Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta
+
+            ! get displacement and multiply by density to compute G tensor
+            sx_l = rho * dummyx_loc(ijk,1,1)
+            sy_l = rho * dummyy_loc(ijk,1,1)
+            sz_l = rho * dummyz_loc(ijk,1,1)
+
+            ! compute G tensor from s . g and add to sigma (not symmetric)
+            sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
+            sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
+            sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
+
+            sigma_xy = sigma_xy - sx_l * gyl
+            sigma_yx = sigma_yx - sy_l * gxl
+
+            sigma_xz = sigma_xz - sx_l * gzl
+            sigma_zx = sigma_zx - sz_l * gxl
+
+            sigma_yz = sigma_yz - sy_l * gzl
+            sigma_zy = sigma_zy - sz_l * gyl
+
+            ! precompute vector
+            factor = jacobianl * wgll_cube(ijk,1,1)
+            rho_s_H(1,ijk,1,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+            rho_s_H(2,ijk,1,1) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+            rho_s_H(3,ijk,1,1) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+        endif  ! end of section with gravity terms
+
+        ! form dot product with test vector, non-symmetric form
+        tempx1(ijk,1,1) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+        tempy1(ijk,1,1) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+        tempz1(ijk,1,1) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+        tempx2(ijk,1,1) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+        tempy2(ijk,1,1) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+        tempz2(ijk,1,1) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+        tempx3(ijk,1,1) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+        tempy3(ijk,1,1) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+        tempz3(ijk,1,1) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+      enddo
+#else
   do k=1,NGLLZ
     do j=1,NGLLY
       do i=1,NGLLX
@@ -1140,6 +2009,7 @@
       enddo ! NGLLX
     enddo ! NGLLY
   enddo ! NGLLZ
+#endif
 
   end subroutine compute_element_aniso
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90	2013-07-14 15:42:48 UTC (rev 22608)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90	2013-07-14 16:21:07 UTC (rev 22609)
@@ -2475,6 +2475,12 @@
 ! get MPI starting time
   time_start = MPI_WTIME()
 
+#ifdef FORCE_VECTORIZATION
+  if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL .and. N_SLS /= 3) &
+      stop 'FORCE_VECTORIZATION can only be used with N_SLS == 3 when ATTENUATION .and. .not. PARTIAL_PHYS_DISPERSION_ONLY&
+              & because N_SLS is assumed to be equal to 3 when vectorizing compute_element_iso,tiso,aniso'
+#endif
+
 ! *********************************************************
 ! ************* MAIN LOOP OVER THE TIME STEPS *************
 ! *********************************************************