[cig-commits] [commit] devel: updates Deville routines, uses subroutines to remove equivalence-statements (c96657c)

Fri Nov 28 14:14:14 PST 2014

Repository : https://github.com/geodynamics/specfem3d

On branch  : devel
Link       : https://github.com/geodynamics/specfem3d/compare/9a84d06c76e869f5276019e4f84affce23830a4d...8dce71b713c1fd0b510e7538fea0f2307c7b29e8

>---------------------------------------------------------------

commit c96657c48693aab4bcad535e527c0f63fe2bab70
Author: daniel peter <peterda at ethz.ch>
Date:   Fri Nov 28 12:42:07 2014 +0100

    updates Deville routines, uses subroutines to remove equivalence-statements


>---------------------------------------------------------------

c96657c48693aab4bcad535e527c0f63fe2bab70
 src/specfem3D/compute_forces_acoustic_Dev.F90     | 263 ++++-----
 src/specfem3D/compute_forces_viscoelastic_Dev.F90 | 623 +++++++++-------------
 2 files changed, 355 insertions(+), 531 deletions(-)

diff --git a/src/specfem3D/compute_forces_acoustic_Dev.F90 b/src/specfem3D/compute_forces_acoustic_Dev.F90
index 1201a60..1ab13be 100644
--- a/src/specfem3D/compute_forces_acoustic_Dev.F90
+++ b/src/specfem3D/compute_forces_acoustic_Dev.F90
@@ -86,6 +86,7 @@
   logical,intent(in) :: backward_simulation
 
   ! local parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: chi_elem
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1,tempx2,tempx3
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: newtempx1,newtempx2,newtempx3
 
@@ -95,55 +96,18 @@
 
   integer :: ispec,iglob,ispec_p,num_elements
 
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: chi_elem
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: E1_m1_m2_5points
-
-  equivalence(chi_elem,B1_m1_m2_5points)
-  equivalence(tempx1,C1_m1_m2_5points)
-  equivalence(newtempx1,E1_m1_m2_5points)
-
-  real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: A1_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: C1_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: E1_mxm_m2_m1_5points
-
-  equivalence(chi_elem,A1_mxm_m2_m1_5points)
-  equivalence(tempx3,C1_mxm_m2_m1_5points)
-  equivalence(newtempx3,E1_mxm_m2_m1_5points)
-
   ! CPML
   integer :: ispec_CPML
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: chi_elem_old,chi_elem_new
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1_old,tempx2_old,tempx3_old
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1_new,tempx2_new,tempx3_new
 
-  ! CPML Deville
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: chi_elem_old,chi_elem_new
-  real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1_m1_m2_5points_old,B1_m1_m2_5points_new
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points_old,C1_m1_m2_5points_new
-
-  equivalence(chi_elem_old,B1_m1_m2_5points_old)
-  equivalence(tempx1_old,C1_m1_m2_5points_old)
-
-  equivalence(chi_elem_new,B1_m1_m2_5points_new)
-  equivalence(tempx1_new,C1_m1_m2_5points_new)
-
-  real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: A1_mxm_m2_m1_5points_old,A1_mxm_m2_m1_5points_new
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: C1_mxm_m2_m1_5points_old,C1_mxm_m2_m1_5points_new
-
-  equivalence(chi_elem_old,A1_mxm_m2_m1_5points_old)
-  equivalence(tempx3_old,C1_mxm_m2_m1_5points_old)
-
-  equivalence(chi_elem_new,A1_mxm_m2_m1_5points_new)
-  equivalence(tempx3_new,C1_mxm_m2_m1_5points_new)
-
-  integer :: i,j,k
 #ifdef FORCE_VECTORIZATION
 ! this will (purposely) give out-of-bound array accesses if run through range checking,
 ! thus use only for production runs with no bound checking
   integer :: ijk
+#else
+  integer :: i,j,k
 #endif
 
 
@@ -166,42 +130,17 @@
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
-    do j=1,m2
-      do i=1,m1
-        C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
-                                hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
-                                hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
-                                hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
-                                hprime_xx(i,5)*B1_m1_m2_5points(5,j)
-      enddo
-    enddo
-
-    do k = 1,NGLLX
-      do j=1,m1
-        do i=1,m1
-          tempx2(i,j,k) = chi_elem(i,1,k)*hprime_xxT(1,j) + &
-                          chi_elem(i,2,k)*hprime_xxT(2,j) + &
-                          chi_elem(i,3,k)*hprime_xxT(3,j) + &
-                          chi_elem(i,4,k)*hprime_xxT(4,j) + &
-                          chi_elem(i,5,k)*hprime_xxT(5,j)
-        enddo
-      enddo
-    enddo
 
-    do j=1,m1
-      do i=1,m2
-        C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                    A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                    A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                    A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                    A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-      enddo
-    enddo
+    ! computes 1. matrix multiplication for tempx1,..
+    call mxm5_single(hprime_xx,m1,chi_elem,tempx1,m2)
+    ! computes 2. matrix multiplication for tempx2,..
+    call mxm5_3dmat_single(chi_elem,m1,hprime_xxT,m1,tempx2,NGLLX)
+    ! computes 3. matrix multiplication for tempx1,..
+    call mxm5_single(chi_elem,m2,hprime_xxT,tempx3,m1)
 
     if (PML_CONDITIONS .and. (.not. backward_simulation) .and. NSPEC_CPML > 0) then
       ! do not merge this second line with the first using an ".and." statement
       ! because array is_CPML() is unallocated when PML_CONDITIONS is false
-
       if (is_CPML(ispec)) then
       ! gets values for element
         DO_LOOP_IJK
@@ -212,56 +151,20 @@
         ! subroutines adapted from Deville, Fischer and Mund, High-order methods
         ! for incompressible fluid flow, Cambridge University Press (2002),
         ! pages 386 and 389 and Figure 8.3.1
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_5points_old(i,j) = hprime_xx(i,1)*B1_m1_m2_5points_old(1,j) + &
-                                        hprime_xx(i,2)*B1_m1_m2_5points_old(2,j) + &
-                                        hprime_xx(i,3)*B1_m1_m2_5points_old(3,j) + &
-                                        hprime_xx(i,4)*B1_m1_m2_5points_old(4,j) + &
-                                        hprime_xx(i,5)*B1_m1_m2_5points_old(5,j)
-
-            C1_m1_m2_5points_new(i,j) = hprime_xx(i,1)*B1_m1_m2_5points_new(1,j) + &
-                                        hprime_xx(i,2)*B1_m1_m2_5points_new(2,j) + &
-                                        hprime_xx(i,3)*B1_m1_m2_5points_new(3,j) + &
-                                        hprime_xx(i,4)*B1_m1_m2_5points_new(4,j) + &
-                                        hprime_xx(i,5)*B1_m1_m2_5points_new(5,j)
-          enddo
-        enddo
-
-        do k = 1,NGLLX
-          do j=1,m1
-            do i=1,m1
-              tempx2_old(i,j,k) = chi_elem_old(i,1,k)*hprime_xxT(1,j) + &
-                                  chi_elem_old(i,2,k)*hprime_xxT(2,j) + &
-                                  chi_elem_old(i,3,k)*hprime_xxT(3,j) + &
-                                  chi_elem_old(i,4,k)*hprime_xxT(4,j) + &
-                                  chi_elem_old(i,5,k)*hprime_xxT(5,j)
-
-              tempx2_new(i,j,k) = chi_elem_new(i,1,k)*hprime_xxT(1,j) + &
-                                  chi_elem_new(i,2,k)*hprime_xxT(2,j) + &
-                                  chi_elem_new(i,3,k)*hprime_xxT(3,j) + &
-                                  chi_elem_new(i,4,k)*hprime_xxT(4,j) + &
-                                  chi_elem_new(i,5,k)*hprime_xxT(5,j)
-            enddo
-          enddo
-        enddo
-
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_5points_old(i,j) = A1_mxm_m2_m1_5points_old(i,1)*hprime_xxT(1,j) + &
-                                            A1_mxm_m2_m1_5points_old(i,2)*hprime_xxT(2,j) + &
-                                            A1_mxm_m2_m1_5points_old(i,3)*hprime_xxT(3,j) + &
-                                            A1_mxm_m2_m1_5points_old(i,4)*hprime_xxT(4,j) + &
-                                            A1_mxm_m2_m1_5points_old(i,5)*hprime_xxT(5,j)
-
-            C1_mxm_m2_m1_5points_new(i,j) = A1_mxm_m2_m1_5points_new(i,1)*hprime_xxT(1,j) + &
-                                            A1_mxm_m2_m1_5points_new(i,2)*hprime_xxT(2,j) + &
-                                            A1_mxm_m2_m1_5points_new(i,3)*hprime_xxT(3,j) + &
-                                            A1_mxm_m2_m1_5points_new(i,4)*hprime_xxT(4,j) + &
-                                            A1_mxm_m2_m1_5points_new(i,5)*hprime_xxT(5,j)
-          enddo
-        enddo
 
+        ! computes 1. matrix multiplication for tempx1,..
+        call mxm5_single(hprime_xx,m1,chi_elem_old,tempx1_old,m2)
+        ! computes 2. matrix multiplication for tempx2,..
+        call mxm5_3dmat_single(chi_elem_old,m1,hprime_xxT,m1,tempx2_old,NGLLX)
+        ! computes 3. matrix multiplication for tempx1,..
+        call mxm5_single(chi_elem_old,m2,hprime_xxT,tempx3_old,m1)
+
+        ! computes 1. matrix multiplication for tempx1,..
+        call mxm5_single(hprime_xx,m1,chi_elem_new,tempx1_new,m2)
+        ! computes 2. matrix multiplication for tempx2,..
+        call mxm5_3dmat_single(chi_elem_new,m1,hprime_xxT,m1,tempx2_new,NGLLX)
+        ! computes 3. matrix multiplication for tempx1,..
+        call mxm5_single(chi_elem_new,m2,hprime_xxT,tempx3_new,m1)
       endif ! is_CPML
     endif ! PML_CONDITIONS
 
@@ -325,7 +228,8 @@
         ispec_CPML = spec_to_CPML(ispec)
 
         ! sets C-PML elastic memory variables to compute stress sigma and form dot product with test vector
-        call pml_compute_memory_variables_acoustic(ispec,ispec_CPML,tempx1,tempx2,tempx3,&
+        call pml_compute_memory_variables_acoustic(ispec,ispec_CPML, &
+                                                   tempx1,tempx2,tempx3,&
                                                    rmemory_dpotential_dxl,rmemory_dpotential_dyl,rmemory_dpotential_dzl)
 
         ! calculates contribution from each C-PML element to update acceleration
@@ -337,37 +241,13 @@
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
-    do j=1,m2
-      do i=1,m1
-        E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C1_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C1_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C1_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C1_m1_m2_5points(5,j)
-      enddo
-    enddo
-
-    do k = 1,NGLLX
-      do j=1,m1
-        do i=1,m1
-          newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempx2(i,5,k)*hprimewgll_xx(5,j)
-        enddo
-      enddo
-    enddo
 
-    do j=1,m1
-      do i=1,m2
-        E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-      enddo
-    enddo
+    ! computes 1. matrix multiplication for newtempx1,..
+    call mxm5_single(hprimewgll_xxT,m1,tempx1,newtempx1,m2)
+    ! computes 2. matrix multiplication for tempx2,..
+    call mxm5_3dmat_single(tempx2,m1,hprimewgll_xx,m1,newtempx2,NGLLX)
+    ! computes 3. matrix multiplication for newtempx3,..
+    call mxm5_single(tempx3,m2,hprimewgll_xx,newtempx3,m1)
 
     ! second double-loop over GLL to compute all the terms
 #ifdef FORCE_VECTORIZATION
@@ -405,5 +285,86 @@
 ! There is something to enforce explicitly only in the case of elastic elements, for which a Dirichlet
 ! condition is needed for the displacement vector, which is the vectorial unknown for these elements.
 
+  contains
+
+!--------------------------------------------------------------------------------------------
+!
+! matrix-matrix multiplications
+!
+! subroutines adapted from Deville, Fischer and Mund, High-order methods
+! for incompressible fluid flow, Cambridge University Press (2002),
+! pages 386 and 389 and Figure 8.3.1
+!
+!--------------------------------------------------------------------------------------------
+!
+! note: the matrix-matrix multiplications are used for very small matrices ( 5 x 5 x 5 elements);
+!       thus, calling external optimized libraries for these multiplications are in general slower
+!
+! please leave the routines here to help compilers inlining the code
+
+  subroutine mxm5_single(A,n1,B,C,n3)
+
+! 2-dimensional arrays (25,5)/(5,25)
+
+  use constants,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5),intent(in) :: A
+  real(kind=CUSTOM_REAL),dimension(5,n3),intent(in) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j = 1,n3
+    do i = 1,n1
+      C(i,j) =  A(i,1) * B(1,j) &
+              + A(i,2) * B(2,j) &
+              + A(i,3) * B(3,j) &
+              + A(i,4) * B(4,j) &
+              + A(i,5) * B(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_single
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3dmat_single(A,n1,B,n2,C,n3)
+
+! 3-dimensional arrays (5,5,5) for A and C
+
+  use constants,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n2,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5,n3),intent(in) :: A
+  real(kind=CUSTOM_REAL),dimension(5,n2),intent(in) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out) :: C
+
+  ! local parameters
+  integer :: i,j,k
+
+  ! matrix-matrix multiplication
+  do j = 1,n2
+    do i = 1,n1
+      ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+      do k = 1,n3
+        C(i,j,k) =  A(i,1,k) * B(1,j) &
+                  + A(i,2,k) * B(2,j) &
+                  + A(i,3,k) * B(3,j) &
+                  + A(i,4,k) * B(4,j) &
+                  + A(i,5,k) * B(5,j)
+      enddo
+    enddo
+  enddo
+
+  end subroutine mxm5_3dmat_single
+
   end subroutine compute_forces_acoustic_Dev
 
diff --git a/src/specfem3D/compute_forces_viscoelastic_Dev.F90 b/src/specfem3D/compute_forces_viscoelastic_Dev.F90
index 056a149..d28f1a1 100644
--- a/src/specfem3D/compute_forces_viscoelastic_Dev.F90
+++ b/src/specfem3D/compute_forces_viscoelastic_Dev.F90
@@ -152,118 +152,73 @@
   logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
   integer :: ispec2D_moho_top, ispec2D_moho_bot
 
+  ! C-PML absorbing boundary conditions
+  logical :: PML_CONDITIONS
+
+  ! CPML adjoint
+  logical :: backward_simulation
+
 ! local parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
 
-  real(kind=CUSTOM_REAL) duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att
-  real(kind=CUSTOM_REAL) duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: B1_m1_m2_5points,B2_m1_m2_5points,B3_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points,C2_m1_m2_5points,C3_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: E1_m1_m2_5points,E2_m1_m2_5points,E3_m1_m2_5points
-
-  equivalence(dummyx_loc,B1_m1_m2_5points)
-  equivalence(dummyy_loc,B2_m1_m2_5points)
-  equivalence(dummyz_loc,B3_m1_m2_5points)
-  equivalence(tempx1,C1_m1_m2_5points)
-  equivalence(tempy1,C2_m1_m2_5points)
-  equivalence(tempz1,C3_m1_m2_5points)
-  equivalence(newtempx1,E1_m1_m2_5points)
-  equivalence(newtempy1,E2_m1_m2_5points)
-  equivalence(newtempz1,E3_m1_m2_5points)
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
 
+  ! attenuation
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1_att,tempx2_att,tempx3_att,tempy1_att,tempy2_att,tempy3_att,tempz1_att,tempz2_att,tempz3_att
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc_att,dummyy_loc_att,dummyz_loc_att
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: B1_m1_m2_5points_att,B2_m1_m2_5points_att,B3_m1_m2_5points_att
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points_att,C2_m1_m2_5points_att,C3_m1_m2_5points_att
-
-  equivalence(dummyx_loc_att,B1_m1_m2_5points_att)
-  equivalence(dummyy_loc_att,B2_m1_m2_5points_att)
-  equivalence(dummyz_loc_att,B3_m1_m2_5points_att)
-  equivalence(tempx1_att,C1_m1_m2_5points_att)
-  equivalence(tempy1_att,C2_m1_m2_5points_att)
-  equivalence(tempz1_att,C3_m1_m2_5points_att)
-
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
-    A1_mxm_m2_m1_5points,A2_mxm_m2_m1_5points,A3_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
-    C1_mxm_m2_m1_5points,C2_mxm_m2_m1_5points,C3_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
-    E1_mxm_m2_m1_5points,E2_mxm_m2_m1_5points,E3_mxm_m2_m1_5points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_5points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_5points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_5points)
-  equivalence(tempx3,C1_mxm_m2_m1_5points)
-  equivalence(tempy3,C2_mxm_m2_m1_5points)
-  equivalence(tempz3,C3_mxm_m2_m1_5points)
-  equivalence(newtempx3,E1_mxm_m2_m1_5points)
-  equivalence(newtempy3,E2_mxm_m2_m1_5points)
-  equivalence(newtempz3,E3_mxm_m2_m1_5points)
-
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
-    A1_mxm_m2_m1_5points_att,A2_mxm_m2_m1_5points_att,A3_mxm_m2_m1_5points_att
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
-    C1_mxm_m2_m1_5points_att,C2_mxm_m2_m1_5points_att,C3_mxm_m2_m1_5points_att
-
-  equivalence(dummyx_loc_att,A1_mxm_m2_m1_5points_att)
-  equivalence(dummyy_loc_att,A2_mxm_m2_m1_5points_att)
-  equivalence(dummyz_loc_att,A3_mxm_m2_m1_5points_att)
-  equivalence(tempx3_att,C1_mxm_m2_m1_5points_att)
-  equivalence(tempy3_att,C2_mxm_m2_m1_5points_att)
-  equivalence(tempz3_att,C3_mxm_m2_m1_5points_att)
-
-! C-PML absorbing boundary conditions
-  logical :: PML_CONDITIONS
-  integer :: ispec_CPML
-! CPML adjoint
-  logical :: backward_simulation
+  real(kind=CUSTOM_REAL) :: duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att
+  real(kind=CUSTOM_REAL) :: duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att
+  real(kind=CUSTOM_REAL) :: duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att
 
   ! local attenuation parameters
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_trace_loc,epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3, &
-                         R_trace_val1,R_trace_val2,R_trace_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
+    epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
 
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+  real(kind=CUSTOM_REAL) :: R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3, &
+                            R_trace_val1,R_trace_val2,R_trace_val3
+  real(kind=CUSTOM_REAL) :: factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) :: Sn,Snp1
+  real(kind=CUSTOM_REAL) :: templ
 
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) :: duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
 
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+  real(kind=CUSTOM_REAL) :: duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) :: duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
 
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+  real(kind=CUSTOM_REAL) :: sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
 
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
+  real(kind=CUSTOM_REAL) :: fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) :: lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) :: kappal
 
   ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+  real(kind=CUSTOM_REAL) :: c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                            c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
 
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
+  integer :: i_SLS,imodulo_N_SLS
+  integer :: ispec,iglob,ispec_p,num_elements
 
   real(kind=CUSTOM_REAL) :: eta
 
   real(kind=CUSTOM_REAL) :: epsilon_trace,epsilon_trace_squared
   real(kind=CUSTOM_REAL) :: mul_plus_A_over_4,lambdal_plus_B,lambdal_over_two_plus_B_over_2
 
-  integer :: i,j,k
+  ! C-PML absorbing boundary conditions
+  integer :: ispec_CPML
+
 #ifdef FORCE_VECTORIZATION
 ! this will (purposely) give out-of-bound array accesses if run through range checking,
 ! thus use only for production runs with no bound checking
   integer :: ijk
+#else
+  integer :: i,j,k
 #endif
 
   imodulo_N_SLS = mod(N_SLS,3)
@@ -358,269 +313,78 @@
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
-    ! call mxm_m1_m2_5points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-    do j=1,m2
-      do i=1,m1
-        C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
-                                hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
-                                hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
-                                hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
-                                hprime_xx(i,5)*B1_m1_m2_5points(5,j)
-        C2_m1_m2_5points(i,j) = hprime_xx(i,1)*B2_m1_m2_5points(1,j) + &
-                                hprime_xx(i,2)*B2_m1_m2_5points(2,j) + &
-                                hprime_xx(i,3)*B2_m1_m2_5points(3,j) + &
-                                hprime_xx(i,4)*B2_m1_m2_5points(4,j) + &
-                                hprime_xx(i,5)*B2_m1_m2_5points(5,j)
-        C3_m1_m2_5points(i,j) = hprime_xx(i,1)*B3_m1_m2_5points(1,j) + &
-                                hprime_xx(i,2)*B3_m1_m2_5points(2,j) + &
-                                hprime_xx(i,3)*B3_m1_m2_5points(3,j) + &
-                                hprime_xx(i,4)*B3_m1_m2_5points(4,j) + &
-                                hprime_xx(i,5)*B3_m1_m2_5points(5,j)
-      enddo
-    enddo
+
+    ! computes 1. matrix multiplication for tempx1,..
+    call mxm5_3comp_singleA(hprime_xx,m1,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1,m2)
 
 ! counts:
 ! + m1 * m2 * 3 * 9 = 5 * 25 * 3 * 9 = 3375 FLOP
 !
 ! + m1 * 5 float = 100 BYTE  (hprime_xx once, assuming B3_** in cache)
 
-    if (ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
-      ! temporary variables used for fixing attenuation in a consistent way
-      do j=1,m2
-        do i=1,m1
-          C1_m1_m2_5points_att(i,j) = C1_m1_m2_5points(i,j) + &
-               hprime_xx(i,1)*B1_m1_m2_5points_att(1,j) + &
-               hprime_xx(i,2)*B1_m1_m2_5points_att(2,j) + &
-               hprime_xx(i,3)*B1_m1_m2_5points_att(3,j) + &
-               hprime_xx(i,4)*B1_m1_m2_5points_att(4,j) + &
-               hprime_xx(i,5)*B1_m1_m2_5points_att(5,j)
-
-          C2_m1_m2_5points_att(i,j) = C2_m1_m2_5points(i,j) + &
-               hprime_xx(i,1)*B2_m1_m2_5points_att(1,j) + &
-               hprime_xx(i,2)*B2_m1_m2_5points_att(2,j) + &
-               hprime_xx(i,3)*B2_m1_m2_5points_att(3,j) + &
-               hprime_xx(i,4)*B2_m1_m2_5points_att(4,j) + &
-               hprime_xx(i,5)*B2_m1_m2_5points_att(5,j)
-
-          C3_m1_m2_5points_att(i,j) = C3_m1_m2_5points(i,j) + &
-               hprime_xx(i,1)*B3_m1_m2_5points_att(1,j) + &
-               hprime_xx(i,2)*B3_m1_m2_5points_att(2,j) + &
-               hprime_xx(i,3)*B3_m1_m2_5points_att(3,j) + &
-               hprime_xx(i,4)*B3_m1_m2_5points_att(4,j) + &
-               hprime_xx(i,5)*B3_m1_m2_5points_att(5,j)
-        enddo
-      enddo
-    else if (PML_CONDITIONS .and. (.not. backward_simulation) .and. NSPEC_CPML > 0) then
-      ! do not merge this second line with the first using an ".and." statement
-      ! because array is_CPML() is unallocated when PML_CONDITIONS is false
-      if (is_CPML(ispec)) then
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_5points_att(i,j) = &
-                 hprime_xx(i,1)*B1_m1_m2_5points_att(1,j) + &
-                 hprime_xx(i,2)*B1_m1_m2_5points_att(2,j) + &
-                 hprime_xx(i,3)*B1_m1_m2_5points_att(3,j) + &
-                 hprime_xx(i,4)*B1_m1_m2_5points_att(4,j) + &
-                 hprime_xx(i,5)*B1_m1_m2_5points_att(5,j)
-
-            C2_m1_m2_5points_att(i,j) = &
-                 hprime_xx(i,1)*B2_m1_m2_5points_att(1,j) + &
-                 hprime_xx(i,2)*B2_m1_m2_5points_att(2,j) + &
-                 hprime_xx(i,3)*B2_m1_m2_5points_att(3,j) + &
-                 hprime_xx(i,4)*B2_m1_m2_5points_att(4,j) + &
-                 hprime_xx(i,5)*B2_m1_m2_5points_att(5,j)
-
-            C3_m1_m2_5points_att(i,j) = &
-                 hprime_xx(i,1)*B3_m1_m2_5points_att(1,j) + &
-                 hprime_xx(i,2)*B3_m1_m2_5points_att(2,j) + &
-                 hprime_xx(i,3)*B3_m1_m2_5points_att(3,j) + &
-                 hprime_xx(i,4)*B3_m1_m2_5points_att(4,j) + &
-                 hprime_xx(i,5)*B3_m1_m2_5points_att(5,j)
-          enddo
-        enddo
-      endif
-    endif
-
-    !   call mxm_m1_m1_5points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-    !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-    do j=1,m1
-      do i=1,m1
-        ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-        do k = 1,NGLLX
-          tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                          dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                          dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                          dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                          dummyx_loc(i,5,k)*hprime_xxT(5,j)
-          tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                          dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                          dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                          dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                          dummyy_loc(i,5,k)*hprime_xxT(5,j)
-          tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                          dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                          dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                          dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                          dummyz_loc(i,5,k)*hprime_xxT(5,j)
-        enddo
-      enddo
-    enddo
-
-
+    ! computes 2. matrix multiplication for tempx2,..
+    call mxm5_3comp_3dmat_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m1,hprime_xxT,m1,tempx2,tempy2,tempz2,NGLLX)
 
 ! counts:
 ! + m1 * m1 * NGLLX * 3 * 9 = 5 * 5 * 5 * 3 * 9 = 3375 FLOP
 !
 ! + m1 * 5 float = 100 BYTE  (hprime_xxT once, assuming dummy*_** in cache)
 
-
-    if (ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
-      ! temporary variables used for fixing attenuation in a consistent way
-      do j=1,m1
-        do i=1,m1
-          ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-          do k = 1,NGLLX
-            tempx2_att(i,j,k) = tempx2(i,j,k) + &
-                 dummyx_loc_att(i,1,k)*hprime_xxT(1,j) + &
-                 dummyx_loc_att(i,2,k)*hprime_xxT(2,j) + &
-                 dummyx_loc_att(i,3,k)*hprime_xxT(3,j) + &
-                 dummyx_loc_att(i,4,k)*hprime_xxT(4,j) + &
-                 dummyx_loc_att(i,5,k)*hprime_xxT(5,j)
-
-            tempy2_att(i,j,k) = tempy2(i,j,k) + &
-                 dummyy_loc_att(i,1,k)*hprime_xxT(1,j) + &
-                 dummyy_loc_att(i,2,k)*hprime_xxT(2,j) + &
-                 dummyy_loc_att(i,3,k)*hprime_xxT(3,j) + &
-                 dummyy_loc_att(i,4,k)*hprime_xxT(4,j) + &
-                 dummyy_loc_att(i,5,k)*hprime_xxT(5,j)
-
-            tempz2_att(i,j,k) = tempz2(i,j,k) + &
-                 dummyz_loc_att(i,1,k)*hprime_xxT(1,j) + &
-                 dummyz_loc_att(i,2,k)*hprime_xxT(2,j) + &
-                 dummyz_loc_att(i,3,k)*hprime_xxT(3,j) + &
-                 dummyz_loc_att(i,4,k)*hprime_xxT(4,j) + &
-                 dummyz_loc_att(i,5,k)*hprime_xxT(5,j)
-          enddo
-        enddo
-      enddo
-    else if (PML_CONDITIONS .and. (.not. backward_simulation) .and. NSPEC_CPML > 0) then
-      ! do not merge this second line with the first using an ".and." statement
-      ! because array is_CPML() is unallocated when PML_CONDITIONS is false
-      ! temporary variables used for fixing attenuation in a consistent way
-      if (is_CPML(ispec)) then
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2_att(i,j,k) = &
-                   dummyx_loc_att(i,1,k)*hprime_xxT(1,j) + &
-                   dummyx_loc_att(i,2,k)*hprime_xxT(2,j) + &
-                   dummyx_loc_att(i,3,k)*hprime_xxT(3,j) + &
-                   dummyx_loc_att(i,4,k)*hprime_xxT(4,j) + &
-                   dummyx_loc_att(i,5,k)*hprime_xxT(5,j)
-
-              tempy2_att(i,j,k) = &
-                   dummyy_loc_att(i,1,k)*hprime_xxT(1,j) + &
-                   dummyy_loc_att(i,2,k)*hprime_xxT(2,j) + &
-                   dummyy_loc_att(i,3,k)*hprime_xxT(3,j) + &
-                   dummyy_loc_att(i,4,k)*hprime_xxT(4,j) + &
-                   dummyy_loc_att(i,5,k)*hprime_xxT(5,j)
-
-              tempz2_att(i,j,k) = &
-                   dummyz_loc_att(i,1,k)*hprime_xxT(1,j) + &
-                   dummyz_loc_att(i,2,k)*hprime_xxT(2,j) + &
-                   dummyz_loc_att(i,3,k)*hprime_xxT(3,j) + &
-                   dummyz_loc_att(i,4,k)*hprime_xxT(4,j) + &
-                   dummyz_loc_att(i,5,k)*hprime_xxT(5,j)
-            enddo
-          enddo
-        enddo
-      endif
-    endif
-
-    ! call mxm_m2_m1_5points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-    do j=1,m1
-      do i=1,m2
-        C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                    A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                    A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                    A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                    A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-        C2_mxm_m2_m1_5points(i,j) = A2_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                    A2_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                    A2_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                    A2_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                    A2_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-        C3_mxm_m2_m1_5points(i,j) = A3_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                    A3_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                    A3_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                    A3_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                    A3_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-      enddo
-    enddo
+    ! computes 3. matrix multiplication for tempx1,..
+    call mxm5_3comp_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m2,hprime_xxT,tempx3,tempy3,tempz3,m1)
 
 ! counts:
 ! + m1 * m2 * 3 * 9 = 5 * 25 * 3 * 9 = 3375 FLOP
 !
 ! + 0 BYTE  (assuming A3_**, hprime_xxT in cache)
 
+
     if (ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
       ! temporary variables used for fixing attenuation in a consistent way
-      do j=1,m1
-        do i=1,m2
-          C1_mxm_m2_m1_5points_att(i,j) = C1_mxm_m2_m1_5points(i,j) + &
-               A1_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
-               A1_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
-               A1_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
-               A1_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
-               A1_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
-
-          C2_mxm_m2_m1_5points_att(i,j) = C2_mxm_m2_m1_5points(i,j) + &
-               A2_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
-               A2_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
-               A2_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
-               A2_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
-               A2_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
-
-          C3_mxm_m2_m1_5points_att(i,j) = C3_mxm_m2_m1_5points(i,j) + &
-               A3_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
-               A3_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
-               A3_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
-               A3_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
-               A3_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
-        enddo
-      enddo
+
+      ! computes 1. matrix multiplication for tempx1,..
+      call mxm5_3comp_singleA(hprime_xx,m1,dummyx_loc_att,dummyy_loc_att,dummyz_loc_att, &
+                              tempx1_att,tempy1_att,tempz1_att,m2)
+
+      tempx1_att(:,:,:) = tempx1_att(:,:,:) + tempx1(:,:,:)
+      tempy1_att(:,:,:) = tempy1_att(:,:,:) + tempy1(:,:,:)
+      tempz1_att(:,:,:) = tempz1_att(:,:,:) + tempz1(:,:,:)
+
+      ! computes 2. matrix multiplication for tempx2,..
+      call mxm5_3comp_3dmat_singleB(dummyx_loc_att,dummyy_loc_att,dummyz_loc_att,m1,hprime_xxT,m1, &
+                                    tempx2_att,tempy2_att,tempz2_att,NGLLX)
+
+      tempx2_att(:,:,:) = tempx2_att(:,:,:) + tempx2(:,:,:)
+      tempy2_att(:,:,:) = tempy2_att(:,:,:) + tempy2(:,:,:)
+      tempz2_att(:,:,:) = tempz2_att(:,:,:) + tempz2(:,:,:)
+
+      ! computes 3. matrix multiplication for tempx1,..
+      call mxm5_3comp_singleB(dummyx_loc_att,dummyy_loc_att,dummyz_loc_att,m2,hprime_xxT, &
+                              tempx3_att,tempy3_att,tempz3_att,m1)
+
+      tempx3_att(:,:,:) = tempx3_att(:,:,:) + tempx3(:,:,:)
+      tempy3_att(:,:,:) = tempy3_att(:,:,:) + tempy3(:,:,:)
+      tempz3_att(:,:,:) = tempz3_att(:,:,:) + tempz3(:,:,:)
+
     else if (PML_CONDITIONS .and. (.not. backward_simulation) .and. NSPEC_CPML > 0) then
       ! do not merge this second line with the first using an ".and." statement
       ! because array is_CPML() is unallocated when PML_CONDITIONS is false
       if (is_CPML(ispec)) then
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_5points_att(i,j) = &
-                 A1_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
-                 A1_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
-                 A1_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
-                 A1_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
-                 A1_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
-
-            C2_mxm_m2_m1_5points_att(i,j) = &
-                 A2_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
-                 A2_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
-                 A2_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
-                 A2_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
-                 A2_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
-
-            C3_mxm_m2_m1_5points_att(i,j) = &
-                 A3_mxm_m2_m1_5points_att(i,1)*hprime_xxT(1,j) + &
-                 A3_mxm_m2_m1_5points_att(i,2)*hprime_xxT(2,j) + &
-                 A3_mxm_m2_m1_5points_att(i,3)*hprime_xxT(3,j) + &
-                 A3_mxm_m2_m1_5points_att(i,4)*hprime_xxT(4,j) + &
-                 A3_mxm_m2_m1_5points_att(i,5)*hprime_xxT(5,j)
-          enddo
-        enddo
+        ! computes 1. matrix multiplication for tempx1,..
+        call mxm5_3comp_singleA(hprime_xx,m1,dummyx_loc_att,dummyy_loc_att,dummyz_loc_att, &
+                                tempx1_att,tempy1_att,tempz1_att,m2)
+        ! computes 2. matrix multiplication for tempx2,..
+        call mxm5_3comp_3dmat_singleB(dummyx_loc_att,dummyy_loc_att,dummyz_loc_att,m1,hprime_xxT,m1, &
+                                      tempx2_att,tempy2_att,tempz2_att,NGLLX)
+        ! computes 3. matrix multiplication for tempx1,..
+        call mxm5_3comp_singleB(dummyx_loc_att,dummyy_loc_att,dummyz_loc_att,m2,hprime_xxT, &
+                                tempx3_att,tempy3_att,tempz3_att,m1)
       endif
     endif
 
+    !
+    ! computes either isotropic or fully anisotropic (visco-)elastic elements
+    !
     DO_LOOP_IJK
 
       ! get derivatives of ux, uy and uz with respect to x, y and z
@@ -1042,7 +806,9 @@
       if (is_CPML(ispec)) then
         ispec_CPML = spec_to_CPML(ispec)
         ! sets C-PML elastic memory variables to compute stress sigma and form dot product with test vector
-        call pml_compute_memory_variables_elastic(ispec,ispec_CPML,tempx1,tempy1,tempz1,tempx2,tempy2,tempz2, &
+        call pml_compute_memory_variables_elastic(ispec,ispec_CPML, &
+                                   tempx1,tempy1,tempz1, &
+                                   tempx2,tempy2,tempz2, &
                                    tempx3,tempy3,tempz3, &
                                    rmemory_dux_dxl_x, rmemory_duy_dyl_x, rmemory_duz_dzl_x, &
                                    rmemory_dux_dyl_x, rmemory_dux_dzl_x, rmemory_duz_dxl_x, rmemory_duy_dxl_x, &
@@ -1059,84 +825,32 @@
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
-    do j=1,m2
-      do i=1,m1
-        E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C1_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C1_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C1_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C1_m1_m2_5points(5,j)
-        E2_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C2_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C2_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C2_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C2_m1_m2_5points(5,j)
-        E3_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C3_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C3_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C3_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C3_m1_m2_5points(5,j)
-      enddo
-    enddo
+
+    ! computes 1. matrix multiplication for newtempx1,..
+    call mxm5_3comp_singleA(hprimewgll_xxT,m1,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1,m2)
 
 ! counts:
 ! + m1 * m2 * 3 * 9 = 3375 FLOP
 !
 ! + m1 * 5 float = 100 BYTE (hprimewgll_xxT once, assumes E3*, C1* in cache)
 
-    do i=1,m1
-      do j=1,m1
-        ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-        do k = 1,NGLLX
-          newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempx2(i,5,k)*hprimewgll_xx(5,j)
-          newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempy2(i,5,k)*hprimewgll_xx(5,j)
-          newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempz2(i,5,k)*hprimewgll_xx(5,j)
-        enddo
-      enddo
-    enddo
+    ! computes 2. matrix multiplication for tempx2,..
+    call mxm5_3comp_3dmat_singleB(tempx2,tempy2,tempz2,m1,hprimewgll_xx,m1,newtempx2,newtempy2,newtempz2,NGLLX)
 
 ! counts:
 ! + m1 * m1 * NGLLX * 3 * 9 = 3375 FLOP
 !
 ! + m1 * 5 float = 100 BYTE (hprimewgll_xx once, assumes E3*, C1* in cache)
 
-    do j=1,m1
-      do i=1,m2
-        E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-        E2_mxm_m2_m1_5points(i,j) = C2_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C2_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C2_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C2_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C2_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-        E3_mxm_m2_m1_5points(i,j) = C3_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C3_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C3_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C3_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C3_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-      enddo
-    enddo
+    ! computes 3. matrix multiplication for newtempx3,..
+    call mxm5_3comp_singleB(tempx3,tempy3,tempz3,m2,hprimewgll_xx,newtempx3,newtempy3,newtempz3,m1)
 
 ! counts:
 ! + m1 * m2 * 3 * 9 = 3375 FLOP
 !
 ! + 0 BYTE (assumes E1*, C1*, hprime* in cache)
 
+    ! sums contributions
     DO_LOOP_IJK
       fac1 = wgllwgll_yz_3D(INDEX_IJK)
       fac2 = wgllwgll_xz_3D(INDEX_IJK)
@@ -1253,5 +967,154 @@
 
   enddo  ! spectral element loop
 
+  contains
+
+!--------------------------------------------------------------------------------------------
+!
+! matrix-matrix multiplications
+!
+! subroutines adapted from Deville, Fischer and Mund, High-order methods
+! for incompressible fluid flow, Cambridge University Press (2002),
+! pages 386 and 389 and Figure 8.3.1
+!
+!--------------------------------------------------------------------------------------------
+!
+! note: the matrix-matrix multiplications are used for very small matrices ( 5 x 5 x 5 elements);
+!       thus, calling external optimized libraries for these multiplications are in general slower
+!
+! please leave the routines here to help compilers inlining the code
+
+  subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays
+
+  use constants,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5),intent(in) :: A
+  real(kind=CUSTOM_REAL),dimension(5,n3),intent(in) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j = 1,n3
+    do i = 1,n1
+      C1(i,j) =  A(i,1) * B1(1,j) &
+               + A(i,2) * B1(2,j) &
+               + A(i,3) * B1(3,j) &
+               + A(i,4) * B1(4,j) &
+               + A(i,5) * B1(5,j)
+
+      C2(i,j) =  A(i,1) * B2(1,j) &
+               + A(i,2) * B2(2,j) &
+               + A(i,3) * B2(3,j) &
+               + A(i,4) * B2(4,j) &
+               + A(i,5) * B2(5,j)
+
+      C3(i,j) =  A(i,1) * B3(1,j) &
+               + A(i,2) * B3(2,j) &
+               + A(i,3) * B3(3,j) &
+               + A(i,4) * B3(4,j) &
+               + A(i,5) * B3(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_singleA
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays
+
+  use constants,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5),intent(in) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(5,n3),intent(in) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j = 1,n3
+    do i = 1,n1
+      C1(i,j) =  A1(i,1) * B(1,j) &
+               + A1(i,2) * B(2,j) &
+               + A1(i,3) * B(3,j) &
+               + A1(i,4) * B(4,j) &
+               + A1(i,5) * B(5,j)
+
+      C2(i,j) =  A2(i,1) * B(1,j) &
+               + A2(i,2) * B(2,j) &
+               + A2(i,3) * B(3,j) &
+               + A2(i,4) * B(4,j) &
+               + A2(i,5) * B(5,j)
+
+      C3(i,j) =  A3(i,1) * B(1,j) &
+               + A3(i,2) * B(2,j) &
+               + A3(i,3) * B(3,j) &
+               + A3(i,4) * B(4,j) &
+               + A3(i,5) * B(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_singleB
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 3-dimensional arrays (5,5,5), same B matrix for all 3 component arrays
+
+  use constants,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n2,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5,n3),intent(in) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(5,n2),intent(in) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j,k
+
+  ! matrix-matrix multiplication
+  do j = 1,n2
+    do i = 1,n1
+      ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+      do k = 1,n3
+        C1(i,j,k) =  A1(i,1,k) * B(1,j) &
+                   + A1(i,2,k) * B(2,j) &
+                   + A1(i,3,k) * B(3,j) &
+                   + A1(i,4,k) * B(4,j) &
+                   + A1(i,5,k) * B(5,j)
+
+        C2(i,j,k) =  A2(i,1,k) * B(1,j) &
+                   + A2(i,2,k) * B(2,j) &
+                   + A2(i,3,k) * B(3,j) &
+                   + A2(i,4,k) * B(4,j) &
+                   + A2(i,5,k) * B(5,j)
+
+        C3(i,j,k) =  A3(i,1,k) * B(1,j) &
+                   + A3(i,2,k) * B(2,j) &
+                   + A3(i,3,k) * B(3,j) &
+                   + A3(i,4,k) * B(4,j) &
+                   + A3(i,5,k) * B(5,j)
+      enddo
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_3dmat_singleB
+
   end subroutine compute_forces_viscoelastic_Dev_5p