[cig-commits] r22977 - in seismo/3D/SPECFEM3D_GLOBE/trunk: . src/cuda src/meshfem3D src/shared src/specfem3D utils/attenuation

danielpeter at geodynamics.org danielpeter at geodynamics.org
Mon Nov 4 00:16:43 PST 2013


Author: danielpeter
Date: 2013-11-04 00:16:42 -0800 (Mon, 04 Nov 2013)
New Revision: 22977

Modified:
   seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/create_central_cube.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/create_regions_mesh.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/get_model.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/get_perm_color.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_color_perm.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/save_header_file.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_boundary_kernel.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_att_memory.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_strain.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_noDev.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_noDev.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_outer_core_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_kernels.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_seismograms.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/read_attenuation_adios.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D_par.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/update_displacement_LDDRK.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/update_displacement_Newmark.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/utils/attenuation/attenuation_output.f90
Log:
switches indexing of arrays epsilondev_loc, rho_s_H, sum_terms, R_xx,.. and factor_common for better vectorization; removes equivalence statements, uses Deville subroutine calls instead; updates gradient calculations for acoustic kernels in outer core

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in	2013-11-04 08:16:42 UTC (rev 22977)
@@ -201,11 +201,11 @@
 ifdef CLEAN
 clean:
 	@echo "cleaning by CLEAN defined"
-	-rm -f $(foreach dir, $(CLEAN), $($(dir)_OBJECTS) $($(dir)_MODULES) $($(dir)_SHARED_OBJECTS) $($(dir)_TARGETS))
+	-rm -f $(foreach dir, $(CLEAN), $($(dir)_OBJECTS) $($(dir)_MODULES) $($(dir)_SHARED_OBJECTS) $($(dir)_TARGETS)) $O/*
 else
 clean:
 	@echo "cleaning by CLEAN not defined"
-	-rm -f $(foreach dir, $(SUBDIRS), $($(dir)_OBJECTS) $($(dir)_MODULES) $($(dir)_TARGETS))
+	-rm -f $(foreach dir, $(SUBDIRS), $($(dir)_OBJECTS) $($(dir)_MODULES) $($(dir)_TARGETS)) $O/*
 endif
 
 help:

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu	2013-11-04 08:16:42 UTC (rev 22977)
@@ -94,20 +94,27 @@
                                               realw* sigma_yz) {
 
   realw R_xx_val,R_yy_val;
-
+  int offset_sls;
+  
   for(int i_sls = 0; i_sls < N_SLS; i_sls++){
     // index
-    // note: index for R_xx,.. here is (i_sls,i,j,k,ispec) and not (i,j,k,ispec,i_sls) as in local version
-    //          local version: offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
-    R_xx_val = R_xx[i_sls + N_SLS*(tx + NGLL3*working_element)];
-    R_yy_val = R_yy[i_sls + N_SLS*(tx + NGLL3*working_element)];
+    // note: index for R_xx,.. here is (i,j,k,i_sls,ispec) and not (i,j,k,ispec,i_sls) as in local version
+    //       see local version: offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
+    // indexing examples:
+    //   (i,j,k,ispec,i_sls) -> offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls)
+    //   (i_sls,i,j,k,ispec) -> offset_sls = i_sls + N_SLS*(tx + NGLL3*working_element)
+    //   (i,j,k,i_sls,ispec) -> offset_sls = tx + NGLL3*(i_sls + N_SLS*working_element)
+    offset_sls = tx + NGLL3*(i_sls + N_SLS*working_element);
+    
+    R_xx_val = R_xx[offset_sls];
+    R_yy_val = R_yy[offset_sls];
 
     *sigma_xx = *sigma_xx - R_xx_val;
     *sigma_yy = *sigma_yy - R_yy_val;
     *sigma_zz = *sigma_zz + R_xx_val + R_yy_val;
-    *sigma_xy = *sigma_xy - R_xy[i_sls + N_SLS*(tx + NGLL3*working_element)];
-    *sigma_xz = *sigma_xz - R_xz[i_sls + N_SLS*(tx + NGLL3*working_element)];
-    *sigma_yz = *sigma_yz - R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)];
+    *sigma_xy = *sigma_xy - R_xy[offset_sls];
+    *sigma_xz = *sigma_xz - R_xz[offset_sls];
+    *sigma_yz = *sigma_yz - R_yz[offset_sls];
   }
 }
 
@@ -131,7 +138,8 @@
   realw fac;
   realw alphaval_loc,betaval_loc,gammaval_loc;
   realw factor_loc,Sn,Snp1;
-
+  int offset_sls;
+  
   // shear moduli for common factor (only Q_mu attenuation)
   if( ANISOTROPY ){
     fac = d_c44store[tx + NGLL3_PADDED * working_element];
@@ -142,16 +150,19 @@
   // use Runge-Kutta scheme to march in time
   for(int i_sls = 0; i_sls < N_SLS; i_sls++){
     // indices
-    // note: index for R_xx,... here is (i_sls,i,j,k,ispec) and not (i,j,k,ispec,i_sls) as in local version
-    //          local version: offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
+    // note: index for R_xx,... here is (i,j,k,i_sls,ispec) and not (i,j,k,ispec,i_sls) as in local version
     //
-    // either mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+    // index:
+    // (i,j,k,i_sls,ispec) -> offset_sls = tx + NGLL3*(i_sls + N_SLS*working_element)
+    offset_sls = tx + NGLL3*(i_sls + N_SLS*working_element);
+
+    // either mustore(i,j,k,ispec) * factor_common(i,j,k,i_sls,ispec)
     // or       factor_common(i_sls,:,:,:,ispec) * c44store(:,:,:,ispec)
     if( USE_3D_ATTENUATION_ARRAYS ){
       // array dimension: factor_common(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC)
-      factor_loc = fac * factor_common[i_sls + N_SLS*(tx + NGLL3*working_element)];
+      factor_loc = fac * factor_common[offset_sls];
     }else{
-      // array dimension: factor_common(N_SLS,1,1,1,NSPEC)
+      // array dimension: factor_common(1,1,1,N_SLS,NSPEC)
       factor_loc = fac * factor_common[i_sls + N_SLS*working_element];
     }
 
@@ -159,36 +170,33 @@
     betaval_loc = betaval[i_sls];
     gammaval_loc = gammaval[i_sls];
 
+    
     // term in xx
     Sn   = factor_loc * epsilondev_xx[tx + NGLL3 * working_element]; //(i,j,k,ispec)
     Snp1   = factor_loc * epsilondev_xx_loc; //(i,j,k)
-    R_xx[i_sls + N_SLS*(tx + NGLL3*working_element)] =
-      alphaval_loc * R_xx[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_xx[offset_sls] = alphaval_loc * R_xx[offset_sls] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in yy
     Sn   = factor_loc * epsilondev_yy[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_yy_loc;
-    R_yy[i_sls + N_SLS*(tx + NGLL3*working_element)] =
-      alphaval_loc * R_yy[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_yy[offset_sls] = alphaval_loc * R_yy[offset_sls] + betaval_loc * Sn + gammaval_loc * Snp1;
+
     // term in zz not computed since zero trace
 
     // term in xy
     Sn   = factor_loc * epsilondev_xy[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_xy_loc;
-    R_xy[i_sls + N_SLS*(tx + NGLL3*working_element)] =
-      alphaval_loc * R_xy[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_xy[offset_sls] = alphaval_loc * R_xy[offset_sls] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in xz
     Sn   = factor_loc * epsilondev_xz[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_xz_loc;
-    R_xz[i_sls + N_SLS*(tx + NGLL3*working_element)] =
-      alphaval_loc * R_xz[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_xz[offset_sls] = alphaval_loc * R_xz[offset_sls] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in yz
     Sn   = factor_loc * epsilondev_yz[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_yz_loc;
-    R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)] =
-      alphaval_loc * R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_yz[offset_sls] = alphaval_loc * R_yz[offset_sls] + betaval_loc * Sn + gammaval_loc * Snp1;
   }
 }
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu	2013-11-04 08:16:42 UTC (rev 22977)
@@ -68,24 +68,28 @@
                                              realw* sigma_xz,
                                              realw* sigma_yz) {
 
-  int offset;
   realw R_xx_val,R_yy_val;
+  int offset_sls;
 
   for(int i_sls = 0; i_sls < N_SLS; i_sls++){
     // index
     // note: index for R_xx,.. here is (i_sls,i,j,k,ispec) and not (i,j,k,ispec,i_sls) as in local version
     //          local version: offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
-    offset = i_sls + N_SLS*(tx + NGLL3*working_element);
+    // indexing examples:
+    //   (i,j,k,ispec,i_sls) -> offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls)
+    //   (i_sls,i,j,k,ispec) -> offset_sls = i_sls + N_SLS*(tx + NGLL3*working_element)
+    //   (i,j,k,i_sls,ispec) -> offset_sls = tx + NGLL3*(i_sls + N_SLS*working_element)
+    offset_sls = tx + NGLL3*(i_sls + N_SLS*working_element);
 
-    R_xx_val = R_xx[offset];
-    R_yy_val = R_yy[offset];
+    R_xx_val = R_xx[offset_sls];
+    R_yy_val = R_yy[offset_sls];
 
     *sigma_xx = *sigma_xx - R_xx_val;
     *sigma_yy = *sigma_yy - R_yy_val;
     *sigma_zz = *sigma_zz + R_xx_val + R_yy_val;
-    *sigma_xy = *sigma_xy - R_xy[offset];
-    *sigma_xz = *sigma_xz - R_xz[offset];
-    *sigma_yz = *sigma_yz - R_yz[offset];
+    *sigma_xy = *sigma_xy - R_xy[offset_sls];
+    *sigma_xz = *sigma_xz - R_xz[offset_sls];
+    *sigma_yz = *sigma_yz - R_yz[offset_sls];
   }
 }
 
@@ -105,26 +109,26 @@
                                               int USE_3D_ATTENUATION_ARRAYS
                                               ){
 
-  int offset;
   realw mul;
   realw alphaval_loc,betaval_loc,gammaval_loc;
   realw factor_loc,Sn,Snp1;
+  int offset_sls;
 
   mul = d_muv[tx + NGLL3_PADDED * working_element];
 
   // use Runge-Kutta scheme to march in time
   for(int i_sls = 0; i_sls < N_SLS; i_sls++){
-
     // indices
-    // note: index for R_xx,... here is (i_sls,i,j,k,ispec) and not (i,j,k,ispec,i_sls) as in local version
-    //          local version: offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
-    // index for (i_sls,i,j,k,ispec)
-    offset = i_sls + N_SLS*(tx + NGLL3*working_element);
+    // note: index for R_xx,... here is (i,j,k,i_sls,ispec) and not (i,j,k,ispec,i_sls) as in local version
+    // index for R_xx(i,j,k,i_sls,ispec),..
+    offset_sls = tx + NGLL3*(i_sls + N_SLS*working_element);
 
     if( USE_3D_ATTENUATION_ARRAYS ){
-      factor_loc = mul * factor_common[offset]; //mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+      //mustore(i,j,k,ispec) * factor_common(i,j,k,i_sls,ispec)
+      factor_loc = mul * factor_common[offset_sls];
     }else{
-      factor_loc = mul * factor_common[i_sls + N_SLS*working_element]; //mustore(i,j,k,ispec) * factor_common(i_sls,1,1,1,ispec)
+      //mustore(i,j,k,ispec) * factor_common(1,1,1,i_sls,ispec)
+      factor_loc = mul * factor_common[i_sls + N_SLS*working_element];
     }
     alphaval_loc = alphaval[i_sls]; // (i_sls)
     betaval_loc = betaval[i_sls];
@@ -133,28 +137,28 @@
     // term in xx
     Sn   = factor_loc * epsilondev_xx[tx + NGLL3 * working_element]; //(i,j,k,ispec)
     Snp1   = factor_loc * epsilondev_xx_loc; //(i,j,k)
-    R_xx[offset] = alphaval_loc * R_xx[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_xx[offset_sls] = alphaval_loc * R_xx[offset_sls] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in yy
     Sn   = factor_loc * epsilondev_yy[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_yy_loc;
-    R_yy[offset] = alphaval_loc * R_yy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_yy[offset_sls] = alphaval_loc * R_yy[offset_sls] + betaval_loc * Sn + gammaval_loc * Snp1;
     // term in zz not computed since zero trace
 
     // term in xy
     Sn   = factor_loc * epsilondev_xy[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_xy_loc;
-    R_xy[offset] = alphaval_loc * R_xy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_xy[offset_sls] = alphaval_loc * R_xy[offset_sls] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in xz
     Sn   = factor_loc * epsilondev_xz[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_xz_loc;
-    R_xz[offset] = alphaval_loc * R_xz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_xz[offset_sls] = alphaval_loc * R_xz[offset_sls] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in yz
     Sn   = factor_loc * epsilondev_yz[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_yz_loc;
-    R_yz[offset] = alphaval_loc * R_yz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_yz[offset_sls] = alphaval_loc * R_yz[offset_sls] + betaval_loc * Sn + gammaval_loc * Snp1;
   }
 }
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/create_central_cube.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/create_central_cube.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/create_central_cube.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -219,14 +219,13 @@
 
         ! determine where we cut the central cube to share it between CHUNK_AB & CHUNK_AB_ANTIPODE
         ! in the case of mod(NPROC_XI,2)/=0, the cut is asymetric and the bigger part is for CHUNK_AB
+        nz_inf_limit = nz_central_cube
         if (mod(NPROC_XI,2)/=0 .and. NPROC_XI > 1) then
           if (ichunk == CHUNK_AB) then
             nz_inf_limit = ((nz_central_cube*2)/NPROC_XI)*floor(NPROC_XI/2.d0)
           else if (ichunk == CHUNK_AB_ANTIPODE) then
             nz_inf_limit = ((nz_central_cube*2)/NPROC_XI)*ceiling(NPROC_XI/2.d0)
           endif
-        else
-          nz_inf_limit = nz_central_cube
         endif
 
         if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/create_regions_mesh.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/create_regions_mesh.F90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/create_regions_mesh.F90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -512,7 +512,7 @@
     nspec_att = 1
   endif
   allocate(Qmu_store(ATT1,ATT2,ATT3,nspec_att), &
-          tau_e_store(N_SLS,ATT1,ATT2,ATT3,nspec_att),stat=ier)
+          tau_e_store(ATT1,ATT2,ATT3,N_SLS,nspec_att),stat=ier)
   if(ier /= 0) stop 'error in allocate 1'
 
   Qmu_store(:,:,:,:) = 0.0; tau_e_store(:,:,:,:,:) = 0.0

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/get_model.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/get_model.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/get_model.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -80,7 +80,7 @@
   double precision :: rho,dvp
   double precision :: vpv,vph,vsv,vsh,eta_aniso
   double precision :: r,r_prem,moho
-  integer :: i,j,k
+  integer :: i,j,k,i_sls
 
   ! loops over all gll points for this spectral element
   do k=1,NGLLZ
@@ -305,12 +305,16 @@
 
         if(ATTENUATION) then
           if(ATTENUATION_3D .or. ATTENUATION_1D_WITH_3D_STORAGE) then
-            tau_e_store(:,i,j,k,ispec) = tau_e(:)
+            do i_sls = 1,N_SLS
+              tau_e_store(i,j,k,i_sls,ispec) = tau_e(i_sls)
+            enddo
             Qmu_store(i,j,k,ispec)     = Qmu
           else
             ! store values from mid-point for whole element
             if( i == NGLLX/2 .and. j == NGLLY/2 .and. k == NGLLZ/2 ) then
-              tau_e_store(:,1,1,1,ispec) = tau_e(:)
+              do i_sls = 1,N_SLS
+                tau_e_store(1,1,1,i_sls,ispec) = tau_e(i_sls)
+              enddo
               Qmu_store(1,1,1,ispec)     = Qmu
             endif
           endif

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/get_perm_color.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/get_perm_color.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/get_perm_color.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -1154,7 +1154,7 @@
   integer, intent(in) :: nspec
   integer, intent(in), dimension(nspec) :: perm
 
-  real(kind=CUSTOM_REAL), intent(inout), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,nspec) :: &
+  real(kind=CUSTOM_REAL), intent(inout), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,nspec) :: &
     array_to_permute,temp_array
 
   integer :: old_ispec,new_ispec
@@ -1184,7 +1184,7 @@
   integer, intent(in) :: nspec
   integer, intent(in), dimension(nspec) :: perm
 
-  real(kind=CUSTOM_REAL), intent(inout), dimension(N_SLS,1,1,1,nspec) :: &
+  real(kind=CUSTOM_REAL), intent(inout), dimension(1,1,1,N_SLS,nspec) :: &
     array_to_permute,temp_array
 
   integer :: old_ispec,new_ispec

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_color_perm.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_color_perm.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_color_perm.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -860,13 +860,13 @@
   if (ATTENUATION) then
     if (ATTENUATION_3D .or. ATTENUATION_1D_WITH_3D_STORAGE) then
       allocate(temp_array_real(NGLLX,NGLLY,NGLLZ,nspec))
-      allocate(temp_array_real_sls(N_SLS,NGLLX,NGLLY,NGLLZ,nspec))
+      allocate(temp_array_real_sls(NGLLX,NGLLY,NGLLZ,N_SLS,nspec))
       call permute_elements_real(Qmu_store,temp_array_real,perm,nspec)
       call permute_elements_real_sls(tau_e_store,temp_array_real_sls,perm,nspec)
       deallocate(temp_array_real,temp_array_real_sls)
     else
       allocate(temp_array_real(1,1,1,nspec))
-      allocate(temp_array_real_sls(N_SLS,1,1,1,nspec))
+      allocate(temp_array_real_sls(1,1,1,N_SLS,nspec))
       call permute_elements_real1(Qmu_store,temp_array_real,perm,nspec)
       call permute_elements_real_sls1(tau_e_store,temp_array_real_sls,perm,nspec)
       deallocate(temp_array_real,temp_array_real_sls)

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/save_header_file.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/save_header_file.F90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/save_header_file.F90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -590,8 +590,7 @@
   endif
   write(IOUT,*)
 
-!! DK DK Jul 2013: we need that for the part1_*.f90 and part2_*.f90 include files, which some compilers
-!! DK DK Jul 2013: refuse to preprocess even if we rename them *.F90
+  ! we use this vectorization flag for solver routines in files **.f90
 #ifdef FORCE_VECTORIZATION
   write(IOUT,*) 'logical, parameter :: FORCE_VECTORIZATION_VAL = .true.'
 #else

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_boundary_kernel.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_boundary_kernel.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_boundary_kernel.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -33,7 +33,6 @@
   use specfem_par_crustmantle
   use specfem_par_innercore
   use specfem_par_outercore
-  use specfem_par_movie
   implicit none
 
   ! local parameters
@@ -43,9 +42,12 @@
   integer:: ispec,i,j,k,l,iglob
   real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl
   real(kind=CUSTOM_REAL) :: tempx1l,tempx2l,tempx3l
+  real(kind=CUSTOM_REAL) :: gradx,grady,gradz
+  logical,dimension(NGLOB_OUTER_CORE) :: mask_ibool
 
   ! transfers wavefields onto CPU
   if( GPU_MODE ) then
+
     ! crust/mantle
     call transfer_accel_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,accel_crust_mantle,Mesh_pointer)
     call transfer_displ_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,displ_crust_mantle,Mesh_pointer)
@@ -59,187 +61,221 @@
     call transfer_displ_oc_from_device(NGLOB_OUTER_CORE,displ_outer_core,Mesh_pointer)
     call transfer_b_displ_oc_from_device(NGLOB_OUTER_CORE,b_displ_outer_core,Mesh_pointer)
 
-    ! pre-calculates gradients on CPU
+    ! pre-calculates gradients in outer core
+    ! note: for CPU, this is already done in compute_kernels_outer_core() routine
+
+    ! pre-calculates gradients
+    mask_ibool(:) = .false.
     do ispec = 1, NSPEC_OUTER_CORE
       do k = 1, NGLLZ
         do j = 1, NGLLY
           do i = 1, NGLLX
+
+            ! global index
             iglob = ibool_outer_core(i,j,k,ispec)
 
-            xixl = xix_outer_core(i,j,k,ispec)
-            xiyl = xiy_outer_core(i,j,k,ispec)
-            xizl = xiz_outer_core(i,j,k,ispec)
-            etaxl = etax_outer_core(i,j,k,ispec)
-            etayl = etay_outer_core(i,j,k,ispec)
-            etazl = etaz_outer_core(i,j,k,ispec)
-            gammaxl = gammax_outer_core(i,j,k,ispec)
-            gammayl = gammay_outer_core(i,j,k,ispec)
-            gammazl = gammaz_outer_core(i,j,k,ispec)
+            ! only calculates gradients once for shared nodes
+            if( .not. mask_ibool(iglob) ) then
 
-            ! calculates gradient grad(b_displ)
-            tempx1l = 0._CUSTOM_REAL
-            tempx2l = 0._CUSTOM_REAL
-            tempx3l = 0._CUSTOM_REAL
-            do l=1,NGLLX
-              tempx1l = tempx1l + b_displ_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
-            enddo
-            do l=1,NGLLY
-              tempx2l = tempx2l + b_displ_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
-            enddo
-            do l=1,NGLLZ
-              tempx3l = tempx3l + b_displ_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
-            enddo
-            b_vector_displ_outer_core(1,iglob) = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-            b_vector_displ_outer_core(2,iglob) = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-            b_vector_displ_outer_core(3,iglob) = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+              ! masks this global point
+              mask_ibool(iglob) = .true.
 
-            ! calculates gradient grad(accel)
-            tempx1l = 0._CUSTOM_REAL
-            tempx2l = 0._CUSTOM_REAL
-            tempx3l = 0._CUSTOM_REAL
-            do l=1,NGLLX
-              tempx1l = tempx1l + accel_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
-            enddo
-            do l=1,NGLLY
-              tempx2l = tempx2l + accel_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
-            enddo
-            do l=1,NGLLZ
-              tempx3l = tempx3l + accel_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
-            enddo
-            vector_accel_outer_core(1,iglob) = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-            vector_accel_outer_core(2,iglob) = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-            vector_accel_outer_core(3,iglob) = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+              xixl = xix_outer_core(i,j,k,ispec)
+              xiyl = xiy_outer_core(i,j,k,ispec)
+              xizl = xiz_outer_core(i,j,k,ispec)
+              etaxl = etax_outer_core(i,j,k,ispec)
+              etayl = etay_outer_core(i,j,k,ispec)
+              etazl = etaz_outer_core(i,j,k,ispec)
+              gammaxl = gammax_outer_core(i,j,k,ispec)
+              gammayl = gammay_outer_core(i,j,k,ispec)
+              gammazl = gammaz_outer_core(i,j,k,ispec)
 
-            ! calculates gradient grad(displ)
-            tempx1l = 0._CUSTOM_REAL
-            tempx2l = 0._CUSTOM_REAL
-            tempx3l = 0._CUSTOM_REAL
-            do l=1,NGLLX
-              tempx1l = tempx1l + displ_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
-            enddo
-            do l=1,NGLLY
-              tempx2l = tempx2l + displ_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
-            enddo
-            do l=1,NGLLZ
-              tempx3l = tempx3l + displ_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
-            enddo
-            vector_displ_outer_core(1,iglob) = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-            vector_displ_outer_core(2,iglob) = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-            vector_displ_outer_core(3,iglob) = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+              ! calculates gradient grad(b_displ)
+              tempx1l = 0._CUSTOM_REAL
+              tempx2l = 0._CUSTOM_REAL
+              tempx3l = 0._CUSTOM_REAL
+              do l=1,NGLLX
+                tempx1l = tempx1l + b_displ_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
+              enddo
+              do l=1,NGLLY
+                tempx2l = tempx2l + b_displ_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
+              enddo
+              do l=1,NGLLZ
+                tempx3l = tempx3l + b_displ_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
+              enddo
+              gradx = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+              grady = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+              gradz = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+
+              ! assigns and stores gradient on global nodes
+              b_vector_displ_outer_core(1,iglob) = gradx
+              b_vector_displ_outer_core(2,iglob) = grady
+              b_vector_displ_outer_core(3,iglob) = gradz
+
+              ! calculates gradient grad(accel)
+              tempx1l = 0._CUSTOM_REAL
+              tempx2l = 0._CUSTOM_REAL
+              tempx3l = 0._CUSTOM_REAL
+              do l=1,NGLLX
+                tempx1l = tempx1l + accel_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
+              enddo
+              do l=1,NGLLY
+                tempx2l = tempx2l + accel_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
+              enddo
+              do l=1,NGLLZ
+                tempx3l = tempx3l + accel_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
+              enddo
+              gradx = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+              grady = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+              gradz = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+
+              vector_accel_outer_core(1,iglob) = gradx
+              vector_accel_outer_core(2,iglob) = grady
+              vector_accel_outer_core(3,iglob) = gradz
+
+              ! calculates gradient grad(displ)
+              tempx1l = 0._CUSTOM_REAL
+              tempx2l = 0._CUSTOM_REAL
+              tempx3l = 0._CUSTOM_REAL
+              do l=1,NGLLX
+                tempx1l = tempx1l + displ_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
+              enddo
+              do l=1,NGLLY
+                tempx2l = tempx2l + displ_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
+              enddo
+              do l=1,NGLLZ
+                tempx3l = tempx3l + displ_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
+              enddo
+              gradx = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+              grady = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+              gradz = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+
+              vector_displ_outer_core(1,iglob) = gradx
+              vector_displ_outer_core(2,iglob) = grady
+              vector_displ_outer_core(3,iglob) = gradz
+            endif
+
           enddo
         enddo
       enddo
     enddo
-  endif
 
+  endif ! GPU_MODE
+
   ! updates kernels on CPU
   fluid_solid_boundary = .false.
   iregion_code = IREGION_CRUST_MANTLE
 
   ! Moho
   if (.not. SUPPRESS_CRUSTAL_MESH .and. HONOR_1D_SPHERICAL_MOHO) then
-    call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
-             b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-             ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-             xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-             etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-             gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-             rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-             kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-             c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-             c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-             c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-             c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-             c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-             c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-             k_top,ibelm_moho_top,normal_moho,moho_kl_top,fluid_solid_boundary,NSPEC2D_MOHO)
+    call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle,b_displ_crust_mantle, &
+                                       nspec_crust_mantle,iregion_code, &
+                                       ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+                                       xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+                                       etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+                                       gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+                                       hprime_xx,hprime_yy,hprime_zz, &
+                                       rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+                                       kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+                                       c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+                                       c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+                                       c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+                                       c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+                                       c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+                                       c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+                                       k_top,ibelm_moho_top,normal_moho,moho_kl_top,fluid_solid_boundary,NSPEC2D_MOHO)
 
-    call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
-             b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-             ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-             xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-             etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-             gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-             rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-             kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-             c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-             c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-             c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-             c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-             c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-             c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-             k_bot,ibelm_moho_bot,normal_moho,moho_kl_bot,fluid_solid_boundary,NSPEC2D_MOHO)
+    call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle,b_displ_crust_mantle, &
+                                       nspec_crust_mantle,iregion_code, &
+                                       ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+                                       xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+                                       etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+                                       gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+                                       hprime_xx,hprime_yy,hprime_zz, &
+                                       rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+                                       kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+                                       c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+                                       c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+                                       c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+                                       c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+                                       c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+                                       c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+                                       k_bot,ibelm_moho_bot,normal_moho,moho_kl_bot,fluid_solid_boundary,NSPEC2D_MOHO)
 
     moho_kl = moho_kl + (moho_kl_top - moho_kl_bot) * deltat
   endif
 
   ! 400
-  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
-             b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-             ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-             xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-             etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-             gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-             rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-             kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-             c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-             c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-             c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-             c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-             c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-             c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-             k_top,ibelm_400_top,normal_400,d400_kl_top,fluid_solid_boundary,NSPEC2D_400)
+  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle,b_displ_crust_mantle, &
+                                     nspec_crust_mantle,iregion_code, &
+                                     ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+                                     xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+                                     etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+                                     gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+                                     hprime_xx,hprime_yy,hprime_zz, &
+                                     rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+                                     kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+                                     c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+                                     c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+                                     c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+                                     c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+                                     c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+                                     c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+                                     k_top,ibelm_400_top,normal_400,d400_kl_top,fluid_solid_boundary,NSPEC2D_400)
 
-  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
-             b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-             ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-             xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-             etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-             gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-             rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-             kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-             c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-             c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-             c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-             c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-             c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-             c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-             k_bot,ibelm_400_bot,normal_400,d400_kl_bot,fluid_solid_boundary,NSPEC2D_400)
+  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle,b_displ_crust_mantle, &
+                                     nspec_crust_mantle,iregion_code, &
+                                     ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+                                     xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+                                     etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+                                     gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+                                     hprime_xx,hprime_yy,hprime_zz, &
+                                     rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+                                     kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+                                     c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+                                     c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+                                     c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+                                     c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+                                     c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+                                     c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+                                     k_bot,ibelm_400_bot,normal_400,d400_kl_bot,fluid_solid_boundary,NSPEC2D_400)
 
   d400_kl = d400_kl + (d400_kl_top - d400_kl_bot) * deltat
 
   ! 670
-  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
-             b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-             ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-             xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-             etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-             gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-             rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-             kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-             c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-             c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-             c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-             c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-             c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-             c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-             k_top,ibelm_670_top,normal_670,d670_kl_top,fluid_solid_boundary,NSPEC2D_670)
+  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle,b_displ_crust_mantle, &
+                                     nspec_crust_mantle,iregion_code, &
+                                     ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+                                     xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+                                     etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+                                     gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+                                     hprime_xx,hprime_yy,hprime_zz, &
+                                     rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+                                     kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+                                     c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+                                     c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+                                     c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+                                     c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+                                     c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+                                     c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+                                     k_top,ibelm_670_top,normal_670,d670_kl_top,fluid_solid_boundary,NSPEC2D_670)
 
-  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
-             b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-             ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-             xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-             etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-             gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-             rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-             kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-             c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-             c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-             c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-             c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-             c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-             c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-             k_bot,ibelm_670_bot,normal_670,d670_kl_bot,fluid_solid_boundary,NSPEC2D_670)
+  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle,b_displ_crust_mantle, &
+                                     nspec_crust_mantle,iregion_code, &
+                                     ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+                                     xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+                                     etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+                                     gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+                                     hprime_xx,hprime_yy,hprime_zz, &
+                                     rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+                                     kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+                                     c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+                                     c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+                                     c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+                                     c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+                                     c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+                                     c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+                                     k_bot,ibelm_670_bot,normal_670,d670_kl_bot,fluid_solid_boundary,NSPEC2D_670)
 
   d670_kl = d670_kl + (d670_kl_top - d670_kl_bot) * deltat
 
@@ -247,22 +283,23 @@
   fluid_solid_boundary = .true.
   iregion_code = IREGION_CRUST_MANTLE
 
-  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
-             b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-             ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-             xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-             etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-             gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-             rhostore_crust_mantle,kappavstore_crust_mantle, muvstore_crust_mantle, &
-             kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-             c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-             c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-             c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-             c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-             c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-             c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-             k_top,ibelm_bottom_crust_mantle,normal_top_outer_core, &
-             cmb_kl_top,fluid_solid_boundary,NSPEC2D_CMB)
+  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle,b_displ_crust_mantle, &
+                                     nspec_crust_mantle,iregion_code, &
+                                     ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+                                     xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+                                     etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+                                     gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+                                     hprime_xx,hprime_yy,hprime_zz, &
+                                     rhostore_crust_mantle,kappavstore_crust_mantle, muvstore_crust_mantle, &
+                                     kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+                                     c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+                                     c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+                                     c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+                                     c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+                                     c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+                                     c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+                                     k_top,ibelm_bottom_crust_mantle,normal_top_outer_core, &
+                                     cmb_kl_top,fluid_solid_boundary,NSPEC2D_CMB)
 
   iregion_code = IREGION_OUTER_CORE
 
@@ -270,43 +307,43 @@
   allocate(dummy_ispec_is_tiso(NSPEC_OUTER_CORE))
   dummy_ispec_is_tiso(:) = .false.
 
-  call compute_boundary_kernel_depth(vector_displ_outer_core,vector_accel_outer_core, &
-             b_vector_displ_outer_core,nspec_outer_core, &
-             iregion_code,ystore_outer_core,zstore_outer_core,ibool_outer_core,dummy_ispec_is_tiso, &
-             xix_outer_core,xiy_outer_core,xiz_outer_core, &
-             etax_outer_core,etay_outer_core,etaz_outer_core,&
-             gammax_outer_core,gammay_outer_core,gammaz_outer_core,hprime_xx,hprime_yy,hprime_zz, &
-             rhostore_outer_core,kappavstore_outer_core,dummy_array, &
-             dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array, &
-             k_bot,ibelm_top_outer_core,normal_top_outer_core, &
-             cmb_kl_bot,fluid_solid_boundary,NSPEC2D_CMB)
+  call compute_boundary_kernel_depth(vector_displ_outer_core,vector_accel_outer_core,b_vector_displ_outer_core, &
+                                     nspec_outer_core, &
+                                     iregion_code,ystore_outer_core,zstore_outer_core,ibool_outer_core,dummy_ispec_is_tiso, &
+                                     xix_outer_core,xiy_outer_core,xiz_outer_core, &
+                                     etax_outer_core,etay_outer_core,etaz_outer_core,&
+                                     gammax_outer_core,gammay_outer_core,gammaz_outer_core,hprime_xx,hprime_yy,hprime_zz, &
+                                     rhostore_outer_core,kappavstore_outer_core,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array, &
+                                     k_bot,ibelm_top_outer_core,normal_top_outer_core, &
+                                     cmb_kl_bot,fluid_solid_boundary,NSPEC2D_CMB)
 
   cmb_kl = cmb_kl + (cmb_kl_top - cmb_kl_bot) * deltat
 
   ! ICB
   fluid_solid_boundary = .true.
-  call compute_boundary_kernel_depth(vector_displ_outer_core,vector_accel_outer_core, &
-             b_vector_displ_outer_core,nspec_outer_core, &
-             iregion_code,ystore_outer_core,zstore_outer_core,ibool_outer_core,dummy_ispec_is_tiso, &
-             xix_outer_core,xiy_outer_core,xiz_outer_core, &
-             etax_outer_core,etay_outer_core,etaz_outer_core,&
-             gammax_outer_core,gammay_outer_core,gammaz_outer_core,hprime_xx,hprime_yy,hprime_zz, &
-             rhostore_outer_core,kappavstore_outer_core,dummy_array, &
-             dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array, &
-             k_top,ibelm_bottom_outer_core,normal_bottom_outer_core, &
-             icb_kl_top,fluid_solid_boundary,NSPEC2D_ICB)
+  call compute_boundary_kernel_depth(vector_displ_outer_core,vector_accel_outer_core,b_vector_displ_outer_core, &
+                                     nspec_outer_core, &
+                                     iregion_code,ystore_outer_core,zstore_outer_core,ibool_outer_core,dummy_ispec_is_tiso, &
+                                     xix_outer_core,xiy_outer_core,xiz_outer_core, &
+                                     etax_outer_core,etay_outer_core,etaz_outer_core,&
+                                     gammax_outer_core,gammay_outer_core,gammaz_outer_core,hprime_xx,hprime_yy,hprime_zz, &
+                                     rhostore_outer_core,kappavstore_outer_core,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array, &
+                                     k_top,ibelm_bottom_outer_core,normal_bottom_outer_core, &
+                                     icb_kl_top,fluid_solid_boundary,NSPEC2D_ICB)
 
   deallocate(dummy_ispec_is_tiso)
 
@@ -317,22 +354,22 @@
   allocate(dummy_ispec_is_tiso(NSPEC_INNER_CORE))
   dummy_ispec_is_tiso(:) = .false.
 
-  call compute_boundary_kernel_depth(displ_inner_core,accel_inner_core, &
-             b_displ_inner_core,nspec_inner_core,iregion_code, &
-             ystore_inner_core,zstore_inner_core,ibool_inner_core,dummy_ispec_is_tiso, &
-             xix_inner_core,xiy_inner_core,xiz_inner_core, &
-             etax_inner_core,etay_inner_core,etaz_inner_core,&
-             gammax_inner_core,gammay_inner_core,gammaz_inner_core,hprime_xx,hprime_yy,hprime_zz, &
-             rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
-             dummy_array,dummy_array,dummy_array, &
-             c11store_inner_core,c12store_inner_core,c13store_inner_core,dummy_array, &
-             dummy_array,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array,dummy_array, &
-             c33store_inner_core,dummy_array,dummy_array, &
-             dummy_array,c44store_inner_core,dummy_array,dummy_array, &
-             dummy_array,dummy_array,dummy_array, &
-             k_bot,ibelm_top_inner_core,normal_bottom_outer_core, &
-             icb_kl_bot,fluid_solid_boundary,NSPEC2D_ICB)
+  call compute_boundary_kernel_depth(displ_inner_core,accel_inner_core,b_displ_inner_core, &
+                                     nspec_inner_core,iregion_code, &
+                                     ystore_inner_core,zstore_inner_core,ibool_inner_core,dummy_ispec_is_tiso, &
+                                     xix_inner_core,xiy_inner_core,xiz_inner_core, &
+                                     etax_inner_core,etay_inner_core,etaz_inner_core,&
+                                     gammax_inner_core,gammay_inner_core,gammaz_inner_core,hprime_xx,hprime_yy,hprime_zz, &
+                                     rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
+                                     dummy_array,dummy_array,dummy_array, &
+                                     c11store_inner_core,c12store_inner_core,c13store_inner_core,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array,dummy_array, &
+                                     c33store_inner_core,dummy_array,dummy_array, &
+                                     dummy_array,c44store_inner_core,dummy_array,dummy_array, &
+                                     dummy_array,dummy_array,dummy_array, &
+                                     k_bot,ibelm_top_inner_core,normal_bottom_outer_core, &
+                                     icb_kl_bot,fluid_solid_boundary,NSPEC2D_ICB)
   deallocate(dummy_ispec_is_tiso)
 
   icb_kl = icb_kl + (icb_kl_top - icb_kl_bot) * deltat
@@ -343,7 +380,8 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine compute_boundary_kernel_depth(displ,accel,b_displ,nspec,iregion_code, &
+  subroutine compute_boundary_kernel_depth(displ,accel,b_displ, &
+                                           nspec,iregion_code, &
                                            ystore,zstore,ibool,ispec_is_tiso, &
                                            xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
                                            hprime_xx,hprime_yy,hprime_zz, &

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -82,7 +82,7 @@
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
 
   real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
 
@@ -96,8 +96,8 @@
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
 
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc
 
 ! local parameters
   real(kind=CUSTOM_REAL) one_minus_sum_beta_use
@@ -190,11 +190,11 @@
       else
         epsilon_trace_over_3(INDEX_IJK,ispec) = templ
       endif
-      epsilondev_loc(1,INDEX_IJK) = duxdxl - templ
-      epsilondev_loc(2,INDEX_IJK) = duydyl - templ
-      epsilondev_loc(3,INDEX_IJK) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-      epsilondev_loc(4,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-      epsilondev_loc(5,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+      epsilondev_loc(INDEX_IJK,1) = duxdxl - templ
+      epsilondev_loc(INDEX_IJK,2) = duydyl - templ
+      epsilondev_loc(INDEX_IJK,3) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+      epsilondev_loc(INDEX_IJK,4) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+      epsilondev_loc(INDEX_IJK,5) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
     endif
 
     !
@@ -237,50 +237,47 @@
 
       ! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop
       ! in order to vectorize the outer loop
-      R_xx_val = R_xx(1,INDEX_IJK,ispec)
-      R_yy_val = R_yy(1,INDEX_IJK,ispec)
+      R_xx_val = R_xx(INDEX_IJK,1,ispec)
+      R_yy_val = R_yy(INDEX_IJK,1,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(1,INDEX_IJK,ispec)
-      sigma_xz = sigma_xz - R_xz(1,INDEX_IJK,ispec)
-      sigma_yz = sigma_yz - R_yz(1,INDEX_IJK,ispec)
+      sigma_xy = sigma_xy - R_xy(INDEX_IJK,1,ispec)
+      sigma_xz = sigma_xz - R_xz(INDEX_IJK,1,ispec)
+      sigma_yz = sigma_yz - R_yz(INDEX_IJK,1,ispec)
 
-      R_xx_val = R_xx(2,INDEX_IJK,ispec)
-      R_yy_val = R_yy(2,INDEX_IJK,ispec)
+      R_xx_val = R_xx(INDEX_IJK,2,ispec)
+      R_yy_val = R_yy(INDEX_IJK,2,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(2,INDEX_IJK,ispec)
-      sigma_xz = sigma_xz - R_xz(2,INDEX_IJK,ispec)
-      sigma_yz = sigma_yz - R_yz(2,INDEX_IJK,ispec)
+      sigma_xy = sigma_xy - R_xy(INDEX_IJK,2,ispec)
+      sigma_xz = sigma_xz - R_xz(INDEX_IJK,2,ispec)
+      sigma_yz = sigma_yz - R_yz(INDEX_IJK,2,ispec)
 
-      R_xx_val = R_xx(3,INDEX_IJK,ispec)
-      R_yy_val = R_yy(3,INDEX_IJK,ispec)
+      R_xx_val = R_xx(INDEX_IJK,3,ispec)
+      R_yy_val = R_yy(INDEX_IJK,3,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(3,INDEX_IJK,ispec)
-      sigma_xz = sigma_xz - R_xz(3,INDEX_IJK,ispec)
-      sigma_yz = sigma_yz - R_yz(3,INDEX_IJK,ispec)
+      sigma_xy = sigma_xy - R_xy(INDEX_IJK,3,ispec)
+      sigma_xz = sigma_xz - R_xz(INDEX_IJK,3,ispec)
+      sigma_yz = sigma_yz - R_yz(INDEX_IJK,3,ispec)
 #else
 !daniel debug: att - debug update
 !          call compute_element_att_mem_up_cm(ispec,INDEX_IJK, &
-!                                          R_xx(1,INDEX_IJK,ispec), &
-!                                          R_yy(1,INDEX_IJK,ispec), &
-!                                          R_xy(1,INDEX_IJK,ispec), &
-!                                          R_xz(1,INDEX_IJK,ispec), &
-!                                          R_yz(1,INDEX_IJK,ispec), &
-!                                          epsilondev_loc(:,INDEX_IJK),muvstore(INDEX_IJK,ispec),is_backward_field)
+!                                          R_xx(INDEX_IJK,1,ispec), &
+!                                          R_yy(INDEX_IJK,1,ispec), &
+!                                          R_xy(INDEX_IJK,1,ispec), &
+!                                          R_xz(INDEX_IJK,1,ispec), &
+!                                          R_yz(INDEX_IJK,1,ispec), &
+!                                          epsilondev_loc(INDEX_IJK,:),muvstore(INDEX_IJK,ispec),is_backward_field)
 
       ! note: function inlining is generally done by fortran compilers;
       !       compilers decide based on performance heuristics
       ! note: fortran passes pointers to array location, thus R_memory(1,1,...) should be fine
-      call compute_element_att_stress(R_xx(1,INDEX_IJK,ispec), &
-                                      R_yy(1,INDEX_IJK,ispec), &
-                                      R_xy(1,INDEX_IJK,ispec), &
-                                      R_xz(1,INDEX_IJK,ispec), &
-                                      R_yz(1,INDEX_IJK,ispec), &
+      call compute_element_att_stress(i,j,k,R_xx(1,1,1,1,ispec),R_yy(1,1,1,1,ispec),R_xy(1,1,1,1,ispec), &
+                                      R_xz(1,1,1,1,ispec),R_yz(1,1,1,1,ispec), &
                                       sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
 #endif
 
@@ -361,9 +358,9 @@
 
           ! precompute vector
           factor = dble(jacobianl) * wgll_cube(INDEX_IJK)
-          rho_s_H(1,INDEX_IJK) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
-          rho_s_H(2,INDEX_IJK) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
-          rho_s_H(3,INDEX_IJK) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
+          rho_s_H(INDEX_IJK,1) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
+          rho_s_H(INDEX_IJK,2) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
+          rho_s_H(INDEX_IJK,3) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
 
         else
 
@@ -388,9 +385,9 @@
 
           ! precompute vector
           factor = jacobianl * wgll_cube(INDEX_IJK)
-          rho_s_H(1,INDEX_IJK) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-          rho_s_H(2,INDEX_IJK) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-          rho_s_H(3,INDEX_IJK) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+          rho_s_H(INDEX_IJK,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+          rho_s_H(INDEX_IJK,2) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+          rho_s_H(INDEX_IJK,3) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
 
         endif
 
@@ -466,7 +463,7 @@
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
 
   real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
 
@@ -484,8 +481,8 @@
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
 
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc
 
 ! local parameters
   real(kind=CUSTOM_REAL) one_minus_sum_beta_use
@@ -591,11 +588,11 @@
       else
         epsilon_trace_over_3(INDEX_IJK,ispec) = templ
       endif
-      epsilondev_loc(1,INDEX_IJK) = duxdxl - templ
-      epsilondev_loc(2,INDEX_IJK) = duydyl - templ
-      epsilondev_loc(3,INDEX_IJK) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-      epsilondev_loc(4,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-      epsilondev_loc(5,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+      epsilondev_loc(INDEX_IJK,1) = duxdxl - templ
+      epsilondev_loc(INDEX_IJK,2) = duydyl - templ
+      epsilondev_loc(INDEX_IJK,3) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+      epsilondev_loc(INDEX_IJK,4) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+      epsilondev_loc(INDEX_IJK,5) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
     endif
 
     !
@@ -822,41 +819,38 @@
 
       ! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop
       ! in order to vectorize the outer loop
-      R_xx_val = R_xx(1,INDEX_IJK,ispec)
-      R_yy_val = R_yy(1,INDEX_IJK,ispec)
+      R_xx_val = R_xx(INDEX_IJK,1,ispec)
+      R_yy_val = R_yy(INDEX_IJK,1,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(1,INDEX_IJK,ispec)
-      sigma_xz = sigma_xz - R_xz(1,INDEX_IJK,ispec)
-      sigma_yz = sigma_yz - R_yz(1,INDEX_IJK,ispec)
+      sigma_xy = sigma_xy - R_xy(INDEX_IJK,1,ispec)
+      sigma_xz = sigma_xz - R_xz(INDEX_IJK,1,ispec)
+      sigma_yz = sigma_yz - R_yz(INDEX_IJK,1,ispec)
 
-      R_xx_val = R_xx(2,INDEX_IJK,ispec)
-      R_yy_val = R_yy(2,INDEX_IJK,ispec)
+      R_xx_val = R_xx(INDEX_IJK,2,ispec)
+      R_yy_val = R_yy(INDEX_IJK,2,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(2,INDEX_IJK,ispec)
-      sigma_xz = sigma_xz - R_xz(2,INDEX_IJK,ispec)
-      sigma_yz = sigma_yz - R_yz(2,INDEX_IJK,ispec)
+      sigma_xy = sigma_xy - R_xy(INDEX_IJK,2,ispec)
+      sigma_xz = sigma_xz - R_xz(INDEX_IJK,2,ispec)
+      sigma_yz = sigma_yz - R_yz(INDEX_IJK,2,ispec)
 
-      R_xx_val = R_xx(3,ijk,1,1,ispec)
-      R_yy_val = R_yy(3,ijk,1,1,ispec)
+      R_xx_val = R_xx(INDEX_IJK,3,ispec)
+      R_yy_val = R_yy(INDEX_IJK,3,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(3,INDEX_IJK,ispec)
-      sigma_xz = sigma_xz - R_xz(3,INDEX_IJK,ispec)
-      sigma_yz = sigma_yz - R_yz(3,INDEX_IJK,ispec)
+      sigma_xy = sigma_xy - R_xy(INDEX_IJK,3,ispec)
+      sigma_xz = sigma_xz - R_xz(INDEX_IJK,3,ispec)
+      sigma_yz = sigma_yz - R_yz(INDEX_IJK,3,ispec)
 #else
       ! note: function inlining is generally done by fortran compilers;
       !       compilers decide based on performance heuristics
       ! note: fortran passes pointers to array location, thus R_memory(1,1,...) is fine
-      call compute_element_att_stress(R_xx(1,INDEX_IJK,ispec), &
-                                      R_yy(1,INDEX_IJK,ispec), &
-                                      R_xy(1,INDEX_IJK,ispec), &
-                                      R_xz(1,INDEX_IJK,ispec), &
-                                      R_yz(1,INDEX_IJK,ispec), &
+      call compute_element_att_stress(i,j,k,R_xx(1,1,1,1,ispec),R_yy(1,1,1,1,ispec),R_xy(1,1,1,1,ispec), &
+                                      R_xz(1,1,1,1,ispec),R_yz(1,1,1,1,ispec), &
                                       sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
 #endif
 
@@ -936,9 +930,9 @@
 
           ! precompute vector
           factor = dble(jacobianl) * wgll_cube(INDEX_IJK)
-          rho_s_H(1,INDEX_IJK) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
-          rho_s_H(2,INDEX_IJK) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
-          rho_s_H(3,INDEX_IJK) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
+          rho_s_H(INDEX_IJK,1) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
+          rho_s_H(INDEX_IJK,2) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
+          rho_s_H(INDEX_IJK,3) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
 
         else
 
@@ -963,9 +957,9 @@
 
           ! precompute vector
           factor = jacobianl * wgll_cube(INDEX_IJK)
-          rho_s_H(1,INDEX_IJK) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-          rho_s_H(2,INDEX_IJK) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-          rho_s_H(3,INDEX_IJK) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+          rho_s_H(INDEX_IJK,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+          rho_s_H(INDEX_IJK,2) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+          rho_s_H(INDEX_IJK,3) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
 
         endif
 
@@ -1041,7 +1035,7 @@
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
 
   real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
 
@@ -1059,8 +1053,8 @@
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
 
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc
 
   ! local parameters
   real(kind=CUSTOM_REAL) minus_sum_beta,mul
@@ -1154,11 +1148,11 @@
       else
         epsilon_trace_over_3(INDEX_IJK,ispec) = templ
       endif
-      epsilondev_loc(1,INDEX_IJK) = duxdxl - templ
-      epsilondev_loc(2,INDEX_IJK) = duydyl - templ
-      epsilondev_loc(3,INDEX_IJK) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-      epsilondev_loc(4,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-      epsilondev_loc(5,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+      epsilondev_loc(INDEX_IJK,1) = duxdxl - templ
+      epsilondev_loc(INDEX_IJK,2) = duydyl - templ
+      epsilondev_loc(INDEX_IJK,3) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+      epsilondev_loc(INDEX_IJK,4) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+      epsilondev_loc(INDEX_IJK,5) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
     endif
 
     !
@@ -1232,42 +1226,39 @@
 
       ! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop
       ! in order to vectorize the outer loop
-      R_xx_val = R_xx(1,INDEX_IJK,ispec)
-      R_yy_val = R_yy(1,INDEX_IJK,ispec)
+      R_xx_val = R_xx(INDEX_IJK,1,ispec)
+      R_yy_val = R_yy(INDEX_IJK,1,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(1,INDEX_IJK,ispec)
-      sigma_xz = sigma_xz - R_xz(1,INDEX_IJK,ispec)
-      sigma_yz = sigma_yz - R_yz(1,INDEX_IJK,ispec)
+      sigma_xy = sigma_xy - R_xy(INDEX_IJK,1,ispec)
+      sigma_xz = sigma_xz - R_xz(INDEX_IJK,1,ispec)
+      sigma_yz = sigma_yz - R_yz(INDEX_IJK,1,ispec)
 
-      R_xx_val = R_xx(2,INDEX_IJK,ispec)
-      R_yy_val = R_yy(2,INDEX_IJK,ispec)
+      R_xx_val = R_xx(INDEX_IJK,2,ispec)
+      R_yy_val = R_yy(INDEX_IJK,2,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(2,INDEX_IJK,ispec)
-      sigma_xz = sigma_xz - R_xz(2,INDEX_IJK,ispec)
-      sigma_yz = sigma_yz - R_yz(2,INDEX_IJK,ispec)
+      sigma_xy = sigma_xy - R_xy(INDEX_IJK,2,ispec)
+      sigma_xz = sigma_xz - R_xz(INDEX_IJK,2,ispec)
+      sigma_yz = sigma_yz - R_yz(INDEX_IJK,2,ispec)
 
-      R_xx_val = R_xx(3,INDEX_IJK,ispec)
-      R_yy_val = R_yy(3,INDEX_IJK,ispec)
+      R_xx_val = R_xx(INDEX_IJK,3,ispec)
+      R_yy_val = R_yy(INDEX_IJK,3,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(3,INDEX_IJK,ispec)
-      sigma_xz = sigma_xz - R_xz(3,INDEX_IJK,ispec)
-      sigma_yz = sigma_yz - R_yz(3,INDEX_IJK,ispec)
+      sigma_xy = sigma_xy - R_xy(INDEX_IJK,3,ispec)
+      sigma_xz = sigma_xz - R_xz(INDEX_IJK,3,ispec)
+      sigma_yz = sigma_yz - R_yz(INDEX_IJK,3,ispec)
 #else
       ! note: function inlining is generally done by fortran compilers;
       !       compilers decide based on performance heuristics
 
       ! note: Fortran passes pointers to array location, thus R_memory(1,1,...) is fine
-      call compute_element_att_stress(R_xx(1,INDEX_IJK,ispec), &
-                                      R_yy(1,INDEX_IJK,ispec), &
-                                      R_xy(1,INDEX_IJK,ispec), &
-                                      R_xz(1,INDEX_IJK,ispec), &
-                                      R_yz(1,INDEX_IJK,ispec), &
+      call compute_element_att_stress(i,j,k,R_xx(1,1,1,1,ispec),R_yy(1,1,1,1,ispec),R_xy(1,1,1,1,ispec), &
+                                      R_xz(1,1,1,1,ispec),R_yz(1,1,1,1,ispec), &
                                       sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
 #endif
     endif ! ATTENUATION_VAL
@@ -1346,9 +1337,9 @@
 
           ! precompute vector
           factor = dble(jacobianl) * wgll_cube(INDEX_IJK)
-          rho_s_H(1,INDEX_IJK) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
-          rho_s_H(2,INDEX_IJK) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
-          rho_s_H(3,INDEX_IJK) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
+          rho_s_H(INDEX_IJK,1) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
+          rho_s_H(INDEX_IJK,2) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
+          rho_s_H(INDEX_IJK,3) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
 
         else
 
@@ -1373,9 +1364,9 @@
 
           ! precompute vector
           factor = jacobianl * wgll_cube(INDEX_IJK)
-          rho_s_H(1,INDEX_IJK) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-          rho_s_H(2,INDEX_IJK) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-          rho_s_H(3,INDEX_IJK) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+          rho_s_H(INDEX_IJK,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+          rho_s_H(INDEX_IJK,2) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+          rho_s_H(INDEX_IJK,3) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
 
         endif
 
@@ -1410,44 +1401,579 @@
 !
 
 
-  subroutine compute_element_att_stress(R_xx_loc,R_yy_loc,R_xy_loc,R_xz_loc,R_yz_loc, &
+  subroutine compute_element_att_stress(i,j,k,R_xx_loc,R_yy_loc,R_xy_loc,R_xz_loc,R_yz_loc, &
                                        sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
 
 ! updates stress with attenuation correction
 
 
-  use constants_solver,only: CUSTOM_REAL,N_SLS
+  use constants_solver,only: CUSTOM_REAL,N_SLS,NGLLX,NGLLY,NGLLZ
 
   implicit none
 
+  integer, intent(in) :: i,j,k
   ! attenuation
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-!  real(kind=CUSTOM_REAL), dimension(5,N_SLS) :: R_memory_loc
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xx_loc
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_yy_loc
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xy_loc
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xz_loc
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_yz_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS) :: R_xx_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS) :: R_yy_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS) :: R_xy_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS) :: R_xz_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS) :: R_yz_loc
 
   real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
 
 ! local parameters
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1
+  real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
   integer :: i_SLS
 
   do i_SLS = 1,N_SLS
-    R_xx_val1 = R_xx_loc(i_SLS) ! R_memory(1,i_SLS,i,j,k,ispec)
-    R_yy_val1 = R_yy_loc(i_SLS) ! R_memory(2,i_SLS,i,j,k,ispec)
-    sigma_xx = sigma_xx - R_xx_val1
-    sigma_yy = sigma_yy - R_yy_val1
-    sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-    sigma_xy = sigma_xy - R_xy_loc(i_SLS) ! R_memory(3,i_SLS,i,j,k,ispec)
-    sigma_xz = sigma_xz - R_xz_loc(i_SLS) ! R_memory(4,i_SLS,i,j,k,ispec)
-    sigma_yz = sigma_yz - R_yz_loc(i_SLS) ! R_memory(5,i_SLS,i,j,k,ispec)
+    R_xx_val = R_xx_loc(i,j,k,i_SLS)
+    R_yy_val = R_yy_loc(i,j,k,i_SLS)
+    sigma_xx = sigma_xx - R_xx_val
+    sigma_yy = sigma_yy - R_yy_val
+    sigma_zz = sigma_zz + R_xx_val + R_yy_val
+    sigma_xy = sigma_xy - R_xy_loc(i,j,k,i_SLS)
+    sigma_xz = sigma_xz - R_xz_loc(i,j,k,i_SLS)
+    sigma_yz = sigma_yz - R_yz_loc(i,j,k,i_SLS)
   enddo
 
   end subroutine compute_element_att_stress
 
+!--------------------------------------------------------------------------------------------
+!
+! Deville et al. 2002
+! Higher-Order Methods for Incompressible Fluid Flow
+!
+! subroutines adapted from Deville, Fischer and Mund, High-order methods
+! for incompressible fluid flow, Cambridge University Press (2002),
+! pages 386 and 389 and Figure 8.3.1
+!
+!--------------------------------------------------------------------------------------------
 
+! matrix - matrix multiplications
+
+! single component routines
+
+  subroutine mxm(A,n1,B,n2,C,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n2,n3
+  real(kind=CUSTOM_REAL),dimension(n1,n2) :: A
+  real(kind=CUSTOM_REAL),dimension(n2,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C
+
+  ! chooses optimized version
+  select case( n2 )
+
+  case( 4 )
+    call mxm4(A,n1,B,C,n3)
+
+  case( 5 )
+    call mxm5(A,n1,B,C,n3)
+
+  case( 6 )
+    call mxm6(A,n1,B,C,n3)
+
+  case( 7 )
+    call mxm7(A,n1,B,C,n3)
+
+  case( 8 )
+    call mxm8(A,n1,B,C,n3)
+
+  case default
+    call mxmN(A,n1,B,n2,C,n3)
+
+  end select
+
+  end subroutine mxm
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm4(A,n1,B,C,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,4) :: A
+  real(kind=CUSTOM_REAL),dimension(4,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C(i,j) =  A(i,1) * B(1,j) &
+              + A(i,2) * B(2,j) &
+              + A(i,3) * B(3,j) &
+              + A(i,4) * B(4,j)
+    enddo
+  enddo
+
+  end subroutine mxm4
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5(A,n1,B,C,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5) :: A
+  real(kind=CUSTOM_REAL),dimension(5,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C(i,j) =  A(i,1) * B(1,j) &
+              + A(i,2) * B(2,j) &
+              + A(i,3) * B(3,j) &
+              + A(i,4) * B(4,j) &
+              + A(i,5) * B(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm6(A,n1,B,C,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,6) :: A
+  real(kind=CUSTOM_REAL),dimension(6,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C(i,j) =  A(i,1) * B(1,j) &
+              + A(i,2) * B(2,j) &
+              + A(i,3) * B(3,j) &
+              + A(i,4) * B(4,j) &
+              + A(i,5) * B(5,j) &
+              + A(i,6) * B(6,j)
+    enddo
+  enddo
+
+  end subroutine mxm6
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm7(A,n1,B,C,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,7) :: A
+  real(kind=CUSTOM_REAL),dimension(7,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C(i,j) =  A(i,1) * B(1,j) &
+              + A(i,2) * B(2,j) &
+              + A(i,3) * B(3,j) &
+              + A(i,4) * B(4,j) &
+              + A(i,5) * B(5,j) &
+              + A(i,6) * B(6,j) &
+              + A(i,7) * B(7,j)
+    enddo
+  enddo
+
+  end subroutine mxm7
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm8(A,n1,B,C,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,8) :: A
+  real(kind=CUSTOM_REAL),dimension(8,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C(i,j) =  A(i,1) * B(1,j) &
+              + A(i,2) * B(2,j) &
+              + A(i,3) * B(3,j) &
+              + A(i,4) * B(4,j) &
+              + A(i,5) * B(5,j) &
+              + A(i,6) * B(6,j) &
+              + A(i,7) * B(7,j) &
+              + A(i,8) * B(8,j)
+    enddo
+  enddo
+
+  end subroutine mxm8
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxmN(A,n1,B,n2,C,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n2,n3
+  real(kind=CUSTOM_REAL),dimension(n1,n2) :: A
+  real(kind=CUSTOM_REAL),dimension(n2,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C
+
+  ! local parameters
+  integer :: i,j,k
+  real(kind=CUSTOM_REAL) :: tmp
+
+  ! general matrix-matrix multiplication
+  do j=1,n3
+    do k=1,n2
+      tmp = B(k,j)
+      do i=1,n1
+        C(i,j) = C(i,j) + A(i,k) * tmp
+      enddo
+    enddo
+  enddo
+
+  end subroutine mxmN
+
+
+!----------------------------------------------------------------------------------------------
+
+
+
+! 3-component routines: combines arrays A1,A2,A3 which correspond to 3 different components x/y/z
+
+  subroutine mxm_3comp(A1,A2,A3,n1,B1,B2,B3,n2,C1,C2,C3,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n2,n3
+  real(kind=CUSTOM_REAL),dimension(n1,n2) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(n2,n3) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! chooses optimized version
+  select case( n2 )
+
+  case( 4 )
+    call mxm4_3comp(A1,A2,A3,n1,B1,B2,B3,C1,C2,C3,n3)
+
+  case( 5 )
+    call mxm5_3comp(A1,A2,A3,n1,B1,B2,B3,C1,C2,C3,n3)
+
+  case( 6 )
+    call mxm6_3comp(A1,A2,A3,n1,B1,B2,B3,C1,C2,C3,n3)
+
+  case( 7 )
+    call mxm7_3comp(A1,A2,A3,n1,B1,B2,B3,C1,C2,C3,n3)
+
+  case( 8 )
+    call mxm8_3comp(A1,A2,A3,n1,B1,B2,B3,C1,C2,C3,n3)
+
+  case default
+    call mxmN_3comp(A1,A2,A3,n1,B1,B2,B3,n2,C1,C2,C3,n3)
+
+  end select
+
+  end subroutine mxm_3comp
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm4_3comp(A1,A2,A3,n1,B1,B2,B3,C1,C2,C3,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,4) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(4,n3) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A1(i,1) * B1(1,j) &
+               + A1(i,2) * B1(2,j) &
+               + A1(i,3) * B1(3,j) &
+               + A1(i,4) * B1(4,j)
+
+      C2(i,j) =  A2(i,1) * B2(1,j) &
+               + A2(i,2) * B2(2,j) &
+               + A2(i,3) * B2(3,j) &
+               + A2(i,4) * B2(4,j)
+
+      C3(i,j) =  A3(i,1) * B3(1,j) &
+               + A3(i,2) * B3(2,j) &
+               + A3(i,3) * B3(3,j) &
+               + A3(i,4) * B3(4,j)
+    enddo
+  enddo
+
+  end subroutine mxm4_3comp
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3comp(A1,A2,A3,n1,B1,B2,B3,C1,C2,C3,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(5,n3) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A1(i,1) * B1(1,j) &
+               + A1(i,2) * B1(2,j) &
+               + A1(i,3) * B1(3,j) &
+               + A1(i,4) * B1(4,j) &
+               + A1(i,5) * B1(5,j)
+
+      C2(i,j) =  A2(i,1) * B2(1,j) &
+               + A2(i,2) * B2(2,j) &
+               + A2(i,3) * B2(3,j) &
+               + A2(i,4) * B2(4,j) &
+               + A2(i,5) * B2(5,j)
+
+      C3(i,j) =  A3(i,1) * B3(1,j) &
+               + A3(i,2) * B3(2,j) &
+               + A3(i,3) * B3(3,j) &
+               + A3(i,4) * B3(4,j) &
+               + A3(i,5) * B3(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp
+
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm6_3comp(A1,A2,A3,n1,B1,B2,B3,C1,C2,C3,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,6) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(6,n3) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A1(i,1) * B1(1,j) &
+               + A1(i,2) * B1(2,j) &
+               + A1(i,3) * B1(3,j) &
+               + A1(i,4) * B1(4,j) &
+               + A1(i,5) * B1(5,j) &
+               + A1(i,6) * B1(6,j)
+
+      C2(i,j) =  A2(i,1) * B2(1,j) &
+               + A2(i,2) * B2(2,j) &
+               + A2(i,3) * B2(3,j) &
+               + A2(i,4) * B2(4,j) &
+               + A2(i,5) * B2(5,j) &
+               + A2(i,6) * B2(6,j)
+
+      C3(i,j) =  A3(i,1) * B3(1,j) &
+               + A3(i,2) * B3(2,j) &
+               + A3(i,3) * B3(3,j) &
+               + A3(i,4) * B3(4,j) &
+               + A3(i,5) * B3(5,j) &
+               + A3(i,6) * B3(6,j)
+    enddo
+  enddo
+
+  end subroutine mxm6_3comp
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm7_3comp(A1,A2,A3,n1,B1,B2,B3,C1,C2,C3,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,7) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(7,n3) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A1(i,1) * B1(1,j) &
+               + A1(i,2) * B1(2,j) &
+               + A1(i,3) * B1(3,j) &
+               + A1(i,4) * B1(4,j) &
+               + A1(i,5) * B1(5,j) &
+               + A1(i,6) * B1(6,j) &
+               + A1(i,7) * B1(7,j)
+
+      C2(i,j) =  A2(i,1) * B2(1,j) &
+               + A2(i,2) * B2(2,j) &
+               + A2(i,3) * B2(3,j) &
+               + A2(i,4) * B2(4,j) &
+               + A2(i,5) * B2(5,j) &
+               + A2(i,6) * B2(6,j) &
+               + A2(i,7) * B2(7,j)
+
+      C3(i,j) =  A3(i,1) * B3(1,j) &
+               + A3(i,2) * B3(2,j) &
+               + A3(i,3) * B3(3,j) &
+               + A3(i,4) * B3(4,j) &
+               + A3(i,5) * B3(5,j) &
+               + A3(i,6) * B3(6,j) &
+               + A3(i,7) * B3(7,j)
+    enddo
+  enddo
+
+  end subroutine mxm7_3comp
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm8_3comp(A1,A2,A3,n1,B1,B2,B3,C1,C2,C3,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,8) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(8,n3) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A1(i,1) * B1(1,j) &
+               + A1(i,2) * B1(2,j) &
+               + A1(i,3) * B1(3,j) &
+               + A1(i,4) * B1(4,j) &
+               + A1(i,5) * B1(5,j) &
+               + A1(i,6) * B1(6,j) &
+               + A1(i,7) * B1(7,j) &
+               + A1(i,8) * B1(8,j)
+
+      C2(i,j) =  A2(i,1) * B2(1,j) &
+               + A2(i,2) * B2(2,j) &
+               + A2(i,3) * B2(3,j) &
+               + A2(i,4) * B2(4,j) &
+               + A2(i,5) * B2(5,j) &
+               + A2(i,6) * B2(6,j) &
+               + A2(i,7) * B2(7,j) &
+               + A2(i,8) * B2(8,j)
+
+      C3(i,j) =  A3(i,1) * B3(1,j) &
+               + A3(i,2) * B3(2,j) &
+               + A3(i,3) * B3(3,j) &
+               + A3(i,4) * B3(4,j) &
+               + A3(i,5) * B3(5,j) &
+               + A3(i,6) * B3(6,j) &
+               + A3(i,7) * B3(7,j) &
+               + A3(i,8) * B3(8,j)
+    enddo
+  enddo
+
+  end subroutine mxm8_3comp
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxmN_3comp(A1,A2,A3,n1,B1,B2,B3,n2,C1,C2,C3,n3)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n2,n3
+  real(kind=CUSTOM_REAL),dimension(n1,n2) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(n2,n3) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j,k
+  real(kind=CUSTOM_REAL) :: tmp1,tmp2,tmp3
+
+  ! general matrix-matrix multiplication
+  do j=1,n3
+    do k=1,n2
+      tmp1 = B1(k,j)
+      tmp2 = B2(k,j)
+      tmp3 = B3(k,j)
+      do i=1,n1
+        C1(i,j) = C1(i,j) + A1(i,k) * tmp1
+        C2(i,j) = C2(i,j) + A2(i,k) * tmp2
+        C3(i,j) = C3(i,j) + A3(i,k) * tmp3
+      enddo
+    enddo
+  enddo
+
+  end subroutine mxmN_3comp
+
+
+
+
+

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_att_memory.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_att_memory.F90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_att_memory.F90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -25,7 +25,12 @@
 !
 !=====================================================================
 
+! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION
+! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh
+#include "config.fh"
 
+
+
 !--------------------------------------------------------------------------------------------
 !
 ! crust/mantle region
@@ -67,12 +72,12 @@
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
 
   ! variable sized array variables
   integer :: vx,vy,vz,vnspec
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS,vx,vy,vz,vnspec) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(vx,vy,vz,N_SLS,vnspec) :: factor_common
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_MANTLE) :: c44store
@@ -81,7 +86,7 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
     epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc
 
 ! local parameters
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: factor_common_c44_muv
@@ -89,6 +94,8 @@
 
 #ifdef FORCE_VECTORIZATION
   integer :: ijk
+#else
+  integer :: i,j,k
 #endif
 
   ! use Runge-Kutta scheme to march in time
@@ -97,87 +104,65 @@
   ! IMPROVE we use mu_v here even if there is some anisotropy
   ! IMPROVE we should probably use an average value instead
 
-#ifdef FORCE_VECTORIZATION
 
   do i_SLS = 1,N_SLS
+
     ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
     if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
+
       if(ANISOTROPIC_3D_MANTLE_VAL) then
-        do ijk=1,NGLLCUBE
-          factor_common_c44_muv(ijk,1,1) = factor_common(i_SLS,ijk,1,1,ispec) * c44store(ijk,1,1,ispec)
-        enddo
-      else
-        do ijk=1,NGLLCUBE
-          factor_common_c44_muv(ijk,1,1) = factor_common(i_SLS,ijk,1,1,ispec) * muvstore(ijk,1,1,ispec)
-        enddo
-      endif
-    else
-      if(ANISOTROPIC_3D_MANTLE_VAL) then
-        do ijk=1,NGLLCUBE
-          factor_common_c44_muv(ijk,1,1) = factor_common(i_SLS,1,1,1,ispec) * c44store(ijk,1,1,ispec)
-        enddo
-      else
-        do ijk=1,NGLLCUBE
-          factor_common_c44_muv(ijk,1,1) = factor_common(i_SLS,1,1,1,ispec) * muvstore(ijk,1,1,ispec)
-        enddo
-      endif
-    endif
 
-    do ijk=1,NGLLCUBE
-      R_xx(i_SLS,ijk,1,1,ispec) = alphaval(i_SLS) * R_xx(i_SLS,ijk,1,1,ispec) + factor_common_c44_muv(ijk,1,1) * &
-          (betaval(i_SLS) * epsilondev_xx(ijk,1,1,ispec) + gammaval(i_SLS) * epsilondev_loc(1,ijk,1,1))
+        DO_LOOP_IJK
+          factor_common_c44_muv(INDEX_IJK) = factor_common(INDEX_IJK,i_SLS,ispec) * c44store(INDEX_IJK,ispec)
+        ENDDO_LOOP_IJK
 
-      R_yy(i_SLS,ijk,1,1,ispec) = alphaval(i_SLS) * R_yy(i_SLS,ijk,1,1,ispec) + factor_common_c44_muv(ijk,1,1) * &
-          (betaval(i_SLS) * epsilondev_yy(ijk,1,1,ispec) + gammaval(i_SLS) * epsilondev_loc(2,ijk,1,1))
+      else
 
-      R_xy(i_SLS,ijk,1,1,ispec) = alphaval(i_SLS) * R_xy(i_SLS,ijk,1,1,ispec) + factor_common_c44_muv(ijk,1,1) * &
-          (betaval(i_SLS) * epsilondev_xy(ijk,1,1,ispec) + gammaval(i_SLS) * epsilondev_loc(3,ijk,1,1))
+        DO_LOOP_IJK
+          factor_common_c44_muv(INDEX_IJK) = factor_common(INDEX_IJK,i_SLS,ispec) * muvstore(INDEX_IJK,ispec)
+        ENDDO_LOOP_IJK
 
-      R_xz(i_SLS,ijk,1,1,ispec) = alphaval(i_SLS) * R_xz(i_SLS,ijk,1,1,ispec) + factor_common_c44_muv(ijk,1,1) * &
-          (betaval(i_SLS) * epsilondev_xz(ijk,1,1,ispec) + gammaval(i_SLS) * epsilondev_loc(4,ijk,1,1))
+      endif
 
-      R_yz(i_SLS,ijk,1,1,ispec) = alphaval(i_SLS) * R_yz(i_SLS,ijk,1,1,ispec) + factor_common_c44_muv(ijk,1,1) * &
-          (betaval(i_SLS) * epsilondev_yz(ijk,1,1,ispec) + gammaval(i_SLS) * epsilondev_loc(5,ijk,1,1))
-    enddo
-  enddo ! i_SLS
+    else
 
-#else
-
-  do i_SLS = 1,N_SLS
-    ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
-    if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
       if(ANISOTROPIC_3D_MANTLE_VAL) then
-        factor_common_c44_muv(:,:,:) = factor_common(i_SLS,:,:,:,ispec) * c44store(:,:,:,ispec)
+
+        DO_LOOP_IJK
+          factor_common_c44_muv(INDEX_IJK) = factor_common(1,1,1,i_SLS,ispec) * c44store(INDEX_IJK,ispec)
+        ENDDO_LOOP_IJK
+
       else
-        factor_common_c44_muv(:,:,:) = factor_common(i_SLS,:,:,:,ispec) * muvstore(:,:,:,ispec)
+
+        DO_LOOP_IJK
+          factor_common_c44_muv(INDEX_IJK) = factor_common(1,1,1,i_SLS,ispec) * muvstore(INDEX_IJK,ispec)
+        ENDDO_LOOP_IJK
+
       endif
-    else
-      if(ANISOTROPIC_3D_MANTLE_VAL) then
-        factor_common_c44_muv(:,:,:) = factor_common(i_SLS,1,1,1,ispec) * c44store(:,:,:,ispec)
-      else
-        factor_common_c44_muv(:,:,:) = factor_common(i_SLS,1,1,1,ispec) * muvstore(:,:,:,ispec)
-      endif
     endif
 
-    R_xx(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xx(i_SLS,:,:,:,ispec) + factor_common_c44_muv(:,:,:) * &
-          (betaval(i_SLS) * epsilondev_xx(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(1,:,:,:))
+    ! updates memory variables
+    DO_LOOP_IJK
 
-    R_yy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yy(i_SLS,:,:,:,ispec) + factor_common_c44_muv(:,:,:) * &
-          (betaval(i_SLS) * epsilondev_yy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(2,:,:,:))
+      R_xx(INDEX_IJK,i_SLS,ispec) = alphaval(i_SLS) * R_xx(INDEX_IJK,i_SLS,ispec) + factor_common_c44_muv(INDEX_IJK) * &
+          (betaval(i_SLS) * epsilondev_xx(INDEX_IJK,ispec) + gammaval(i_SLS) * epsilondev_loc(INDEX_IJK,1))
 
-    R_xy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xy(i_SLS,:,:,:,ispec) + factor_common_c44_muv(:,:,:) * &
-          (betaval(i_SLS) * epsilondev_xy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(3,:,:,:))
+      R_yy(INDEX_IJK,i_SLS,ispec) = alphaval(i_SLS) * R_yy(INDEX_IJK,i_SLS,ispec) + factor_common_c44_muv(INDEX_IJK) * &
+          (betaval(i_SLS) * epsilondev_yy(INDEX_IJK,ispec) + gammaval(i_SLS) * epsilondev_loc(INDEX_IJK,2))
 
-    R_xz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xz(i_SLS,:,:,:,ispec) + factor_common_c44_muv(:,:,:) * &
-          (betaval(i_SLS) * epsilondev_xz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(4,:,:,:))
+      R_xy(INDEX_IJK,i_SLS,ispec) = alphaval(i_SLS) * R_xy(INDEX_IJK,i_SLS,ispec) + factor_common_c44_muv(INDEX_IJK) * &
+          (betaval(i_SLS) * epsilondev_xy(INDEX_IJK,ispec) + gammaval(i_SLS) * epsilondev_loc(INDEX_IJK,3))
 
-    R_yz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yz(i_SLS,:,:,:,ispec) + factor_common_c44_muv(:,:,:) * &
-          (betaval(i_SLS) * epsilondev_yz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(5,:,:,:))
+      R_xz(INDEX_IJK,i_SLS,ispec) = alphaval(i_SLS) * R_xz(INDEX_IJK,i_SLS,ispec) + factor_common_c44_muv(INDEX_IJK) * &
+          (betaval(i_SLS) * epsilondev_xz(INDEX_IJK,ispec) + gammaval(i_SLS) * epsilondev_loc(INDEX_IJK,4))
 
-  enddo ! i_SLS
+      R_yz(INDEX_IJK,i_SLS,ispec) = alphaval(i_SLS) * R_yz(INDEX_IJK,i_SLS,ispec) + factor_common_c44_muv(INDEX_IJK) * &
+          (betaval(i_SLS) * epsilondev_yz(INDEX_IJK,ispec) + gammaval(i_SLS) * epsilondev_loc(INDEX_IJK,5))
 
-#endif
+    ENDDO_LOOP_IJK
 
+  enddo ! i_SLS
+
   end subroutine compute_element_att_memory_cm
 
 
@@ -189,7 +174,6 @@
                                                  R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk, &
                                                  vx,vy,vz,vnspec,factor_common, &
                                                  c44store,muvstore, &
-!                                                epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
                                                  epsilondev_loc, &
                                                  deltat)
 ! crust mantle
@@ -221,30 +205,33 @@
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION) :: &
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION) :: &
     R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk
 
   ! variable sized array variables
   integer :: vx,vy,vz,vnspec
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS,vx,vy,vz,vnspec) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(vx,vy,vz,N_SLS,vnspec) :: factor_common
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_MANTLE) :: c44store
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: muvstore
 
-! real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
-!   epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
-
   real(kind=CUSTOM_REAL) :: deltat
 
-! local parameters
+  ! local parameters
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: factor_common_c44_muv
   integer :: i_SLS
 
+#ifdef FORCE_VECTORIZATION
+  integer :: ijk
+#else
+  integer :: i,j,k
+#endif
+
   ! use Runge-Kutta scheme to march in time
 
   ! get coefficients for that standard linear solid
@@ -256,44 +243,61 @@
     ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
     if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
       if(ANISOTROPIC_3D_MANTLE_VAL) then
-        factor_common_c44_muv(:,:,:) = factor_common(i_SLS,:,:,:,ispec) * c44store(:,:,:,ispec)
+
+        DO_LOOP_IJK
+          factor_common_c44_muv(INDEX_IJK) = factor_common(INDEX_IJK,i_SLS,ispec) * c44store(INDEX_IJK,ispec)
+        ENDDO_LOOP_IJK
       else
-        factor_common_c44_muv(:,:,:) = factor_common(i_SLS,:,:,:,ispec) * muvstore(:,:,:,ispec)
+        DO_LOOP_IJK
+          factor_common_c44_muv(INDEX_IJK) = factor_common(INDEX_IJK,i_SLS,ispec) * muvstore(INDEX_IJK,ispec)
+        ENDDO_LOOP_IJK
       endif
+
     else
+
       if(ANISOTROPIC_3D_MANTLE_VAL) then
-        factor_common_c44_muv(:,:,:) = factor_common(i_SLS,1,1,1,ispec) * c44store(:,:,:,ispec)
+        DO_LOOP_IJK
+          factor_common_c44_muv(INDEX_IJK) = factor_common(1,1,1,i_SLS,ispec) * c44store(INDEX_IJK,ispec)
+        ENDDO_LOOP_IJK
       else
-        factor_common_c44_muv(:,:,:) = factor_common(i_SLS,1,1,1,ispec) * muvstore(:,:,:,ispec)
+        DO_LOOP_IJK
+          factor_common_c44_muv(INDEX_IJK) = factor_common(1,1,1,i_SLS,ispec) * muvstore(INDEX_IJK,ispec)
+        ENDDO_LOOP_IJK
       endif
+
     endif
 
-    R_xx_lddrk(i_SLS,:,:,:,ispec) = ALPHA_LDDRK(istage) * R_xx_lddrk(i_SLS,:,:,:,ispec) &
-                     + deltat * ( factor_common_c44_muv(:,:,:) * epsilondev_loc(1,:,:,:) &
-                                  - R_xx(i_SLS,:,:,:,ispec)*(1._CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
+    ! updates memory variables
+    DO_LOOP_IJK
 
-    R_yy_lddrk(i_SLS,:,:,:,ispec) = ALPHA_LDDRK(istage) * R_yy_lddrk(i_SLS,:,:,:,ispec) &
-                     + deltat * ( factor_common_c44_muv(:,:,:) * epsilondev_loc(2,:,:,:) &
-                                  - R_yy(i_SLS,:,:,:,ispec)*(1._CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
+      R_xx_lddrk(INDEX_IJK,i_SLS,ispec) = ALPHA_LDDRK(istage) * R_xx_lddrk(INDEX_IJK,i_SLS,ispec) &
+                       + deltat * ( factor_common_c44_muv(INDEX_IJK) * epsilondev_loc(INDEX_IJK,1) &
+                                    - R_xx(INDEX_IJK,i_SLS,ispec)*(1._CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
 
-    R_xy_lddrk(i_SLS,:,:,:,ispec) = ALPHA_LDDRK(istage) * R_xy_lddrk(i_SLS,:,:,:,ispec) &
-                     + deltat * ( factor_common_c44_muv(:,:,:) * epsilondev_loc(3,:,:,:) &
-                                  - R_xy(i_SLS,:,:,:,ispec)*(1._CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
+      R_yy_lddrk(INDEX_IJK,i_SLS,ispec) = ALPHA_LDDRK(istage) * R_yy_lddrk(INDEX_IJK,i_SLS,ispec) &
+                       + deltat * ( factor_common_c44_muv(INDEX_IJK) * epsilondev_loc(INDEX_IJK,2) &
+                                    - R_yy(INDEX_IJK,i_SLS,ispec)*(1._CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
 
-    R_xz_lddrk(i_SLS,:,:,:,ispec) = ALPHA_LDDRK(istage) * R_xz_lddrk(i_SLS,:,:,:,ispec) &
-                     + deltat * ( factor_common_c44_muv(:,:,:) * epsilondev_loc(4,:,:,:) &
-                                  - R_xz(i_SLS,:,:,:,ispec)*(1._CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
+      R_xy_lddrk(INDEX_IJK,i_SLS,ispec) = ALPHA_LDDRK(istage) * R_xy_lddrk(INDEX_IJK,i_SLS,ispec) &
+                       + deltat * ( factor_common_c44_muv(INDEX_IJK) * epsilondev_loc(INDEX_IJK,3) &
+                                    - R_xy(INDEX_IJK,i_SLS,ispec)*(1._CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
 
-    R_yz_lddrk(i_SLS,:,:,:,ispec) = ALPHA_LDDRK(istage) * R_yz_lddrk(i_SLS,:,:,:,ispec) &
-                     + deltat * ( factor_common_c44_muv(:,:,:) * epsilondev_loc(5,:,:,:) &
-                                  - R_yz(i_SLS,:,:,:,ispec)*(1._CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
+      R_xz_lddrk(INDEX_IJK,i_SLS,ispec) = ALPHA_LDDRK(istage) * R_xz_lddrk(INDEX_IJK,i_SLS,ispec) &
+                       + deltat * ( factor_common_c44_muv(INDEX_IJK) * epsilondev_loc(INDEX_IJK,4) &
+                                    - R_xz(INDEX_IJK,i_SLS,ispec)*(1._CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
 
-    R_xx(i_SLS,:,:,:,ispec) = R_xx(i_SLS,:,:,:,ispec) + BETA_LDDRK(istage) * R_xx_lddrk(i_SLS,:,:,:,ispec)
-    R_yy(i_SLS,:,:,:,ispec) = R_yy(i_SLS,:,:,:,ispec) + BETA_LDDRK(istage) * R_yy_lddrk(i_SLS,:,:,:,ispec)
-    R_xy(i_SLS,:,:,:,ispec) = R_xy(i_SLS,:,:,:,ispec) + BETA_LDDRK(istage) * R_xy_lddrk(i_SLS,:,:,:,ispec)
-    R_xz(i_SLS,:,:,:,ispec) = R_xz(i_SLS,:,:,:,ispec) + BETA_LDDRK(istage) * R_xz_lddrk(i_SLS,:,:,:,ispec)
-    R_yz(i_SLS,:,:,:,ispec) = R_yz(i_SLS,:,:,:,ispec) + BETA_LDDRK(istage) * R_yz_lddrk(i_SLS,:,:,:,ispec)
+      R_yz_lddrk(INDEX_IJK,i_SLS,ispec) = ALPHA_LDDRK(istage) * R_yz_lddrk(INDEX_IJK,i_SLS,ispec) &
+                       + deltat * ( factor_common_c44_muv(INDEX_IJK) * epsilondev_loc(INDEX_IJK,5) &
+                                    - R_yz(INDEX_IJK,i_SLS,ispec)*(1._CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
 
+      R_xx(INDEX_IJK,i_SLS,ispec) = R_xx(INDEX_IJK,i_SLS,ispec) + BETA_LDDRK(istage) * R_xx_lddrk(INDEX_IJK,i_SLS,ispec)
+      R_yy(INDEX_IJK,i_SLS,ispec) = R_yy(INDEX_IJK,i_SLS,ispec) + BETA_LDDRK(istage) * R_yy_lddrk(INDEX_IJK,i_SLS,ispec)
+      R_xy(INDEX_IJK,i_SLS,ispec) = R_xy(INDEX_IJK,i_SLS,ispec) + BETA_LDDRK(istage) * R_xy_lddrk(INDEX_IJK,i_SLS,ispec)
+      R_xz(INDEX_IJK,i_SLS,ispec) = R_xz(INDEX_IJK,i_SLS,ispec) + BETA_LDDRK(istage) * R_xz_lddrk(INDEX_IJK,i_SLS,ispec)
+      R_yz(INDEX_IJK,i_SLS,ispec) = R_yz(INDEX_IJK,i_SLS,ispec) + BETA_LDDRK(istage) * R_yz_lddrk(INDEX_IJK,i_SLS,ispec)
+
+    ENDDO_LOOP_IJK
+
   enddo ! i_SLS
 
   end subroutine compute_element_att_memory_cm_lddrk
@@ -337,13 +341,12 @@
   ! element id
   integer :: ispec
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: &
-    R_xx,R_yy,R_xy,R_xz,R_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
 
   ! variable sized array variables
   integer :: vx,vy,vz,vnspec
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS,vx,vy,vz,vnspec) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(vx,vy,vz,N_SLS,vnspec) :: factor_common
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: muvstore
@@ -352,7 +355,7 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
     epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc
 
 ! local parameters
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: factor_common_use
@@ -361,6 +364,8 @@
 
 #ifdef FORCE_VECTORIZATION
   integer :: ijk
+#else
+  integer :: i,j,k
 #endif
 
   ! use Runge-Kutta scheme to march in time
@@ -372,66 +377,45 @@
   ! note: epsilondev_loc is calculated based on displ( n + 1 ), thus corresponds to strain at time (n + 1)
   !       epsilondev_xx,.. are stored from previous step, thus corresponds now to strain at time n
 
-#ifdef FORCE_VECTORIZATION
-
   do i_SLS = 1,N_SLS
+
     ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
     if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-      do ijk=1,NGLLCUBE
-        factor_common_use(ijk,1,1) = factor_common(i_SLS,ijk,1,1,ispec) * muvstore(ijk,1,1,ispec)
-      enddo
-    else
-      do ijk=1,NGLLCUBE
-        factor_common_use(ijk,1,1) = factor_common(i_SLS,1,1,1,ispec) * muvstore(ijk,1,1,ispec)
-      enddo
-    endif
 
-    do ijk=1,NGLLCUBE
-      R_xx(i_SLS,ijk,1,1,ispec) = alphaval(i_SLS) * R_xx(i_SLS,ijk,1,1,ispec) + factor_common_use(ijk,1,1) * &
-           (betaval(i_SLS) * epsilondev_xx(ijk,1,1,ispec) + gammaval(i_SLS) * epsilondev_loc(1,ijk,1,1))
+      DO_LOOP_IJK
+        factor_common_use(INDEX_IJK) = factor_common(INDEX_IJK,i_SLS,ispec) * muvstore(INDEX_IJK,ispec)
+      ENDDO_LOOP_IJK
 
-      R_yy(i_SLS,ijk,1,1,ispec) = alphaval(i_SLS) * R_yy(i_SLS,ijk,1,1,ispec) + factor_common_use(ijk,1,1) * &
-           (betaval(i_SLS) * epsilondev_yy(ijk,1,1,ispec) + gammaval(i_SLS) * epsilondev_loc(2,ijk,1,1))
+    else
 
-      R_xy(i_SLS,ijk,1,1,ispec) = alphaval(i_SLS) * R_xy(i_SLS,ijk,1,1,ispec) + factor_common_use(ijk,1,1) * &
-           (betaval(i_SLS) * epsilondev_xy(ijk,1,1,ispec) + gammaval(i_SLS) * epsilondev_loc(3,ijk,1,1))
+      DO_LOOP_IJK
+        factor_common_use(INDEX_IJK) = factor_common(1,1,1,i_SLS,ispec) * muvstore(INDEX_IJK,ispec)
+      ENDDO_LOOP_IJK
 
-      R_xz(i_SLS,ijk,1,1,ispec) = alphaval(i_SLS) * R_xz(i_SLS,ijk,1,1,ispec) + factor_common_use(ijk,1,1) * &
-           (betaval(i_SLS) * epsilondev_xz(ijk,1,1,ispec) + gammaval(i_SLS) * epsilondev_loc(4,ijk,1,1))
+    endif
 
-      R_yz(i_SLS,ijk,1,1,ispec) = alphaval(i_SLS) * R_yz(i_SLS,ijk,1,1,ispec) + factor_common_use(ijk,1,1) * &
-           (betaval(i_SLS) * epsilondev_yz(ijk,1,1,ispec) + gammaval(i_SLS) * epsilondev_loc(5,ijk,1,1))
-    enddo
-  enddo
+    ! updates memory variables
+    DO_LOOP_IJK
 
-#else
+      R_xx(INDEX_IJK,i_SLS,ispec) = alphaval(i_SLS) * R_xx(INDEX_IJK,i_SLS,ispec) + factor_common_use(INDEX_IJK) * &
+           (betaval(i_SLS) * epsilondev_xx(INDEX_IJK,ispec) + gammaval(i_SLS) * epsilondev_loc(INDEX_IJK,1))
 
-  do i_SLS = 1,N_SLS
-    ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
-    if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-      factor_common_use(:,:,:) = factor_common(i_SLS,:,:,:,ispec) * muvstore(:,:,:,ispec)
-    else
-      factor_common_use(:,:,:) = factor_common(i_SLS,1,1,1,ispec) * muvstore(:,:,:,ispec)
-    endif
+      R_yy(INDEX_IJK,i_SLS,ispec) = alphaval(i_SLS) * R_yy(INDEX_IJK,i_SLS,ispec) + factor_common_use(INDEX_IJK) * &
+           (betaval(i_SLS) * epsilondev_yy(INDEX_IJK,ispec) + gammaval(i_SLS) * epsilondev_loc(INDEX_IJK,2))
 
-    R_xx(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xx(i_SLS,:,:,:,ispec) + factor_common_use(:,:,:) * &
-         (betaval(i_SLS) * epsilondev_xx(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(1,:,:,:))
+      R_xy(INDEX_IJK,i_SLS,ispec) = alphaval(i_SLS) * R_xy(INDEX_IJK,i_SLS,ispec) + factor_common_use(INDEX_IJK) * &
+           (betaval(i_SLS) * epsilondev_xy(INDEX_IJK,ispec) + gammaval(i_SLS) * epsilondev_loc(INDEX_IJK,3))
 
-    R_yy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yy(i_SLS,:,:,:,ispec) + factor_common_use(:,:,:) * &
-         (betaval(i_SLS) * epsilondev_yy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(2,:,:,:))
+      R_xz(INDEX_IJK,i_SLS,ispec) = alphaval(i_SLS) * R_xz(INDEX_IJK,i_SLS,ispec) + factor_common_use(INDEX_IJK) * &
+           (betaval(i_SLS) * epsilondev_xz(INDEX_IJK,ispec) + gammaval(i_SLS) * epsilondev_loc(INDEX_IJK,4))
 
-    R_xy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xy(i_SLS,:,:,:,ispec) + factor_common_use(:,:,:) * &
-         (betaval(i_SLS) * epsilondev_xy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(3,:,:,:))
+      R_yz(INDEX_IJK,i_SLS,ispec) = alphaval(i_SLS) * R_yz(INDEX_IJK,i_SLS,ispec) + factor_common_use(INDEX_IJK) * &
+           (betaval(i_SLS) * epsilondev_yz(INDEX_IJK,ispec) + gammaval(i_SLS) * epsilondev_loc(INDEX_IJK,5))
 
-    R_xz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xz(i_SLS,:,:,:,ispec) + factor_common_use(:,:,:) * &
-         (betaval(i_SLS) * epsilondev_xz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(4,:,:,:))
+    ENDDO_LOOP_IJK
 
-    R_yz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yz(i_SLS,:,:,:,ispec) + factor_common_use(:,:,:) * &
-         (betaval(i_SLS) * epsilondev_yz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(5,:,:,:))
-  enddo
+  enddo ! N_SLS
 
-#endif
-
   end subroutine compute_element_att_memory_ic
 
 
@@ -443,7 +427,6 @@
                                                  R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk, &
                                                  vx,vy,vz,vnspec,factor_common, &
                                                  muvstore, &
-!                                                epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
                                                  epsilondev_loc, &
                                                  deltat)
 ! inner core
@@ -471,32 +454,33 @@
   ! element id
   integer :: ispec
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: &
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_ATTENUATION) :: &
     R_xx,R_yy,R_xy,R_xz,R_yz
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: &
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_ATTENUATION) :: &
     R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk
 
   ! variable sized array variables
   integer :: vx,vy,vz,vnspec
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS,vx,vy,vz,vnspec) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(vx,vy,vz,N_SLS,vnspec) :: factor_common
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: muvstore
 
-! real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilondev
-! real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
-!   epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
-
   real(kind=CUSTOM_REAL) :: deltat
 
-! local parameters
+  ! local parameters
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: factor_common_use
 
   integer :: i_SLS
+
+#ifdef FORCE_VECTORIZATION
+  integer :: ijk
+#else
   integer :: i,j,k
+#endif
 
   ! use Runge-Kutta scheme to march in time
 
@@ -511,43 +495,46 @@
 
     ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
     if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-      factor_common_use(:,:,:) = factor_common(i_SLS,:,:,:,ispec) * muvstore(:,:,:,ispec)
+      DO_LOOP_IJK
+        factor_common_use(INDEX_IJK) = factor_common(INDEX_IJK,i_SLS,ispec) * muvstore(INDEX_IJK,ispec)
+      ENDDO_LOOP_IJK
     else
-      factor_common_use(:,:,:) = factor_common(i_SLS,1,1,1,ispec) * muvstore(:,:,:,ispec)
+      DO_LOOP_IJK
+        factor_common_use(INDEX_IJK) = factor_common(1,1,1,i_SLS,ispec) * muvstore(INDEX_IJK,ispec)
+      ENDDO_LOOP_IJK
     endif
 
-    do k = 1,NGLLZ
-      do j = 1,NGLLY
-        do i = 1,NGLLX
-          R_xx_lddrk(i_SLS,i,j,k,ispec) = ALPHA_LDDRK(istage) * R_xx_lddrk(i_SLS,i,j,k,ispec) &
-            + deltat * ( factor_common_use(i,j,k)*epsilondev_loc(1,i,j,k) &
-                         - R_xx(i_SLS,i,j,k,ispec)*(1.0_CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
+    ! updates memory variables
+    DO_LOOP_IJK
 
-          R_yy_lddrk(i_SLS,i,j,k,ispec) = ALPHA_LDDRK(istage) * R_yy_lddrk(i_SLS,i,j,k,ispec) &
-            + deltat * ( factor_common_use(i,j,k)*epsilondev_loc(2,i,j,k) &
-                         - R_yy(i_SLS,i,j,k,ispec)*(1.0_CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
+      R_xx_lddrk(INDEX_IJK,i_SLS,ispec) = ALPHA_LDDRK(istage) * R_xx_lddrk(INDEX_IJK,i_SLS,ispec) &
+        + deltat * ( factor_common_use(INDEX_IJK)*epsilondev_loc(INDEX_IJK,1) &
+                     - R_xx(INDEX_IJK,i_SLS,ispec)*(1.0_CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
 
-          R_xy_lddrk(i_SLS,i,j,k,ispec) = ALPHA_LDDRK(istage) * R_xy_lddrk(i_SLS,i,j,k,ispec) &
-            + deltat * ( factor_common_use(i,j,k)*epsilondev_loc(3,i,j,k) &
-                         - R_xy(i_SLS,i,j,k,ispec)*(1.0_CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
+      R_yy_lddrk(INDEX_IJK,i_SLS,ispec) = ALPHA_LDDRK(istage) * R_yy_lddrk(INDEX_IJK,i_SLS,ispec) &
+        + deltat * ( factor_common_use(INDEX_IJK)*epsilondev_loc(INDEX_IJK,2) &
+                     - R_yy(INDEX_IJK,i_SLS,ispec)*(1.0_CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
 
-          R_xz_lddrk(i_SLS,i,j,k,ispec) = ALPHA_LDDRK(istage) * R_xz_lddrk(i_SLS,i,j,k,ispec) &
-            + deltat * ( factor_common_use(i,j,k)*epsilondev_loc(4,i,j,k) &
-                         - R_xz(i_SLS,i,j,k,ispec)*(1.0_CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
+      R_xy_lddrk(INDEX_IJK,i_SLS,ispec) = ALPHA_LDDRK(istage) * R_xy_lddrk(INDEX_IJK,i_SLS,ispec) &
+        + deltat * ( factor_common_use(INDEX_IJK)*epsilondev_loc(INDEX_IJK,3) &
+                     - R_xy(INDEX_IJK,i_SLS,ispec)*(1.0_CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
 
-          R_yz_lddrk(i_SLS,i,j,k,ispec) = ALPHA_LDDRK(istage) * R_yz_lddrk(i_SLS,i,j,k,ispec) &
-            + deltat * ( factor_common_use(i,j,k)*epsilondev_loc(5,i,j,k) &
-                         - R_yz(i_SLS,i,j,k,ispec)*(1.0_CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
+      R_xz_lddrk(INDEX_IJK,i_SLS,ispec) = ALPHA_LDDRK(istage) * R_xz_lddrk(INDEX_IJK,i_SLS,ispec) &
+        + deltat * ( factor_common_use(INDEX_IJK)*epsilondev_loc(INDEX_IJK,4) &
+                     - R_xz(INDEX_IJK,i_SLS,ispec)*(1.0_CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
 
-          R_xx(i_SLS,i,j,k,ispec) = R_xx(i_SLS,i,j,k,ispec) + BETA_LDDRK(istage) * R_xx_lddrk(i_SLS,i,j,k,ispec)
-          R_yy(i_SLS,i,j,k,ispec) = R_yy(i_SLS,i,j,k,ispec) + BETA_LDDRK(istage) * R_yy_lddrk(i_SLS,i,j,k,ispec)
-          R_xy(i_SLS,i,j,k,ispec) = R_xy(i_SLS,i,j,k,ispec) + BETA_LDDRK(istage) * R_xy_lddrk(i_SLS,i,j,k,ispec)
-          R_xz(i_SLS,i,j,k,ispec) = R_xz(i_SLS,i,j,k,ispec) + BETA_LDDRK(istage) * R_xz_lddrk(i_SLS,i,j,k,ispec)
-          R_yz(i_SLS,i,j,k,ispec) = R_yz(i_SLS,i,j,k,ispec) + BETA_LDDRK(istage) * R_yz_lddrk(i_SLS,i,j,k,ispec)
-        enddo
-      enddo
-    enddo
+      R_yz_lddrk(INDEX_IJK,i_SLS,ispec) = ALPHA_LDDRK(istage) * R_yz_lddrk(INDEX_IJK,i_SLS,ispec) &
+        + deltat * ( factor_common_use(INDEX_IJK)*epsilondev_loc(INDEX_IJK,5) &
+                     - R_yz(INDEX_IJK,i_SLS,ispec)*(1.0_CUSTOM_REAL/tau_sigma_CUSTOM_REAL(i_SLS)) )
 
+      R_xx(INDEX_IJK,i_SLS,ispec) = R_xx(INDEX_IJK,i_SLS,ispec) + BETA_LDDRK(istage) * R_xx_lddrk(INDEX_IJK,i_SLS,ispec)
+      R_yy(INDEX_IJK,i_SLS,ispec) = R_yy(INDEX_IJK,i_SLS,ispec) + BETA_LDDRK(istage) * R_yy_lddrk(INDEX_IJK,i_SLS,ispec)
+      R_xy(INDEX_IJK,i_SLS,ispec) = R_xy(INDEX_IJK,i_SLS,ispec) + BETA_LDDRK(istage) * R_xy_lddrk(INDEX_IJK,i_SLS,ispec)
+      R_xz(INDEX_IJK,i_SLS,ispec) = R_xz(INDEX_IJK,i_SLS,ispec) + BETA_LDDRK(istage) * R_xz_lddrk(INDEX_IJK,i_SLS,ispec)
+      R_yz(INDEX_IJK,i_SLS,ispec) = R_yz(INDEX_IJK,i_SLS,ispec) + BETA_LDDRK(istage) * R_yz_lddrk(INDEX_IJK,i_SLS,ispec)
+
+    ENDDO_LOOP_IJK
+
   enddo
 
   end subroutine compute_element_att_memory_ic_lddrk
@@ -637,9 +624,9 @@
 !
 !    ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
 !    if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-!      factor_common_c44_muv = factor_common(i_SLS,i,j,k,ispec) * c44_muv
+!      factor_common_c44_muv = factor_common(i,j,k,i_SLS,ispec) * c44_muv
 !    else
-!      factor_common_c44_muv = factor_common(i_SLS,1,1,1,ispec) * c44_muv
+!      factor_common_c44_muv = factor_common(1,1,1,i_SLS,ispec) * c44_muv
 !    endif
 !
 !    ! adds contributions from current strain

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_strain.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_strain.F90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_strain.F90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -53,50 +53,27 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec),intent(in) :: &
         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ),intent(out) :: epsilondev_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5),intent(out) :: epsilondev_loc
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ),intent(out) :: eps_trace_over_3_loc
 
-!  local variable
+  !  local variable
   integer :: iglob
-  integer :: i,j,k
 
   real(kind=CUSTOM_REAL) :: templ
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
-  real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1_m1_m2_5points,B2_m1_m2_5points,B3_m1_m2_5points
 
-  equivalence(dummyx_loc,B1_m1_m2_5points)
-  equivalence(dummyy_loc,B2_m1_m2_5points)
-  equivalence(dummyz_loc,B3_m1_m2_5points)
+  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl
+  real(kind=CUSTOM_REAL) :: duxdxl,duydyl,duzdzl,duxdyl,duydxl,duzdxl,duxdzl,duzdyl,duydzl, &
+                            duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl, &
+                            duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
 
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points,C2_m1_m2_5points,C3_m1_m2_5points
-
-  equivalence(tempx1,C1_m1_m2_5points)
-  equivalence(tempy1,C2_m1_m2_5points)
-  equivalence(tempz1,C3_m1_m2_5points)
-
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
-    C1_mxm_m2_m1_5points,C2_mxm_m2_m1_5points,C3_mxm_m2_m1_5points
-
-  equivalence(tempx3,C1_mxm_m2_m1_5points)
-  equivalence(tempy3,C2_mxm_m2_m1_5points)
-  equivalence(tempz3,C3_mxm_m2_m1_5points)
-
-  real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: &
-    A1_mxm_m2_m1_5points,A2_mxm_m2_m1_5points,A3_mxm_m2_m1_5points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_5points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_5points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_5points)
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duydyl,duzdzl,duxdyl,duydxl,duzdxl,duxdzl,duzdyl,duydzl,&
-                         duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,&
-                         duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
 #ifdef FORCE_VECTORIZATION
   integer :: ijk
+#else
+  integer :: i,j,k
 #endif
 
   DO_LOOP_IJK
@@ -109,75 +86,13 @@
   ENDDO_LOOP_IJK
 
   ! deville optimizations
-  do j=1,m2
-    do i=1,m1
-      C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
-                              hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
-                              hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
-                              hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
-                              hprime_xx(i,5)*B1_m1_m2_5points(5,j)
+  ! computes 1. matrix multiplication for tempx1,..
+  call mxm5_3comp_singleA(hprime_xx,m1,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1,m2)
+  ! computes 2. matrix multiplication for tempx2,..
+  call mxm5_3comp_3dmat_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m1,hprime_xxT,m1,tempx2,tempy2,tempz2,NGLLX)
+  ! computes 3. matrix multiplication for tempx1,..
+  call mxm5_3comp_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m2,hprime_xxT,tempx3,tempy3,tempz3,m1)
 
-      C2_m1_m2_5points(i,j) = hprime_xx(i,1)*B2_m1_m2_5points(1,j) + &
-                              hprime_xx(i,2)*B2_m1_m2_5points(2,j) + &
-                              hprime_xx(i,3)*B2_m1_m2_5points(3,j) + &
-                              hprime_xx(i,4)*B2_m1_m2_5points(4,j) + &
-                              hprime_xx(i,5)*B2_m1_m2_5points(5,j)
-
-      C3_m1_m2_5points(i,j) = hprime_xx(i,1)*B3_m1_m2_5points(1,j) + &
-                              hprime_xx(i,2)*B3_m1_m2_5points(2,j) + &
-                              hprime_xx(i,3)*B3_m1_m2_5points(3,j) + &
-                              hprime_xx(i,4)*B3_m1_m2_5points(4,j) + &
-                              hprime_xx(i,5)*B3_m1_m2_5points(5,j)
-    enddo
-  enddo
-
-  do j=1,m1
-    do i=1,m1
-      ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-      do k = 1,NGLLX
-        tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                        dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                        dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                        dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                        dummyx_loc(i,5,k)*hprime_xxT(5,j)
-
-        tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                        dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                        dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                        dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                        dummyy_loc(i,5,k)*hprime_xxT(5,j)
-
-        tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                        dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                        dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                        dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                        dummyz_loc(i,5,k)*hprime_xxT(5,j)
-      enddo
-    enddo
-  enddo
-
-  do j=1,m1
-    do i=1,m2
-      C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                  A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                  A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                  A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                  A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-
-      C2_mxm_m2_m1_5points(i,j) = A2_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                  A2_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                  A2_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                  A2_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                  A2_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-
-      C3_mxm_m2_m1_5points(i,j) = A3_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                  A3_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                  A3_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                  A3_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                  A3_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-    enddo
-  enddo
-
   DO_LOOP_IJK
 
     ! get derivatives of ux, uy and uz with respect to x, y and z
@@ -191,11 +106,6 @@
     gammayl = gammay(INDEX_IJK,ispec)
     gammazl = gammaz(INDEX_IJK,ispec)
 
-    ! compute the jacobian
-    jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                  - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                  + xizl*(etaxl*gammayl-etayl*gammaxl))
-
     duxdxl = xixl*tempx1(INDEX_IJK) + etaxl*tempx2(INDEX_IJK) + gammaxl*tempx3(INDEX_IJK)
     duxdyl = xiyl*tempx1(INDEX_IJK) + etayl*tempx2(INDEX_IJK) + gammayl*tempx3(INDEX_IJK)
     duxdzl = xizl*tempx1(INDEX_IJK) + etazl*tempx2(INDEX_IJK) + gammazl*tempx3(INDEX_IJK)
@@ -219,14 +129,164 @@
     ! strains
     templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
     eps_trace_over_3_loc(INDEX_IJK) = templ
-    epsilondev_loc(1,INDEX_IJK) = duxdxl - templ
-    epsilondev_loc(2,INDEX_IJK) = duydyl - templ
-    epsilondev_loc(3,INDEX_IJK) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-    epsilondev_loc(4,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-    epsilondev_loc(5,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+    epsilondev_loc(INDEX_IJK,1) = duxdxl - templ
+    epsilondev_loc(INDEX_IJK,2) = duydyl - templ
+    epsilondev_loc(INDEX_IJK,3) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+    epsilondev_loc(INDEX_IJK,4) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+    epsilondev_loc(INDEX_IJK,5) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
 
   ENDDO_LOOP_IJK
 
+  contains
+
+!--------------------------------------------------------------------------------------------
+!
+! matrix-matrix multiplications
+!
+! subroutines adapted from Deville, Fischer and Mund, High-order methods
+! for incompressible fluid flow, Cambridge University Press (2002),
+! pages 386 and 389 and Figure 8.3.1
+!
+!--------------------------------------------------------------------------------------------
+!
+! note: the matrix-matrix multiplications are used for very small matrices ( 5 x 5 x 5 elements);
+!       thus, calling external optimized libraries for these multiplications are in general slower
+!
+! please leave the routines here to help compilers inlining the code
+
+  subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5) :: A
+  real(kind=CUSTOM_REAL),dimension(5,n3) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A(i,1) * B1(1,j) &
+               + A(i,2) * B1(2,j) &
+               + A(i,3) * B1(3,j) &
+               + A(i,4) * B1(4,j) &
+               + A(i,5) * B1(5,j)
+
+      C2(i,j) =  A(i,1) * B2(1,j) &
+               + A(i,2) * B2(2,j) &
+               + A(i,3) * B2(3,j) &
+               + A(i,4) * B2(4,j) &
+               + A(i,5) * B2(5,j)
+
+      C3(i,j) =  A(i,1) * B3(1,j) &
+               + A(i,2) * B3(2,j) &
+               + A(i,3) * B3(3,j) &
+               + A(i,4) * B3(4,j) &
+               + A(i,5) * B3(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_singleA
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(5,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A1(i,1) * B(1,j) &
+               + A1(i,2) * B(2,j) &
+               + A1(i,3) * B(3,j) &
+               + A1(i,4) * B(4,j) &
+               + A1(i,5) * B(5,j)
+
+      C2(i,j) =  A2(i,1) * B(1,j) &
+               + A2(i,2) * B(2,j) &
+               + A2(i,3) * B(3,j) &
+               + A2(i,4) * B(4,j) &
+               + A2(i,5) * B(5,j)
+
+      C3(i,j) =  A3(i,1) * B(1,j) &
+               + A3(i,2) * B(2,j) &
+               + A3(i,3) * B(3,j) &
+               + A3(i,4) * B(4,j) &
+               + A3(i,5) * B(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_singleB
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 3-dimensional arrays (5,5,5), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n2,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5,n3) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(5,n2) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n2,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j,k
+
+  ! matrix-matrix multiplication
+  do j=1,n2
+    do i=1,n1
+      ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+      do k=1,n3
+        C1(i,j,k) =  A1(i,1,k) * B(1,j) &
+                   + A1(i,2,k) * B(2,j) &
+                   + A1(i,3,k) * B(3,j) &
+                   + A1(i,4,k) * B(4,j) &
+                   + A1(i,5,k) * B(5,j)
+
+        C2(i,j,k) =  A2(i,1,k) * B(1,j) &
+                   + A2(i,2,k) * B(2,j) &
+                   + A2(i,3,k) * B(3,j) &
+                   + A2(i,4,k) * B(4,j) &
+                   + A2(i,5,k) * B(5,j)
+
+        C3(i,j,k) =  A3(i,1,k) * B(1,j) &
+                   + A3(i,2,k) * B(2,j) &
+                   + A3(i,3,k) * B(3,j) &
+                   + A3(i,4,k) * B(4,j) &
+                   + A3(i,5,k) * B(5,j)
+      enddo
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_3dmat_singleB
+
+
   end subroutine compute_element_strain_undo_att_Dev
 
 
@@ -264,7 +324,7 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC),intent(in) :: &
     xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ),intent(out) :: epsilondev_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5),intent(out) :: epsilondev_loc
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ),intent(out) :: eps_trace_over_3_loc
 
   ! local parameters
@@ -276,7 +336,7 @@
   real(kind=CUSTOM_REAL) tempz1l,tempz2l,tempz3l
 
   real(kind=CUSTOM_REAL) hp1,hp2,hp3
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl
   real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
   real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
   real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
@@ -321,7 +381,7 @@
           tempz3l = tempz3l + displ(3,iglob)*hp3
         enddo
 
-!         get derivatives of ux, uy and uz with respect to x, y and z
+        ! get derivatives of ux, uy and uz with respect to x, y and z
         xixl = xix(i,j,k,ispec)
         xiyl = xiy(i,j,k,ispec)
         xizl = xiz(i,j,k,ispec)
@@ -332,10 +392,10 @@
         gammayl = gammay(i,j,k,ispec)
         gammazl = gammaz(i,j,k,ispec)
 
-! compute the jacobian
-        jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                      - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                      + xizl*(etaxl*gammayl-etayl*gammaxl))
+        ! compute the jacobian
+        !jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
+        !              - xiyl*(etaxl*gammazl-etazl*gammaxl) &
+        !              + xizl*(etaxl*gammayl-etayl*gammaxl))
 
         duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
         duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
@@ -349,7 +409,7 @@
         duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
         duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
 
-! precompute some sums to save CPU time
+        ! precompute some sums to save CPU time
         duxdxl_plus_duydyl = duxdxl + duydyl
         duxdxl_plus_duzdzl = duxdxl + duzdzl
         duydyl_plus_duzdzl = duydyl + duzdzl
@@ -357,13 +417,13 @@
         duzdxl_plus_duxdzl = duzdxl + duxdzl
         duzdyl_plus_duydzl = duzdyl + duydzl
 
-! compute deviatoric strain
+        ! compute deviatoric strain
         eps_trace_over_3_loc(i,j,k) = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-        epsilondev_loc(1,i,j,k) = duxdxl - eps_trace_over_3_loc(i,j,k)
-        epsilondev_loc(2,i,j,k) = duydyl - eps_trace_over_3_loc(i,j,k)
-        epsilondev_loc(3,i,j,k) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-        epsilondev_loc(4,i,j,k) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-        epsilondev_loc(5,i,j,k) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+        epsilondev_loc(i,j,k,1) = duxdxl - eps_trace_over_3_loc(i,j,k)
+        epsilondev_loc(i,j,k,2) = duydyl - eps_trace_over_3_loc(i,j,k)
+        epsilondev_loc(i,j,k,3) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+        epsilondev_loc(i,j,k,4) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+        epsilondev_loc(i,j,k,5) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
 
       enddo ! NGLLX
     enddo ! NGLLY
@@ -379,7 +439,7 @@
 !--------------------------------------------------------------------------------------------
 
 
- subroutine compute_element_strain_att_Dev(ispec,nglob,nspec, &
+  subroutine compute_element_strain_att_Dev(ispec,nglob,nspec, &
                                            displ,veloc,deltat, &
                                            ibool, &
                                            hprime_xx,hprime_xxT,&
@@ -416,38 +476,22 @@
 
   ! local variable
   integer :: iglob
-  integer :: i,j,k
 
   real(kind=CUSTOM_REAL) :: templ
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
-  real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1_m1_m2_5points,B2_m1_m2_5points,B3_m1_m2_5points
-  equivalence(dummyx_loc,B1_m1_m2_5points)
-  equivalence(dummyy_loc,B2_m1_m2_5points)
-  equivalence(dummyz_loc,B3_m1_m2_5points)
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points,C2_m1_m2_5points,C3_m1_m2_5points
-  equivalence(tempx1,C1_m1_m2_5points)
-  equivalence(tempy1,C2_m1_m2_5points)
-  equivalence(tempz1,C3_m1_m2_5points)
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
-    C1_mxm_m2_m1_5points,C2_mxm_m2_m1_5points,C3_mxm_m2_m1_5points
-  equivalence(tempx3,C1_mxm_m2_m1_5points)
-  equivalence(tempy3,C2_mxm_m2_m1_5points)
-  equivalence(tempz3,C3_mxm_m2_m1_5points)
-  real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: &
-    A1_mxm_m2_m1_5points,A2_mxm_m2_m1_5points,A3_mxm_m2_m1_5points
-  equivalence(dummyx_loc,A1_mxm_m2_m1_5points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_5points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_5points)
 
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duydyl,duzdzl,duxdyl,duydxl,duzdxl,duxdzl,duzdyl,duydzl,&
-                         duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,&
-                         duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl
+  real(kind=CUSTOM_REAL) :: duxdxl,duydyl,duzdzl,duxdyl,duydxl,duzdxl,duxdzl,duzdyl,duydzl, &
+                            duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl, &
+                            duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
 
 #ifdef FORCE_VECTORIZATION
   integer :: ijk
+#else
+  integer :: i,j,k
 #endif
 
   DO_LOOP_IJK
@@ -460,75 +504,13 @@
   ENDDO_LOOP_IJK
 
   ! deville optimizations
-  do j=1,m2
-    do i=1,m1
-      C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
-                              hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
-                              hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
-                              hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
-                              hprime_xx(i,5)*B1_m1_m2_5points(5,j)
+  ! computes 1. matrix multiplication for tempx1,..
+  call mxm5_3comp_singleA(hprime_xx,m1,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1,m2)
+  ! computes 2. matrix multiplication for tempx2,..
+  call mxm5_3comp_3dmat_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m1,hprime_xxT,m1,tempx2,tempy2,tempz2,NGLLX)
+  ! computes 3. matrix multiplication for tempx1,..
+  call mxm5_3comp_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m2,hprime_xxT,tempx3,tempy3,tempz3,m1)
 
-      C2_m1_m2_5points(i,j) = hprime_xx(i,1)*B2_m1_m2_5points(1,j) + &
-                              hprime_xx(i,2)*B2_m1_m2_5points(2,j) + &
-                              hprime_xx(i,3)*B2_m1_m2_5points(3,j) + &
-                              hprime_xx(i,4)*B2_m1_m2_5points(4,j) + &
-                              hprime_xx(i,5)*B2_m1_m2_5points(5,j)
-
-      C3_m1_m2_5points(i,j) = hprime_xx(i,1)*B3_m1_m2_5points(1,j) + &
-                              hprime_xx(i,2)*B3_m1_m2_5points(2,j) + &
-                              hprime_xx(i,3)*B3_m1_m2_5points(3,j) + &
-                              hprime_xx(i,4)*B3_m1_m2_5points(4,j) + &
-                              hprime_xx(i,5)*B3_m1_m2_5points(5,j)
-    enddo
-  enddo
-
-  do j=1,m1
-    do i=1,m1
-      ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-      do k = 1,NGLLX
-        tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                        dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                        dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                        dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                        dummyx_loc(i,5,k)*hprime_xxT(5,j)
-
-        tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                        dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                        dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                        dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                        dummyy_loc(i,5,k)*hprime_xxT(5,j)
-
-        tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                        dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                        dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                        dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                        dummyz_loc(i,5,k)*hprime_xxT(5,j)
-      enddo
-    enddo
-  enddo
-
-  do j=1,m1
-    do i=1,m2
-      C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                  A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                  A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                  A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                  A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-
-      C2_mxm_m2_m1_5points(i,j) = A2_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                  A2_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                  A2_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                  A2_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                  A2_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-
-      C3_mxm_m2_m1_5points(i,j) = A3_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                  A3_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                  A3_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                  A3_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                  A3_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-    enddo
-  enddo
-
   DO_LOOP_IJK
 
     ! get derivatives of ux, uy and uz with respect to x, y and z
@@ -543,9 +525,9 @@
     gammazl = gammaz(INDEX_IJK,ispec)
 
     ! compute the jacobian
-    jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                  - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                  + xizl*(etaxl*gammayl-etayl*gammaxl))
+    !jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
+    !              - xiyl*(etaxl*gammazl-etazl*gammaxl) &
+    !              + xizl*(etaxl*gammayl-etayl*gammaxl))
 
     duxdxl = xixl*tempx1(INDEX_IJK) + etaxl*tempx2(INDEX_IJK) + gammaxl*tempx3(INDEX_IJK)
     duxdyl = xiyl*tempx1(INDEX_IJK) + etayl*tempx2(INDEX_IJK) + gammayl*tempx3(INDEX_IJK)
@@ -583,14 +565,163 @@
 
   ENDDO_LOOP_IJK
 
- end subroutine compute_element_strain_att_Dev
+  contains
 
+!--------------------------------------------------------------------------------------------
+!
+! matrix-matrix multiplications
+!
+! subroutines adapted from Deville, Fischer and Mund, High-order methods
+! for incompressible fluid flow, Cambridge University Press (2002),
+! pages 386 and 389 and Figure 8.3.1
+!
+!--------------------------------------------------------------------------------------------
+!
+! note: the matrix-matrix multiplications are used for very small matrices ( 5 x 5 x 5 elements);
+!       thus, calling external optimized libraries for these multiplications are in general slower
+!
+! please leave the routines here to help compilers inlining the code
 
+  subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5) :: A
+  real(kind=CUSTOM_REAL),dimension(5,n3) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A(i,1) * B1(1,j) &
+               + A(i,2) * B1(2,j) &
+               + A(i,3) * B1(3,j) &
+               + A(i,4) * B1(4,j) &
+               + A(i,5) * B1(5,j)
+
+      C2(i,j) =  A(i,1) * B2(1,j) &
+               + A(i,2) * B2(2,j) &
+               + A(i,3) * B2(3,j) &
+               + A(i,4) * B2(4,j) &
+               + A(i,5) * B2(5,j)
+
+      C3(i,j) =  A(i,1) * B3(1,j) &
+               + A(i,2) * B3(2,j) &
+               + A(i,3) * B3(3,j) &
+               + A(i,4) * B3(4,j) &
+               + A(i,5) * B3(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_singleA
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(5,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A1(i,1) * B(1,j) &
+               + A1(i,2) * B(2,j) &
+               + A1(i,3) * B(3,j) &
+               + A1(i,4) * B(4,j) &
+               + A1(i,5) * B(5,j)
+
+      C2(i,j) =  A2(i,1) * B(1,j) &
+               + A2(i,2) * B(2,j) &
+               + A2(i,3) * B(3,j) &
+               + A2(i,4) * B(4,j) &
+               + A2(i,5) * B(5,j)
+
+      C3(i,j) =  A3(i,1) * B(1,j) &
+               + A3(i,2) * B(2,j) &
+               + A3(i,3) * B(3,j) &
+               + A3(i,4) * B(4,j) &
+               + A3(i,5) * B(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_singleB
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 3-dimensional arrays (5,5,5), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n2,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5,n3) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(5,n2) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n2,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j,k
+
+  ! matrix-matrix multiplication
+  do j=1,n2
+    do i=1,n1
+      ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+      do k=1,n3
+        C1(i,j,k) =  A1(i,1,k) * B(1,j) &
+                   + A1(i,2,k) * B(2,j) &
+                   + A1(i,3,k) * B(3,j) &
+                   + A1(i,4,k) * B(4,j) &
+                   + A1(i,5,k) * B(5,j)
+
+        C2(i,j,k) =  A2(i,1,k) * B(1,j) &
+                   + A2(i,2,k) * B(2,j) &
+                   + A2(i,3,k) * B(3,j) &
+                   + A2(i,4,k) * B(4,j) &
+                   + A2(i,5,k) * B(5,j)
+
+        C3(i,j,k) =  A3(i,1,k) * B(1,j) &
+                   + A3(i,2,k) * B(2,j) &
+                   + A3(i,3,k) * B(3,j) &
+                   + A3(i,4,k) * B(4,j) &
+                   + A3(i,5,k) * B(5,j)
+      enddo
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_3dmat_singleB
+
+  end subroutine compute_element_strain_att_Dev
+
+
 !
 !--------------------------------------------------------------------------------------------
 !
 
- subroutine compute_element_strain_att_noDev(ispec,nglob,nspec, &
+  subroutine compute_element_strain_att_noDev(ispec,nglob,nspec, &
                                              displ,veloc,deltat, &
                                              ibool, &
                                              hprime_xx,hprime_yy,hprime_zz, &
@@ -621,7 +752,6 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: &
         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
 
-  !real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc_nplus1
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: epsilondev_xx_loc_nplus1
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: epsilondev_yy_loc_nplus1
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: epsilondev_xy_loc_nplus1
@@ -642,7 +772,7 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
 
   real(kind=CUSTOM_REAL) hp1,hp2,hp3
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl
   real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
   real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
   real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
@@ -706,11 +836,6 @@
         gammayl = gammay(i,j,k,ispec)
         gammazl = gammaz(i,j,k,ispec)
 
-        ! compute the jacobian
-        jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                      - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                      + xizl*(etaxl*gammayl-etayl*gammaxl))
-
         duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
         duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
         duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
@@ -748,5 +873,5 @@
     enddo
   enddo
 
- end subroutine compute_element_strain_att_noDev
+  end subroutine compute_element_strain_att_noDev
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -89,7 +89,6 @@
 
   ! displacement, velocity and acceleration
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: displ_crust_mantle
-! real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: veloc_crust_mantle
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: accel_crust_mantle
 
   ! variable sized array variables
@@ -98,8 +97,8 @@
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATT) :: R_xx,R_yy,R_xy,R_xz,R_yz
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATT) :: &
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_ATT) :: R_xx,R_yy,R_xy,R_xz,R_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_ATT) :: &
     R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: &
@@ -107,7 +106,7 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: epsilon_trace_over_3
 
   ! [alpha,beta,gamma]val reduced to N_SLS and factor_common to N_SLS*NUM_NODES
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,N_SLS,vnspec) :: factor_common
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
 
   ! inner/outer element run flag
@@ -119,54 +118,28 @@
   ! manually inline the calls to the Deville et al. (2002) routines
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1_m1_m2_5points,B2_m1_m2_5points,B3_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points,C2_m1_m2_5points,C3_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: E1_m1_m2_5points,E2_m1_m2_5points,E3_m1_m2_5points
 
-  equivalence(dummyx_loc,B1_m1_m2_5points)
-  equivalence(dummyy_loc,B2_m1_m2_5points)
-  equivalence(dummyz_loc,B3_m1_m2_5points)
-  equivalence(tempx1,C1_m1_m2_5points)
-  equivalence(tempy1,C2_m1_m2_5points)
-  equivalence(tempz1,C3_m1_m2_5points)
-  equivalence(newtempx1,E1_m1_m2_5points)
-  equivalence(newtempy1,E2_m1_m2_5points)
-  equivalence(newtempz1,E3_m1_m2_5points)
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
 
-  real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: &
-    A1_mxm_m2_m1_5points,A2_mxm_m2_m1_5points,A3_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
-    C1_mxm_m2_m1_5points,C2_mxm_m2_m1_5points,C3_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
-    E1_mxm_m2_m1_5points,E2_mxm_m2_m1_5points,E3_mxm_m2_m1_5points
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: sum_terms
 
-  equivalence(dummyx_loc,A1_mxm_m2_m1_5points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_5points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_5points)
-  equivalence(tempx3,C1_mxm_m2_m1_5points)
-  equivalence(tempy3,C2_mxm_m2_m1_5points)
-  equivalence(tempz3,C3_mxm_m2_m1_5points)
-  equivalence(newtempx3,E1_mxm_m2_m1_5points)
-  equivalence(newtempy3,E2_mxm_m2_m1_5points)
-  equivalence(newtempz3,E3_mxm_m2_m1_5points)
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sum_terms
-
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc
   real(kind=CUSTOM_REAL) fac1,fac2,fac3
 
   ! for gravity
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H
 
   integer :: ispec,iglob
-  integer :: i,j,k
   integer :: num_elements,ispec_p
   integer :: iphase
 
 #ifdef FORCE_VECTORIZATION
   integer :: ijk
+#else
+  integer :: i,j,k
 #endif
 
 ! ****************************************************
@@ -200,75 +173,14 @@
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
-    do j=1,m2
-       do i=1,m1
-          C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_5points(5,j)
 
-          C2_m1_m2_5points(i,j) = hprime_xx(i,1)*B2_m1_m2_5points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_5points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_5points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_5points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_5points(5,j)
+    ! computes 1. matrix multiplication for tempx1,..
+    call mxm5_3comp_singleA(hprime_xx,m1,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1,m2)
+    ! computes 2. matrix multiplication for tempx2,..
+    call mxm5_3comp_3dmat_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m1,hprime_xxT,m1,tempx2,tempy2,tempz2,NGLLX)
+    ! computes 3. matrix multiplication for tempx1,..
+    call mxm5_3comp_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m2,hprime_xxT,tempx3,tempy3,tempz3,m1)
 
-          C3_m1_m2_5points(i,j) = hprime_xx(i,1)*B3_m1_m2_5points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_5points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_5points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_5points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_5points(5,j)
-       enddo
-    enddo
-
-    do j=1,m1
-       do i=1,m1
-          ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-          do k = 1,NGLLX
-             tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                             dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                             dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                             dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                             dummyx_loc(i,5,k)*hprime_xxT(5,j)
-
-             tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                             dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                             dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                             dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                             dummyy_loc(i,5,k)*hprime_xxT(5,j)
-
-             tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                             dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                             dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                             dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                             dummyz_loc(i,5,k)*hprime_xxT(5,j)
-          enddo
-       enddo
-    enddo
-
-    do j=1,m1
-       do i=1,m2
-          C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-
-          C2_mxm_m2_m1_5points(i,j) = A2_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-
-          C3_mxm_m2_m1_5points(i,j) = A3_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-       enddo
-    enddo
-
     !
     ! compute either isotropic, transverse isotropic or anisotropic elements
     !
@@ -326,85 +238,24 @@
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
-    do j=1,m2
-      do i=1,m1
-        E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C1_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C1_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C1_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C1_m1_m2_5points(5,j)
 
-        E2_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C2_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C2_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C2_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C2_m1_m2_5points(5,j)
+    ! computes 1. matrix multiplication for newtempx1,..
+    call mxm5_3comp_singleA(hprimewgll_xxT,m1,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1,m2)
+    ! computes 2. matrix multiplication for tempx2,..
+    call mxm5_3comp_3dmat_singleB(tempx2,tempy2,tempz2,m1,hprimewgll_xx,m1,newtempx2,newtempy2,newtempz2,NGLLX)
+    ! computes 3. matrix multiplication for newtempx3,..
+    call mxm5_3comp_singleB(tempx3,tempy3,tempz3,m2,hprimewgll_xx,newtempx3,newtempy3,newtempz3,m1)
 
-        E3_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C3_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C3_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C3_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C3_m1_m2_5points(5,j)
-      enddo
-    enddo
+    ! sums contributions
 
-    do i=1,m1
-      do j=1,m1
-        ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-        do k = 1,NGLLX
-          newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempx2(i,5,k)*hprimewgll_xx(5,j)
-
-          newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempy2(i,5,k)*hprimewgll_xx(5,j)
-
-          newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempz2(i,5,k)*hprimewgll_xx(5,j)
-        enddo
-      enddo
-    enddo
-
-    do j=1,m1
-      do i=1,m2
-        E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-
-        E2_mxm_m2_m1_5points(i,j) = C2_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C2_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C2_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C2_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C2_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-
-        E3_mxm_m2_m1_5points(i,j) = C3_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C3_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C3_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C3_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C3_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-      enddo
-    enddo
-
-    ! sum contributions
-
     DO_LOOP_IJK
 
       fac1 = wgllwgll_yz_3D(INDEX_IJK)
       fac2 = wgllwgll_xz_3D(INDEX_IJK)
       fac3 = wgllwgll_xy_3D(INDEX_IJK)
-      sum_terms(1,INDEX_IJK) = - (fac1*newtempx1(INDEX_IJK) + fac2*newtempx2(INDEX_IJK) + fac3*newtempx3(INDEX_IJK))
-      sum_terms(2,INDEX_IJK) = - (fac1*newtempy1(INDEX_IJK) + fac2*newtempy2(INDEX_IJK) + fac3*newtempy3(INDEX_IJK))
-      sum_terms(3,INDEX_IJK) = - (fac1*newtempz1(INDEX_IJK) + fac2*newtempz2(INDEX_IJK) + fac3*newtempz3(INDEX_IJK))
+      sum_terms(INDEX_IJK,1) = - (fac1*newtempx1(INDEX_IJK) + fac2*newtempx2(INDEX_IJK) + fac3*newtempx3(INDEX_IJK))
+      sum_terms(INDEX_IJK,2) = - (fac1*newtempy1(INDEX_IJK) + fac2*newtempy2(INDEX_IJK) + fac3*newtempy3(INDEX_IJK))
+      sum_terms(INDEX_IJK,3) = - (fac1*newtempz1(INDEX_IJK) + fac2*newtempz2(INDEX_IJK) + fac3*newtempz3(INDEX_IJK))
 
     ENDDO_LOOP_IJK
 
@@ -419,9 +270,9 @@
       do k=1,NGLLZ
         do j=1,NGLLY
           do i=1,NGLLX
-            sum_terms(1,INDEX_IJK) = sum_terms(1,INDEX_IJK) + rho_s_H(1,INDEX_IJK)
-            sum_terms(2,INDEX_IJK) = sum_terms(2,INDEX_IJK) + rho_s_H(2,INDEX_IJK)
-            sum_terms(3,INDEX_IJK) = sum_terms(3,INDEX_IJK) + rho_s_H(3,INDEX_IJK)
+            sum_terms(INDEX_IJK,1) = sum_terms(INDEX_IJK,1) + rho_s_H(INDEX_IJK,1)
+            sum_terms(INDEX_IJK,2) = sum_terms(INDEX_IJK,2) + rho_s_H(INDEX_IJK,2)
+            sum_terms(INDEX_IJK,3) = sum_terms(INDEX_IJK,3) + rho_s_H(INDEX_IJK,3)
           enddo
         enddo
       enddo
@@ -451,9 +302,9 @@
 
           ! do NOT use array syntax ":" for the three statements below otherwise most compilers
           ! will not be able to vectorize the outer loop
-          accel_crust_mantle(1,iglob) = accel_crust_mantle(1,iglob) + sum_terms(1,INDEX_IJK)
-          accel_crust_mantle(2,iglob) = accel_crust_mantle(2,iglob) + sum_terms(2,INDEX_IJK)
-          accel_crust_mantle(3,iglob) = accel_crust_mantle(3,iglob) + sum_terms(3,INDEX_IJK)
+          accel_crust_mantle(1,iglob) = accel_crust_mantle(1,iglob) + sum_terms(INDEX_IJK,1)
+          accel_crust_mantle(2,iglob) = accel_crust_mantle(2,iglob) + sum_terms(INDEX_IJK,2)
+          accel_crust_mantle(3,iglob) = accel_crust_mantle(3,iglob) + sum_terms(INDEX_IJK,3)
 
 #ifdef FORCE_VECTORIZATION
     enddo
@@ -484,8 +335,6 @@
                                                  R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk, &
                                                  ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec,factor_common, &
                                                  c44store,muvstore, &
-!                                                epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-!                                                epsilondev_xz,epsilondev_yz, &
                                                  epsilondev_loc, &
                                                  deltat)
       else
@@ -501,14 +350,165 @@
 
     ! save deviatoric strain for Runge-Kutta scheme
     if(COMPUTE_AND_STORE_STRAIN) then
-      epsilondev_xx(:,:,:,ispec) = epsilondev_loc(1,:,:,:)
-      epsilondev_yy(:,:,:,ispec) = epsilondev_loc(2,:,:,:)
-      epsilondev_xy(:,:,:,ispec) = epsilondev_loc(3,:,:,:)
-      epsilondev_xz(:,:,:,ispec) = epsilondev_loc(4,:,:,:)
-      epsilondev_yz(:,:,:,ispec) = epsilondev_loc(5,:,:,:)
+      epsilondev_xx(:,:,:,ispec) = epsilondev_loc(:,:,:,1)
+      epsilondev_yy(:,:,:,ispec) = epsilondev_loc(:,:,:,2)
+      epsilondev_xy(:,:,:,ispec) = epsilondev_loc(:,:,:,3)
+      epsilondev_xz(:,:,:,ispec) = epsilondev_loc(:,:,:,4)
+      epsilondev_yz(:,:,:,ispec) = epsilondev_loc(:,:,:,5)
     endif
 
   enddo ! of spectral element loop NSPEC_CRUST_MANTLE
 
+  contains
+
+!--------------------------------------------------------------------------------------------
+!
+! matrix-matrix multiplications
+!
+! subroutines adapted from Deville, Fischer and Mund, High-order methods
+! for incompressible fluid flow, Cambridge University Press (2002),
+! pages 386 and 389 and Figure 8.3.1
+!
+!--------------------------------------------------------------------------------------------
+!
+! note: the matrix-matrix multiplications are used for very small matrices ( 5 x 5 x 5 elements);
+!       thus, calling external optimized libraries for these multiplications are in general slower
+!
+! please leave the routines here to help compilers inlining the code
+
+  subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5) :: A
+  real(kind=CUSTOM_REAL),dimension(5,n3) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A(i,1) * B1(1,j) &
+               + A(i,2) * B1(2,j) &
+               + A(i,3) * B1(3,j) &
+               + A(i,4) * B1(4,j) &
+               + A(i,5) * B1(5,j)
+
+      C2(i,j) =  A(i,1) * B2(1,j) &
+               + A(i,2) * B2(2,j) &
+               + A(i,3) * B2(3,j) &
+               + A(i,4) * B2(4,j) &
+               + A(i,5) * B2(5,j)
+
+      C3(i,j) =  A(i,1) * B3(1,j) &
+               + A(i,2) * B3(2,j) &
+               + A(i,3) * B3(3,j) &
+               + A(i,4) * B3(4,j) &
+               + A(i,5) * B3(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_singleA
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(5,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A1(i,1) * B(1,j) &
+               + A1(i,2) * B(2,j) &
+               + A1(i,3) * B(3,j) &
+               + A1(i,4) * B(4,j) &
+               + A1(i,5) * B(5,j)
+
+      C2(i,j) =  A2(i,1) * B(1,j) &
+               + A2(i,2) * B(2,j) &
+               + A2(i,3) * B(3,j) &
+               + A2(i,4) * B(4,j) &
+               + A2(i,5) * B(5,j)
+
+      C3(i,j) =  A3(i,1) * B(1,j) &
+               + A3(i,2) * B(2,j) &
+               + A3(i,3) * B(3,j) &
+               + A3(i,4) * B(4,j) &
+               + A3(i,5) * B(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_singleB
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 3-dimensional arrays (5,5,5), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n2,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5,n3) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(5,n2) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n2,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j,k
+
+  ! matrix-matrix multiplication
+  do j=1,n2
+    do i=1,n1
+      ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+      do k=1,n3
+        C1(i,j,k) =  A1(i,1,k) * B(1,j) &
+                   + A1(i,2,k) * B(2,j) &
+                   + A1(i,3,k) * B(3,j) &
+                   + A1(i,4,k) * B(4,j) &
+                   + A1(i,5,k) * B(5,j)
+
+        C2(i,j,k) =  A2(i,1,k) * B(1,j) &
+                   + A2(i,2,k) * B(2,j) &
+                   + A2(i,3,k) * B(3,j) &
+                   + A2(i,4,k) * B(4,j) &
+                   + A2(i,5,k) * B(5,j)
+
+        C3(i,j,k) =  A3(i,1,k) * B(1,j) &
+                   + A3(i,2,k) * B(2,j) &
+                   + A3(i,3,k) * B(3,j) &
+                   + A3(i,4,k) * B(4,j) &
+                   + A3(i,5,k) * B(5,j)
+      enddo
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_3dmat_singleB
+
   end subroutine compute_forces_crust_mantle_Dev
 
+
+

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_noDev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_noDev.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_noDev.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -85,8 +85,8 @@
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATT) :: R_xx,R_yy,R_xy,R_xz,R_yz
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATT) :: &
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_ATT) :: R_xx,R_yy,R_xy,R_xz,R_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_ATT) :: &
     R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: &
@@ -94,7 +94,7 @@
   real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: epsilon_trace_over_3
 
   ! [alpha,beta,gamma]val reduced to N_SLS and factor_common to N_SLS*NUM_NODES
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,N_SLS,vnspec) :: factor_common
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
 
   ! inner/outer element run flag
@@ -105,8 +105,7 @@
   ! for attenuation
   real(kind=CUSTOM_REAL) one_minus_sum_beta_use,minus_sum_beta
   real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
-!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ)   :: factor_common_c44_muv
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
@@ -141,7 +140,7 @@
   real(kind=CUSTOM_REAL) fac1,fac2,fac3
   real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
   real(kind=CUSTOM_REAL) kappal,kappavl,kappahl,muvl,muhl
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sum_terms
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: sum_terms
 
   real(kind=CUSTOM_REAL) tempx1l,tempx2l,tempx3l
   real(kind=CUSTOM_REAL) tempy1l,tempy2l,tempy3l
@@ -157,7 +156,7 @@
   double precision cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq
   double precision factor,sx_l,sy_l,sz_l,gxl,gyl,gzl
   double precision Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H
 
 !  integer :: computed_elements
   integer :: num_elements,ispec_p
@@ -269,11 +268,11 @@
             else
               epsilon_trace_over_3(i,j,k,ispec) = templ
             endif
-            epsilondev_loc(1,i,j,k) = duxdxl - templ
-            epsilondev_loc(2,i,j,k) = duydyl - templ
-            epsilondev_loc(3,i,j,k) = 0.5 * duxdyl_plus_duydxl
-            epsilondev_loc(4,i,j,k) = 0.5 * duzdxl_plus_duxdzl
-            epsilondev_loc(5,i,j,k) = 0.5 * duzdyl_plus_duydzl
+            epsilondev_loc(i,j,k,1) = duxdxl - templ
+            epsilondev_loc(i,j,k,2) = duydyl - templ
+            epsilondev_loc(i,j,k,3) = 0.5 * duxdyl_plus_duydxl
+            epsilondev_loc(i,j,k,4) = 0.5 * duzdxl_plus_duxdzl
+            epsilondev_loc(i,j,k,5) = 0.5 * duzdyl_plus_duydzl
           endif
 
           ! precompute terms for attenuation if needed
@@ -588,14 +587,14 @@
           ! subtract memory variables if attenuation
           if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL ) then
              do i_SLS = 1,N_SLS
-                R_xx_val = R_xx(i_SLS,i,j,k,ispec)
-                R_yy_val = R_yy(i_SLS,i,j,k,ispec)
+                R_xx_val = R_xx(i,j,k,i_SLS,ispec)
+                R_yy_val = R_yy(i,j,k,i_SLS,ispec)
                 sigma_xx = sigma_xx - R_xx_val
                 sigma_yy = sigma_yy - R_yy_val
                 sigma_zz = sigma_zz + R_xx_val + R_yy_val
-                sigma_xy = sigma_xy - R_xy(i_SLS,i,j,k,ispec)
-                sigma_xz = sigma_xz - R_xz(i_SLS,i,j,k,ispec)
-                sigma_yz = sigma_yz - R_yz(i_SLS,i,j,k,ispec)
+                sigma_xy = sigma_xy - R_xy(i,j,k,i_SLS,ispec)
+                sigma_xz = sigma_xz - R_xz(i,j,k,i_SLS,ispec)
+                sigma_yz = sigma_yz - R_yz(i,j,k,i_SLS,ispec)
              enddo
           endif
 
@@ -677,9 +676,9 @@
 
               ! precompute vector
               factor = dble(jacobianl) * wgll_cube(i,j,k)
-              rho_s_H(1,i,j,k) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
-              rho_s_H(2,i,j,k) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
-              rho_s_H(3,i,j,k) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
+              rho_s_H(i,j,k,1) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
+              rho_s_H(i,j,k,2) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
+              rho_s_H(i,j,k,3) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
 
             else
 
@@ -704,9 +703,9 @@
 
               ! precompute vector
               factor = jacobianl * wgll_cube(i,j,k)
-              rho_s_H(1,i,j,k) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-              rho_s_H(2,i,j,k) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-              rho_s_H(3,i,j,k) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+              rho_s_H(i,j,k,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+              rho_s_H(i,j,k,2) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+              rho_s_H(i,j,k,3) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
 
             endif
 
@@ -770,11 +769,15 @@
           fac2 = wgllwgll_xz(i,k)
           fac3 = wgllwgll_xy(i,j)
 
-          sum_terms(1,i,j,k) = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
-          sum_terms(2,i,j,k) = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
-          sum_terms(3,i,j,k) = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
+          sum_terms(i,j,k,1) = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
+          sum_terms(i,j,k,2) = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
+          sum_terms(i,j,k,3) = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
 
-          if(GRAVITY_VAL) sum_terms(:,i,j,k) = sum_terms(:,i,j,k) + rho_s_H(:,i,j,k)
+          if(GRAVITY_VAL) then
+            sum_terms(i,j,k,1) = sum_terms(i,j,k,1) + rho_s_H(i,j,k,1)
+            sum_terms(i,j,k,2) = sum_terms(i,j,k,2) + rho_s_H(i,j,k,2)
+            sum_terms(i,j,k,3) = sum_terms(i,j,k,3) + rho_s_H(i,j,k,3)
+          endif
 
         enddo ! NGLLX
       enddo ! NGLLY
@@ -785,9 +788,9 @@
       do j=1,NGLLY
         do i=1,NGLLX
           iglob = ibool(i,j,k,ispec)
-          accel_crust_mantle(1,iglob) = accel_crust_mantle(1,iglob) + sum_terms(1,i,j,k)
-          accel_crust_mantle(2,iglob) = accel_crust_mantle(2,iglob) + sum_terms(2,i,j,k)
-          accel_crust_mantle(3,iglob) = accel_crust_mantle(3,iglob) + sum_terms(3,i,j,k)
+          accel_crust_mantle(1,iglob) = accel_crust_mantle(1,iglob) + sum_terms(i,j,k,1)
+          accel_crust_mantle(2,iglob) = accel_crust_mantle(2,iglob) + sum_terms(i,j,k,2)
+          accel_crust_mantle(3,iglob) = accel_crust_mantle(3,iglob) + sum_terms(i,j,k,3)
         enddo
       enddo
     enddo
@@ -814,8 +817,6 @@
                                                  R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk, &
                                                  ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec,factor_common, &
                                                  c44store,muvstore, &
-!                                                epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-!                                                epsilondev_xz,epsilondev_yz, &
                                                  epsilondev_loc, &
                                                  deltat)
       else
@@ -835,11 +836,11 @@
       do k=1,NGLLZ
         do j=1,NGLLY
           do i=1,NGLLX
-            epsilondev_xx(i,j,k,ispec) = epsilondev_loc(1,i,j,k)
-            epsilondev_yy(i,j,k,ispec) = epsilondev_loc(2,i,j,k)
-            epsilondev_xy(i,j,k,ispec) = epsilondev_loc(3,i,j,k)
-            epsilondev_xz(i,j,k,ispec) = epsilondev_loc(4,i,j,k)
-            epsilondev_yz(i,j,k,ispec) = epsilondev_loc(5,i,j,k)
+            epsilondev_xx(i,j,k,ispec) = epsilondev_loc(i,j,k,1)
+            epsilondev_yy(i,j,k,ispec) = epsilondev_loc(i,j,k,2)
+            epsilondev_xy(i,j,k,ispec) = epsilondev_loc(i,j,k,3)
+            epsilondev_xz(i,j,k,ispec) = epsilondev_loc(i,j,k,4)
+            epsilondev_yz(i,j,k,ispec) = epsilondev_loc(i,j,k,5)
           enddo
         enddo
       enddo

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -87,13 +87,12 @@
   ! to allow for optimization of cache access by compiler
   ! variable lengths for factor_common and one_minus_sum_beta
   integer :: vnspec
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,N_SLS,vnspec) :: factor_common
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATT) :: &
-    R_xx,R_yy,R_xy,R_xz,R_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_ATT) :: R_xx,R_yy,R_xy,R_xz,R_yz
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATT) :: &
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_ATT) :: &
     R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: &
@@ -104,49 +103,16 @@
   logical :: phase_is_inner
 
   ! local parameters
-
   ! Deville
-  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
 
-  real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1_m1_m2_5points,B2_m1_m2_5points,B3_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points,C2_m1_m2_5points,C3_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: E1_m1_m2_5points,E2_m1_m2_5points,E3_m1_m2_5points
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: sum_terms
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc
 
-  equivalence(dummyx_loc,B1_m1_m2_5points)
-  equivalence(dummyy_loc,B2_m1_m2_5points)
-  equivalence(dummyz_loc,B3_m1_m2_5points)
-  equivalence(tempx1,C1_m1_m2_5points)
-  equivalence(tempy1,C2_m1_m2_5points)
-  equivalence(tempz1,C3_m1_m2_5points)
-  equivalence(newtempx1,E1_m1_m2_5points)
-  equivalence(newtempy1,E2_m1_m2_5points)
-  equivalence(newtempz1,E3_m1_m2_5points)
-
-  real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: &
-    A1_mxm_m2_m1_5points,A2_mxm_m2_m1_5points,A3_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
-    C1_mxm_m2_m1_5points,C2_mxm_m2_m1_5points,C3_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
-    E1_mxm_m2_m1_5points,E2_mxm_m2_m1_5points,E3_mxm_m2_m1_5points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_5points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_5points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_5points)
-  equivalence(tempx3,C1_mxm_m2_m1_5points)
-  equivalence(tempy3,C2_mxm_m2_m1_5points)
-  equivalence(tempz3,C3_mxm_m2_m1_5points)
-  equivalence(newtempx3,E1_mxm_m2_m1_5points)
-  equivalence(newtempy3,E2_mxm_m2_m1_5points)
-  equivalence(newtempz3,E3_mxm_m2_m1_5points)
-
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sum_terms
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
-
   real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
   real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
 
@@ -170,12 +136,11 @@
   double precision cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq
   double precision theta,phi,factor,gxl,gyl,gzl,sx_l,sy_l,sz_l
   double precision Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H
   real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
 
   integer :: int_radius
   integer :: ispec,iglob
-  integer :: i,j,k
   integer :: num_elements,ispec_p
   integer :: iphase
 
@@ -185,6 +150,8 @@
 ! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop
   integer :: ijk
   real(kind=CUSTOM_REAL) :: R_xx_val,R_yy_val
+#else
+  integer :: i,j,k
 #endif
 
 ! ****************************************************
@@ -222,75 +189,83 @@
       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
       ! for incompressible fluid flow, Cambridge University Press (2002),
       ! pages 386 and 389 and Figure 8.3.1
-      do j=1,m2
-         do i=1,m1
-            C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
-                 hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
-                 hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
-                 hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
-                 hprime_xx(i,5)*B1_m1_m2_5points(5,j)
 
-            C2_m1_m2_5points(i,j) = hprime_xx(i,1)*B2_m1_m2_5points(1,j) + &
-                 hprime_xx(i,2)*B2_m1_m2_5points(2,j) + &
-                 hprime_xx(i,3)*B2_m1_m2_5points(3,j) + &
-                 hprime_xx(i,4)*B2_m1_m2_5points(4,j) + &
-                 hprime_xx(i,5)*B2_m1_m2_5points(5,j)
+      ! computes 1. matrix multiplication for tempx1,..
+      call mxm5_3comp_singleA(hprime_xx,m1,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1,m2)
+      ! computes 2. matrix multiplication for tempx2,..
+      call mxm5_3comp_3dmat_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m1,hprime_xxT,m1,tempx2,tempy2,tempz2,NGLLX)
+      ! computes 3. matrix multiplication for tempx1,..
+      call mxm5_3comp_singleB(dummyx_loc,dummyy_loc,dummyz_loc,m2,hprime_xxT,tempx3,tempy3,tempz3,m1)
 
-            C3_m1_m2_5points(i,j) = hprime_xx(i,1)*B3_m1_m2_5points(1,j) + &
-                 hprime_xx(i,2)*B3_m1_m2_5points(2,j) + &
-                 hprime_xx(i,3)*B3_m1_m2_5points(3,j) + &
-                 hprime_xx(i,4)*B3_m1_m2_5points(4,j) + &
-                 hprime_xx(i,5)*B3_m1_m2_5points(5,j)
-         enddo
-      enddo
-
-      do j=1,m1
-         do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-               tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                    dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                    dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                    dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                    dummyx_loc(i,5,k)*hprime_xxT(5,j)
-
-               tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                    dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                    dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                    dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                    dummyy_loc(i,5,k)*hprime_xxT(5,j)
-
-               tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                    dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                    dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                    dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                    dummyz_loc(i,5,k)*hprime_xxT(5,j)
-            enddo
-         enddo
-      enddo
-
-      do j=1,m1
-         do i=1,m2
-            C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                 A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                 A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                 A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                 A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-
-            C2_mxm_m2_m1_5points(i,j) = A2_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                 A2_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                 A2_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                 A2_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                 A2_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-
-            C3_mxm_m2_m1_5points(i,j) = A3_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                 A3_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                 A3_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                 A3_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                 A3_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-         enddo
-      enddo
-
+!      do j=1,m2
+!         do i=1,m1
+!            C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
+!                 hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
+!                 hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
+!                 hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
+!                 hprime_xx(i,5)*B1_m1_m2_5points(5,j)
+!
+!            C2_m1_m2_5points(i,j) = hprime_xx(i,1)*B2_m1_m2_5points(1,j) + &
+!                 hprime_xx(i,2)*B2_m1_m2_5points(2,j) + &
+!                 hprime_xx(i,3)*B2_m1_m2_5points(3,j) + &
+!                 hprime_xx(i,4)*B2_m1_m2_5points(4,j) + &
+!                 hprime_xx(i,5)*B2_m1_m2_5points(5,j)
+!
+!            C3_m1_m2_5points(i,j) = hprime_xx(i,1)*B3_m1_m2_5points(1,j) + &
+!                 hprime_xx(i,2)*B3_m1_m2_5points(2,j) + &
+!                 hprime_xx(i,3)*B3_m1_m2_5points(3,j) + &
+!                 hprime_xx(i,4)*B3_m1_m2_5points(4,j) + &
+!                 hprime_xx(i,5)*B3_m1_m2_5points(5,j)
+!         enddo
+!      enddo
+!
+!      do j=1,m1
+!         do i=1,m1
+!            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+!            do k = 1,NGLLX
+!               tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+!                    dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+!                    dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+!                    dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+!                    dummyx_loc(i,5,k)*hprime_xxT(5,j)
+!
+!               tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+!                    dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+!                    dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+!                    dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+!                    dummyy_loc(i,5,k)*hprime_xxT(5,j)
+!
+!               tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+!                    dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+!                    dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+!                    dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+!                    dummyz_loc(i,5,k)*hprime_xxT(5,j)
+!            enddo
+!         enddo
+!      enddo
+!
+!      do j=1,m1
+!         do i=1,m2
+!            C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+!                 A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+!                 A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+!                 A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+!                 A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
+!
+!            C2_mxm_m2_m1_5points(i,j) = A2_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+!                 A2_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+!                 A2_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+!                 A2_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+!                 A2_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
+!
+!            C3_mxm_m2_m1_5points(i,j) = A3_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+!                 A3_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+!                 A3_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+!                 A3_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+!                 A3_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
+!         enddo
+!      enddo
+!
       DO_LOOP_IJK
 
         ! get derivatives of ux, uy and uz with respect to x, y and z
@@ -339,11 +314,11 @@
           else
             epsilon_trace_over_3(INDEX_IJK,ispec) = templ
           endif
-          epsilondev_loc(1,INDEX_IJK) = duxdxl - templ
-          epsilondev_loc(2,INDEX_IJK) = duydyl - templ
-          epsilondev_loc(3,INDEX_IJK) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-          epsilondev_loc(4,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-          epsilondev_loc(5,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+          epsilondev_loc(INDEX_IJK,1) = duxdxl - templ
+          epsilondev_loc(INDEX_IJK,2) = duydyl - templ
+          epsilondev_loc(INDEX_IJK,3) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+          epsilondev_loc(INDEX_IJK,4) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+          epsilondev_loc(INDEX_IJK,5) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
         endif
 
         if(ANISOTROPIC_INNER_CORE_VAL) then
@@ -428,39 +403,36 @@
 
           ! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop
           ! in order to vectorize the outer loop
-          R_xx_val = R_xx(1,INDEX_IJK,ispec)
-          R_yy_val = R_yy(1,INDEX_IJK,ispec)
+          R_xx_val = R_xx(INDEX_IJK,1,ispec)
+          R_yy_val = R_yy(INDEX_IJK,1,ispec)
           sigma_xx = sigma_xx - R_xx_val
           sigma_yy = sigma_yy - R_yy_val
           sigma_zz = sigma_zz + R_xx_val + R_yy_val
-          sigma_xy = sigma_xy - R_xy(1,INDEX_IJK,ispec)
-          sigma_xz = sigma_xz - R_xz(1,INDEX_IJK,ispec)
-          sigma_yz = sigma_yz - R_yz(1,INDEX_IJK,ispec)
+          sigma_xy = sigma_xy - R_xy(INDEX_IJK,1,ispec)
+          sigma_xz = sigma_xz - R_xz(INDEX_IJK,1,ispec)
+          sigma_yz = sigma_yz - R_yz(INDEX_IJK,1,ispec)
 
-          R_xx_val = R_xx(2,INDEX_IJK,ispec)
-          R_yy_val = R_yy(2,INDEX_IJK,ispec)
+          R_xx_val = R_xx(INDEX_IJK,2,ispec)
+          R_yy_val = R_yy(INDEX_IJK,2,ispec)
           sigma_xx = sigma_xx - R_xx_val
           sigma_yy = sigma_yy - R_yy_val
           sigma_zz = sigma_zz + R_xx_val + R_yy_val
-          sigma_xy = sigma_xy - R_xy(2,INDEX_IJK,ispec)
-          sigma_xz = sigma_xz - R_xz(2,INDEX_IJK,ispec)
-          sigma_yz = sigma_yz - R_yz(2,INDEX_IJK,ispec)
+          sigma_xy = sigma_xy - R_xy(INDEX_IJK,2,ispec)
+          sigma_xz = sigma_xz - R_xz(INDEX_IJK,2,ispec)
+          sigma_yz = sigma_yz - R_yz(INDEX_IJK,2,ispec)
 
-          R_xx_val = R_xx(3,INDEX_IJK,ispec)
-          R_yy_val = R_yy(3,INDEX_IJK,ispec)
+          R_xx_val = R_xx(INDEX_IJK,3,ispec)
+          R_yy_val = R_yy(INDEX_IJK,3,ispec)
           sigma_xx = sigma_xx - R_xx_val
           sigma_yy = sigma_yy - R_yy_val
           sigma_zz = sigma_zz + R_xx_val + R_yy_val
-          sigma_xy = sigma_xy - R_xy(3,INDEX_IJK,ispec)
-          sigma_xz = sigma_xz - R_xz(3,INDEX_IJK,ispec)
-          sigma_yz = sigma_yz - R_yz(3,INDEX_IJK,ispec)
+          sigma_xy = sigma_xy - R_xy(INDEX_IJK,3,ispec)
+          sigma_xz = sigma_xz - R_xz(INDEX_IJK,3,ispec)
+          sigma_yz = sigma_yz - R_yz(INDEX_IJK,3,ispec)
 #else
           ! note: Fortran passes pointers to array location, thus R_memory(1,1,...) is fine
-          call compute_element_att_stress(R_xx(1,INDEX_IJK,ispec), &
-                                          R_yy(1,INDEX_IJK,ispec), &
-                                          R_xy(1,INDEX_IJK,ispec), &
-                                          R_xz(1,INDEX_IJK,ispec), &
-                                          R_yz(1,INDEX_IJK,ispec), &
+          call compute_element_att_stress(i,j,k,R_xx(1,1,1,1,ispec),R_yy(1,1,1,1,ispec),R_xy(1,1,1,1,ispec), &
+                                          R_xz(1,1,1,1,ispec),R_yz(1,1,1,1,ispec), &
                                           sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
 #endif
         endif
@@ -546,9 +518,9 @@
 
             ! precompute vector
             factor = dble(jacobianl) * wgll_cube(INDEX_IJK)
-            rho_s_H(1,INDEX_IJK) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
-            rho_s_H(2,INDEX_IJK) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
-            rho_s_H(3,INDEX_IJK) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
+            rho_s_H(INDEX_IJK,1) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
+            rho_s_H(INDEX_IJK,2) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
+            rho_s_H(INDEX_IJK,3) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
 
           else
 
@@ -573,9 +545,9 @@
 
             ! precompute vector
             factor = jacobianl * wgll_cube(INDEX_IJK)
-            rho_s_H(1,INDEX_IJK) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-            rho_s_H(2,INDEX_IJK) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-            rho_s_H(3,INDEX_IJK) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+            rho_s_H(INDEX_IJK,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+            rho_s_H(INDEX_IJK,2) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+            rho_s_H(INDEX_IJK,3) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
 
           endif
 
@@ -599,75 +571,83 @@
       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
       ! for incompressible fluid flow, Cambridge University Press (2002),
       ! pages 386 and 389 and Figure 8.3.1
-      do j=1,m2
-        do i=1,m1
-          E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C1_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C1_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C1_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C1_m1_m2_5points(5,j)
 
-          E2_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C2_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C2_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C2_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C2_m1_m2_5points(5,j)
+      ! computes 1. matrix multiplication for newtempx1,..
+      call mxm5_3comp_singleA(hprimewgll_xxT,m1,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1,m2)
+      ! computes 2. matrix multiplication for tempx2,..
+      call mxm5_3comp_3dmat_singleB(tempx2,tempy2,tempz2,m1,hprimewgll_xx,m1,newtempx2,newtempy2,newtempz2,NGLLX)
+      ! computes 3. matrix multiplication for newtempx3,..
+      call mxm5_3comp_singleB(tempx3,tempy3,tempz3,m2,hprimewgll_xx,newtempx3,newtempy3,newtempz3,m1)
 
-          E3_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C3_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C3_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C3_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C3_m1_m2_5points(5,j)
-        enddo
-      enddo
-
-      do i=1,m1
-        do j=1,m1
-          ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-          do k = 1,NGLLX
-            newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempx2(i,5,k)*hprimewgll_xx(5,j)
-
-            newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempy2(i,5,k)*hprimewgll_xx(5,j)
-
-            newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempz2(i,5,k)*hprimewgll_xx(5,j)
-          enddo
-        enddo
-      enddo
-
-      do j=1,m1
-        do i=1,m2
-          E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-
-          E2_mxm_m2_m1_5points(i,j) = C2_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C2_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C2_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C2_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C2_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-
-          E3_mxm_m2_m1_5points(i,j) = C3_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C3_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C3_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C3_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C3_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-        enddo
-      enddo
-
+!      do j=1,m2
+!        do i=1,m1
+!          E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_5points(1,j) + &
+!                                hprimewgll_xxT(i,2)*C1_m1_m2_5points(2,j) + &
+!                                hprimewgll_xxT(i,3)*C1_m1_m2_5points(3,j) + &
+!                                hprimewgll_xxT(i,4)*C1_m1_m2_5points(4,j) + &
+!                                hprimewgll_xxT(i,5)*C1_m1_m2_5points(5,j)
+!
+!          E2_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_5points(1,j) + &
+!                                hprimewgll_xxT(i,2)*C2_m1_m2_5points(2,j) + &
+!                                hprimewgll_xxT(i,3)*C2_m1_m2_5points(3,j) + &
+!                                hprimewgll_xxT(i,4)*C2_m1_m2_5points(4,j) + &
+!                                hprimewgll_xxT(i,5)*C2_m1_m2_5points(5,j)
+!
+!          E3_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_5points(1,j) + &
+!                                hprimewgll_xxT(i,2)*C3_m1_m2_5points(2,j) + &
+!                                hprimewgll_xxT(i,3)*C3_m1_m2_5points(3,j) + &
+!                                hprimewgll_xxT(i,4)*C3_m1_m2_5points(4,j) + &
+!                                hprimewgll_xxT(i,5)*C3_m1_m2_5points(5,j)
+!        enddo
+!      enddo
+!
+!      do i=1,m1
+!        do j=1,m1
+!          ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+!          do k = 1,NGLLX
+!            newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+!                             tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+!                             tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+!                             tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+!                             tempx2(i,5,k)*hprimewgll_xx(5,j)
+!
+!            newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+!                             tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+!                             tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+!                             tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+!                             tempy2(i,5,k)*hprimewgll_xx(5,j)
+!
+!            newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+!                             tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+!                             tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+!                             tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+!                             tempz2(i,5,k)*hprimewgll_xx(5,j)
+!          enddo
+!        enddo
+!      enddo
+!
+!      do j=1,m1
+!        do i=1,m2
+!          E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+!                                    C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+!                                    C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+!                                    C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+!                                    C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
+!
+!          E2_mxm_m2_m1_5points(i,j) = C2_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+!                                    C2_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+!                                    C2_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+!                                    C2_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+!                                    C2_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
+!
+!          E3_mxm_m2_m1_5points(i,j) = C3_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+!                                    C3_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+!                                    C3_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+!                                    C3_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+!                                    C3_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
+!        enddo
+!      enddo
+!
       ! sums contributions
 
       DO_LOOP_IJK
@@ -675,9 +655,9 @@
         fac1 = wgllwgll_yz_3D(INDEX_IJK)
         fac2 = wgllwgll_xz_3D(INDEX_IJK)
         fac3 = wgllwgll_xy_3D(INDEX_IJK)
-        sum_terms(1,INDEX_IJK) = - (fac1*newtempx1(INDEX_IJK) + fac2*newtempx2(INDEX_IJK) + fac3*newtempx3(INDEX_IJK))
-        sum_terms(2,INDEX_IJK) = - (fac1*newtempy1(INDEX_IJK) + fac2*newtempy2(INDEX_IJK) + fac3*newtempy3(INDEX_IJK))
-        sum_terms(3,INDEX_IJK) = - (fac1*newtempz1(INDEX_IJK) + fac2*newtempz2(INDEX_IJK) + fac3*newtempz3(INDEX_IJK))
+        sum_terms(INDEX_IJK,1) = - (fac1*newtempx1(INDEX_IJK) + fac2*newtempx2(INDEX_IJK) + fac3*newtempx3(INDEX_IJK))
+        sum_terms(INDEX_IJK,2) = - (fac1*newtempy1(INDEX_IJK) + fac2*newtempy2(INDEX_IJK) + fac3*newtempy3(INDEX_IJK))
+        sum_terms(INDEX_IJK,3) = - (fac1*newtempz1(INDEX_IJK) + fac2*newtempz2(INDEX_IJK) + fac3*newtempz3(INDEX_IJK))
 
       ENDDO_LOOP_IJK
 
@@ -692,9 +672,9 @@
         do k=1,NGLLZ
           do j=1,NGLLY
             do i=1,NGLLX
-              sum_terms(1,INDEX_IJK) = sum_terms(1,INDEX_IJK) + rho_s_H(1,INDEX_IJK)
-              sum_terms(2,INDEX_IJK) = sum_terms(2,INDEX_IJK) + rho_s_H(2,INDEX_IJK)
-              sum_terms(3,INDEX_IJK) = sum_terms(3,INDEX_IJK) + rho_s_H(3,INDEX_IJK)
+              sum_terms(INDEX_IJK,1) = sum_terms(INDEX_IJK,1) + rho_s_H(INDEX_IJK,1)
+              sum_terms(INDEX_IJK,2) = sum_terms(INDEX_IJK,2) + rho_s_H(INDEX_IJK,2)
+              sum_terms(INDEX_IJK,3) = sum_terms(INDEX_IJK,3) + rho_s_H(INDEX_IJK,3)
             enddo
           enddo
         enddo
@@ -722,9 +702,9 @@
             iglob = ibool(INDEX_IJK,ispec)
             ! do NOT use array syntax ":" for the three statements below
             ! otherwise most compilers will not be able to vectorize the outer loop
-            accel_inner_core(1,iglob) = accel_inner_core(1,iglob) + sum_terms(1,INDEX_IJK)
-            accel_inner_core(2,iglob) = accel_inner_core(2,iglob) + sum_terms(2,INDEX_IJK)
-            accel_inner_core(3,iglob) = accel_inner_core(3,iglob) + sum_terms(3,INDEX_IJK)
+            accel_inner_core(1,iglob) = accel_inner_core(1,iglob) + sum_terms(INDEX_IJK,1)
+            accel_inner_core(2,iglob) = accel_inner_core(2,iglob) + sum_terms(INDEX_IJK,2)
+            accel_inner_core(3,iglob) = accel_inner_core(3,iglob) + sum_terms(INDEX_IJK,3)
 
 #ifdef FORCE_VECTORIZATION
       enddo
@@ -755,8 +735,6 @@
                                                    R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk, &
                                                    ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec,factor_common, &
                                                    muvstore, &
-!                                                  epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-!                                                  epsilondev_xz,epsilondev_yz, &
                                                    epsilondev_loc, &
                                                    deltat)
         else
@@ -772,16 +750,165 @@
 
       ! save deviatoric strain for Runge-Kutta scheme
       if(COMPUTE_AND_STORE_STRAIN) then
-        epsilondev_xx(:,:,:,ispec) = epsilondev_loc(1,:,:,:)
-        epsilondev_yy(:,:,:,ispec) = epsilondev_loc(2,:,:,:)
-        epsilondev_xy(:,:,:,ispec) = epsilondev_loc(3,:,:,:)
-        epsilondev_xz(:,:,:,ispec) = epsilondev_loc(4,:,:,:)
-        epsilondev_yz(:,:,:,ispec) = epsilondev_loc(5,:,:,:)
+        epsilondev_xx(:,:,:,ispec) = epsilondev_loc(:,:,:,1)
+        epsilondev_yy(:,:,:,ispec) = epsilondev_loc(:,:,:,2)
+        epsilondev_xy(:,:,:,ispec) = epsilondev_loc(:,:,:,3)
+        epsilondev_xz(:,:,:,ispec) = epsilondev_loc(:,:,:,4)
+        epsilondev_yz(:,:,:,ispec) = epsilondev_loc(:,:,:,5)
       endif
 
     endif ! end of test to exclude fictitious elements in central cube
 
   enddo ! of spectral element loop
 
+  contains
+
+!--------------------------------------------------------------------------------------------
+!
+! matrix-matrix multiplications
+!
+! subroutines adapted from Deville, Fischer and Mund, High-order methods
+! for incompressible fluid flow, Cambridge University Press (2002),
+! pages 386 and 389 and Figure 8.3.1
+!
+!--------------------------------------------------------------------------------------------
+!
+! note: the matrix-matrix multiplications are used for very small matrices ( 5 x 5 x 5 elements);
+!       thus, calling external optimized libraries for these multiplications are in general slower
+!
+! please leave the routines here to help compilers inlining the code
+
+  subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5) :: A
+  real(kind=CUSTOM_REAL),dimension(5,n3) :: B1,B2,B3
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A(i,1) * B1(1,j) &
+               + A(i,2) * B1(2,j) &
+               + A(i,3) * B1(3,j) &
+               + A(i,4) * B1(4,j) &
+               + A(i,5) * B1(5,j)
+
+      C2(i,j) =  A(i,1) * B2(1,j) &
+               + A(i,2) * B2(2,j) &
+               + A(i,3) * B2(3,j) &
+               + A(i,4) * B2(4,j) &
+               + A(i,5) * B2(5,j)
+
+      C3(i,j) =  A(i,1) * B3(1,j) &
+               + A(i,2) * B3(2,j) &
+               + A(i,3) * B3(3,j) &
+               + A(i,4) * B3(4,j) &
+               + A(i,5) * B3(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_singleA
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(5,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C1(i,j) =  A1(i,1) * B(1,j) &
+               + A1(i,2) * B(2,j) &
+               + A1(i,3) * B(3,j) &
+               + A1(i,4) * B(4,j) &
+               + A1(i,5) * B(5,j)
+
+      C2(i,j) =  A2(i,1) * B(1,j) &
+               + A2(i,2) * B(2,j) &
+               + A2(i,3) * B(3,j) &
+               + A2(i,4) * B(4,j) &
+               + A2(i,5) * B(5,j)
+
+      C3(i,j) =  A3(i,1) * B(1,j) &
+               + A3(i,2) * B(2,j) &
+               + A3(i,3) * B(3,j) &
+               + A3(i,4) * B(4,j) &
+               + A3(i,5) * B(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_singleB
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3)
+
+! 3 different arrays for x/y/z-components, 3-dimensional arrays (5,5,5), same B matrix for all 3 component arrays
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n2,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5,n3) :: A1,A2,A3
+  real(kind=CUSTOM_REAL),dimension(5,n2) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n2,n3) :: C1,C2,C3
+
+  ! local parameters
+  integer :: i,j,k
+
+  ! matrix-matrix multiplication
+  do j=1,n2
+    do i=1,n1
+      ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+      do k=1,n3
+        C1(i,j,k) =  A1(i,1,k) * B(1,j) &
+                   + A1(i,2,k) * B(2,j) &
+                   + A1(i,3,k) * B(3,j) &
+                   + A1(i,4,k) * B(4,j) &
+                   + A1(i,5,k) * B(5,j)
+
+        C2(i,j,k) =  A2(i,1,k) * B(1,j) &
+                   + A2(i,2,k) * B(2,j) &
+                   + A2(i,3,k) * B(3,j) &
+                   + A2(i,4,k) * B(4,j) &
+                   + A2(i,5,k) * B(5,j)
+
+        C3(i,j,k) =  A3(i,1,k) * B(1,j) &
+                   + A3(i,2,k) * B(2,j) &
+                   + A3(i,3,k) * B(3,j) &
+                   + A3(i,4,k) * B(4,j) &
+                   + A3(i,5,k) * B(5,j)
+      enddo
+    enddo
+  enddo
+
+  end subroutine mxm5_3comp_3dmat_singleB
+
   end subroutine compute_forces_inner_core_Dev
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_noDev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_noDev.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_noDev.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -79,11 +79,12 @@
 
   ! variable sized array variables
   integer :: vnspec
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,N_SLS,vnspec) :: factor_common
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATT) :: R_xx,R_yy,R_xy,R_xz,R_yz
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATT) :: &
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_ATT) :: R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_ATT) :: &
     R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: &
@@ -94,7 +95,7 @@
   logical :: phase_is_inner
 
   ! local parameters
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
@@ -114,7 +115,7 @@
   real(kind=CUSTOM_REAL) fac1,fac2,fac3
   real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
   real(kind=CUSTOM_REAL) kappal
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sum_terms
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: sum_terms
 
   real(kind=CUSTOM_REAL) minus_sum_beta
   real(kind=CUSTOM_REAL) c11l,c33l,c12l,c13l,c44l
@@ -134,7 +135,7 @@
   double precision cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq
   double precision theta,phi,factor,gxl,gyl,gzl,sx_l,sy_l,sz_l
   double precision Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM) :: rho_s_H
   real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
   integer :: int_radius
 
@@ -250,11 +251,11 @@
               else
                 epsilon_trace_over_3(i,j,k,ispec) = templ
               endif
-              epsilondev_loc(1,i,j,k) = duxdxl - templ
-              epsilondev_loc(2,i,j,k) = duydyl - templ
-              epsilondev_loc(3,i,j,k) = 0.5 * duxdyl_plus_duydxl
-              epsilondev_loc(4,i,j,k) = 0.5 * duzdxl_plus_duxdzl
-              epsilondev_loc(5,i,j,k) = 0.5 * duzdyl_plus_duydzl
+              epsilondev_loc(i,j,k,1) = duxdxl - templ
+              epsilondev_loc(i,j,k,2) = duydyl - templ
+              epsilondev_loc(i,j,k,3) = 0.5 * duxdyl_plus_duydxl
+              epsilondev_loc(i,j,k,4) = 0.5 * duzdxl_plus_duxdzl
+              epsilondev_loc(i,j,k,5) = 0.5 * duzdyl_plus_duydzl
             endif
 
             if(ANISOTROPIC_INNER_CORE_VAL) then
@@ -334,14 +335,14 @@
             ! subtract memory variables if attenuation
             if( ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL ) then
               do i_SLS = 1,N_SLS
-                R_xx_val = R_xx(i_SLS,i,j,k,ispec)
-                R_yy_val = R_yy(i_SLS,i,j,k,ispec)
+                R_xx_val = R_xx(i,j,k,i_SLS,ispec)
+                R_yy_val = R_yy(i,j,k,i_SLS,ispec)
                 sigma_xx = sigma_xx - R_xx_val
                 sigma_yy = sigma_yy - R_yy_val
                 sigma_zz = sigma_zz + R_xx_val + R_yy_val
-                sigma_xy = sigma_xy - R_xy(i_SLS,i,j,k,ispec)
-                sigma_xz = sigma_xz - R_xz(i_SLS,i,j,k,ispec)
-                sigma_yz = sigma_yz - R_yz(i_SLS,i,j,k,ispec)
+                sigma_xy = sigma_xy - R_xy(i,j,k,i_SLS,ispec)
+                sigma_xz = sigma_xz - R_xz(i,j,k,i_SLS,ispec)
+                sigma_yz = sigma_yz - R_yz(i,j,k,i_SLS,ispec)
               enddo
             endif
 
@@ -426,9 +427,9 @@
 
                 ! precompute vector
                 factor = dble(jacobianl) * wgll_cube(i,j,k)
-                rho_s_H(1,i,j,k) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
-                rho_s_H(2,i,j,k) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
-                rho_s_H(3,i,j,k) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
+                rho_s_H(i,j,k,1) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
+                rho_s_H(i,j,k,2) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
+                rho_s_H(i,j,k,3) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
 
               else
 
@@ -453,9 +454,9 @@
 
                 ! precompute vector
                 factor = jacobianl * wgll_cube(i,j,k)
-                rho_s_H(1,i,j,k) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-                rho_s_H(2,i,j,k) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-                rho_s_H(3,i,j,k) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+                rho_s_H(i,j,k,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+                rho_s_H(i,j,k,2) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+                rho_s_H(i,j,k,3) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
 
               endif
 
@@ -519,12 +520,15 @@
             fac2 = wgllwgll_xz(i,k)
             fac3 = wgllwgll_xy(i,j)
 
-            sum_terms(1,i,j,k) = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
-            sum_terms(2,i,j,k) = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
-            sum_terms(3,i,j,k) = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
+            sum_terms(i,j,k,1) = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
+            sum_terms(i,j,k,2) = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
+            sum_terms(i,j,k,3) = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
 
-            if(GRAVITY_VAL) sum_terms(:,i,j,k) = sum_terms(:,i,j,k) + rho_s_H(:,i,j,k)
-
+            if(GRAVITY_VAL) then
+              sum_terms(i,j,k,1) = sum_terms(i,j,k,1) + rho_s_H(i,j,k,1)
+              sum_terms(i,j,k,2) = sum_terms(i,j,k,2) + rho_s_H(i,j,k,2)
+              sum_terms(i,j,k,3) = sum_terms(i,j,k,3) + rho_s_H(i,j,k,3)
+            endif
           enddo
         enddo
       enddo
@@ -534,7 +538,9 @@
         do j=1,NGLLY
           do i=1,NGLLX
             iglob = ibool(i,j,k,ispec)
-            accel_inner_core(:,iglob) = accel_inner_core(:,iglob) + sum_terms(:,i,j,k)
+            accel_inner_core(1,iglob) = accel_inner_core(1,iglob) + sum_terms(i,j,k,1)
+            accel_inner_core(2,iglob) = accel_inner_core(2,iglob) + sum_terms(i,j,k,2)
+            accel_inner_core(3,iglob) = accel_inner_core(3,iglob) + sum_terms(i,j,k,3)
           enddo
         enddo
       enddo
@@ -561,8 +567,6 @@
                                                    R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk, &
                                                    ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec,factor_common, &
                                                    muvstore, &
-!                                                  epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-!                                                  epsilondev_xz,epsilondev_yz, &
                                                    epsilondev_loc, &
                                                    deltat)
         else
@@ -581,11 +585,11 @@
         do k=1,NGLLZ
           do j=1,NGLLY
             do i=1,NGLLX
-              epsilondev_xx(i,j,k,ispec) = epsilondev_loc(1,i,j,k)
-              epsilondev_yy(i,j,k,ispec) = epsilondev_loc(2,i,j,k)
-              epsilondev_xy(i,j,k,ispec) = epsilondev_loc(3,i,j,k)
-              epsilondev_xz(i,j,k,ispec) = epsilondev_loc(4,i,j,k)
-              epsilondev_yz(i,j,k,ispec) = epsilondev_loc(5,i,j,k)
+              epsilondev_xx(i,j,k,ispec) = epsilondev_loc(i,j,k,1)
+              epsilondev_yy(i,j,k,ispec) = epsilondev_loc(i,j,k,2)
+              epsilondev_xy(i,j,k,ispec) = epsilondev_loc(i,j,k,3)
+              epsilondev_xz(i,j,k,ispec) = epsilondev_loc(i,j,k,4)
+              epsilondev_yz(i,j,k,ispec) = epsilondev_loc(i,j,k,5)
             enddo
           enddo
         enddo

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_outer_core_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_outer_core_Dev.F90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_outer_core_Dev.F90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -83,8 +83,6 @@
   logical :: phase_is_inner
 
   ! local parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1,tempx2,tempx3
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: newtempx1,newtempx2,newtempx3
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: sum_terms
 
   ! for gravity
@@ -92,45 +90,34 @@
   double precision :: radius,theta,phi,gxl,gyl,gzl
   double precision :: cos_theta,sin_theta,cos_phi,sin_phi
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: gravity_term
+
+  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: temp_gxl,temp_gyl,temp_gzl
+
   ! for the Euler scheme for rotation
   real(kind=CUSTOM_REAL) :: two_omega_deltat,cos_two_omega_t,sin_two_omega_t,A_rotation,B_rotation, &
        ux_rotation,uy_rotation,dpotentialdx_with_rot,dpotentialdy_with_rot
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: source_euler_A,source_euler_B
 
   integer :: ispec,iglob
-  integer :: i,j,k
   real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
   real(kind=CUSTOM_REAL) :: dpotentialdxl,dpotentialdyl,dpotentialdzl
 
-  ! manually inline the calls to the Deville et al. (2002) routines
+  ! Deville
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1,tempx2,tempx3
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: newtempx1,newtempx2,newtempx3
 
-  real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: E1_m1_m2_5points
 
-  equivalence(dummyx_loc,B1_m1_m2_5points)
-  equivalence(tempx1,C1_m1_m2_5points)
-  equivalence(newtempx1,E1_m1_m2_5points)
-
-  real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: A1_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: C1_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: E1_mxm_m2_m1_5points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_5points)
-  equivalence(tempx3,C1_mxm_m2_m1_5points)
-  equivalence(newtempx3,E1_mxm_m2_m1_5points)
-
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: temp_gxl,temp_gyl,temp_gzl
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     displ_times_grad_x_ln_rho,displ_times_grad_y_ln_rho,displ_times_grad_z_ln_rho
 
-! integer :: computed_elements
   integer :: num_elements,ispec_p
   integer :: iphase
 
 #ifdef FORCE_VECTORIZATION
   integer :: ijk
+#else
+  integer :: i,j,k
 #endif
 
 ! ****************************************************
@@ -198,38 +185,14 @@
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
-    do j=1,m2
-      do i=1,m1
-        C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
-                                hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
-                                hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
-                                hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
-                                hprime_xx(i,5)*B1_m1_m2_5points(5,j)
-      enddo
-    enddo
 
-    do k = 1,NGLLX
-      do j=1,m1
-        do i=1,m1
-          tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                          dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                          dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                          dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                          dummyx_loc(i,5,k)*hprime_xxT(5,j)
-        enddo
-      enddo
-    enddo
+    ! computes 1. matrix multiplication for tempx1,..
+    call mxm5_single(hprime_xx,m1,dummyx_loc,tempx1,m2)
+    ! computes 2. matrix multiplication for tempx2,..
+    call mxm5_3dmat_single(dummyx_loc,m1,hprime_xxT,m1,tempx2,NGLLX)
+    ! computes 3. matrix multiplication for tempx1,..
+    call mxm5_single(dummyx_loc,m2,hprime_xxT,tempx3,m1)
 
-    do j=1,m1
-      do i=1,m2
-        C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                    A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                    A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                    A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                    A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-      enddo
-    enddo
-
     DO_LOOP_IJK
 
       ! get derivatives of velocity potential with respect to x, y and z
@@ -373,38 +336,14 @@
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
-    do j=1,m2
-      do i=1,m1
-        E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C1_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C1_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C1_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C1_m1_m2_5points(5,j)
-      enddo
-    enddo
 
-    do k = 1,NGLLX
-      do j=1,m1
-        do i=1,m1
-          newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempx2(i,5,k)*hprimewgll_xx(5,j)
-        enddo
-      enddo
-    enddo
+    ! computes 1. matrix multiplication for newtempx1,..
+    call mxm5_single(hprimewgll_xxT,m1,tempx1,newtempx1,m2)
+    ! computes 2. matrix multiplication for tempx2,..
+    call mxm5_3dmat_single(tempx2,m1,hprimewgll_xx,m1,newtempx2,NGLLX)
+    ! computes 3. matrix multiplication for newtempx3,..
+    call mxm5_single(tempx3,m2,hprimewgll_xx,newtempx3,m1)
 
-    do j=1,m1
-      do i=1,m2
-        E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-      enddo
-    enddo
-
     ! sum contributions from each element to the global mesh and add gravity term
 
     DO_LOOP_IJK
@@ -488,5 +427,86 @@
 
   enddo   ! spectral element loop
 
+  contains
+
+!--------------------------------------------------------------------------------------------
+!
+! matrix-matrix multiplications
+!
+! subroutines adapted from Deville, Fischer and Mund, High-order methods
+! for incompressible fluid flow, Cambridge University Press (2002),
+! pages 386 and 389 and Figure 8.3.1
+!
+!--------------------------------------------------------------------------------------------
+!
+! note: the matrix-matrix multiplications are used for very small matrices ( 5 x 5 x 5 elements);
+!       thus, calling external optimized libraries for these multiplications are in general slower
+!
+! please leave the routines here to help compilers inlining the code
+
+  subroutine mxm5_single(A,n1,B,C,n3)
+
+! 2-dimensional arrays (25,5)/(5,25)
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5) :: A
+  real(kind=CUSTOM_REAL),dimension(5,n3) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n3) :: C
+
+  ! local parameters
+  integer :: i,j
+
+  ! matrix-matrix multiplication
+  do j=1,n3
+    do i=1,n1
+      C(i,j) =  A(i,1) * B(1,j) &
+              + A(i,2) * B(2,j) &
+              + A(i,3) * B(3,j) &
+              + A(i,4) * B(4,j) &
+              + A(i,5) * B(5,j)
+    enddo
+  enddo
+
+  end subroutine mxm5_single
+
+
+!--------------------------------------------------------------------------------------------
+
+  subroutine mxm5_3dmat_single(A,n1,B,n2,C,n3)
+
+! 3-dimensional arrays (5,5,5) for A and C
+
+  use constants_solver,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: n1,n2,n3
+  real(kind=CUSTOM_REAL),dimension(n1,5,n3) :: A
+  real(kind=CUSTOM_REAL),dimension(5,n2) :: B
+  real(kind=CUSTOM_REAL),dimension(n1,n2,n3) :: C
+
+  ! local parameters
+  integer :: i,j,k
+
+  ! matrix-matrix multiplication
+  do j=1,n2
+    do i=1,n1
+      ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+      do k=1,n3
+        C(i,j,k) =  A(i,1,k) * B(1,j) &
+                  + A(i,2,k) * B(2,j) &
+                  + A(i,3,k) * B(3,j) &
+                  + A(i,4,k) * B(4,j) &
+                  + A(i,5,k) * B(5,j)
+      enddo
+    enddo
+  enddo
+
+  end subroutine mxm5_3dmat_single
+
   end subroutine compute_forces_outer_core_Dev
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_kernels.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_kernels.F90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_kernels.F90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -74,7 +74,7 @@
   real(kind=CUSTOM_REAL), dimension(5) :: epsilondev_loc
   real(kind=CUSTOM_REAL), dimension(5) :: b_epsilondev_loc
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: b_epsilondev_loc_matrix
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: b_epsilondev_loc_matrix
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: b_eps_trace_over_3_loc_matrix
 
   integer :: i,j,k,ispec,iglob
@@ -108,11 +108,11 @@
       else
         ! backward/reconstructed strain arrays
         b_eps_trace_over_3_loc_matrix(:,:,:) = b_eps_trace_over_3_crust_mantle(:,:,:,ispec)
-        b_epsilondev_loc_matrix(1,:,:,:) = b_epsilondev_xx_crust_mantle(:,:,:,ispec)
-        b_epsilondev_loc_matrix(2,:,:,:) = b_epsilondev_yy_crust_mantle(:,:,:,ispec)
-        b_epsilondev_loc_matrix(3,:,:,:) = b_epsilondev_xy_crust_mantle(:,:,:,ispec)
-        b_epsilondev_loc_matrix(4,:,:,:) = b_epsilondev_xz_crust_mantle(:,:,:,ispec)
-        b_epsilondev_loc_matrix(5,:,:,:) = b_epsilondev_yz_crust_mantle(:,:,:,ispec)
+        b_epsilondev_loc_matrix(:,:,:,1) = b_epsilondev_xx_crust_mantle(:,:,:,ispec)
+        b_epsilondev_loc_matrix(:,:,:,2) = b_epsilondev_yy_crust_mantle(:,:,:,ispec)
+        b_epsilondev_loc_matrix(:,:,:,3) = b_epsilondev_xy_crust_mantle(:,:,:,ispec)
+        b_epsilondev_loc_matrix(:,:,:,4) = b_epsilondev_xz_crust_mantle(:,:,:,ispec)
+        b_epsilondev_loc_matrix(:,:,:,5) = b_epsilondev_yz_crust_mantle(:,:,:,ispec)
       endif
 
       ! For anisotropic kernels
@@ -138,8 +138,8 @@
               !                         behave better for smoother wavefields, thus containing less numerical artefacts.
               rho_kl_crust_mantle(i,j,k,ispec) =  rho_kl_crust_mantle(i,j,k,ispec) &
                  + deltat * (accel_crust_mantle(1,iglob) * b_displ_crust_mantle(1,iglob) &
-                 + accel_crust_mantle(2,iglob) * b_displ_crust_mantle(2,iglob) &
-                 + accel_crust_mantle(3,iglob) * b_displ_crust_mantle(3,iglob) )
+                           + accel_crust_mantle(2,iglob) * b_displ_crust_mantle(2,iglob) &
+                           + accel_crust_mantle(3,iglob) * b_displ_crust_mantle(3,iglob) )
 
               ! fully anisotropic kernel
               ! temporary arrays
@@ -149,7 +149,11 @@
               epsilondev_loc(4) = epsilondev_xz_crust_mantle(i,j,k,ispec)
               epsilondev_loc(5) = epsilondev_yz_crust_mantle(i,j,k,ispec)
 
-              b_epsilondev_loc(:) = b_epsilondev_loc_matrix(:,i,j,k)
+              b_epsilondev_loc(1) = b_epsilondev_loc_matrix(i,j,k,1)
+              b_epsilondev_loc(2) = b_epsilondev_loc_matrix(i,j,k,2)
+              b_epsilondev_loc(3) = b_epsilondev_loc_matrix(i,j,k,3)
+              b_epsilondev_loc(4) = b_epsilondev_loc_matrix(i,j,k,4)
+              b_epsilondev_loc(5) = b_epsilondev_loc_matrix(i,j,k,5)
 
               call compute_strain_product(prod,eps_trace_over_3_crust_mantle(i,j,k,ispec),epsilondev_loc, &
                                          b_eps_trace_over_3_loc_matrix(i,j,k),b_epsilondev_loc)
@@ -183,8 +187,8 @@
               !                         behave better for smoother wavefields, thus containing less numerical artefacts.
               rho_kl_crust_mantle(i,j,k,ispec) =  rho_kl_crust_mantle(i,j,k,ispec) &
                  + deltat * (accel_crust_mantle(1,iglob) * b_displ_crust_mantle(1,iglob) &
-                 + accel_crust_mantle(2,iglob) * b_displ_crust_mantle(2,iglob) &
-                 + accel_crust_mantle(3,iglob) * b_displ_crust_mantle(3,iglob) )
+                           + accel_crust_mantle(2,iglob) * b_displ_crust_mantle(2,iglob) &
+                           + accel_crust_mantle(3,iglob) * b_displ_crust_mantle(3,iglob) )
 
               ! isotropic kernels
               ! temporary arrays
@@ -194,7 +198,11 @@
               epsilondev_loc(4) = epsilondev_xz_crust_mantle(i,j,k,ispec)
               epsilondev_loc(5) = epsilondev_yz_crust_mantle(i,j,k,ispec)
 
-              b_epsilondev_loc(:) = b_epsilondev_loc_matrix(:,i,j,k)
+              b_epsilondev_loc(1) = b_epsilondev_loc_matrix(i,j,k,1)
+              b_epsilondev_loc(2) = b_epsilondev_loc_matrix(i,j,k,2)
+              b_epsilondev_loc(3) = b_epsilondev_loc_matrix(i,j,k,3)
+              b_epsilondev_loc(4) = b_epsilondev_loc_matrix(i,j,k,4)
+              b_epsilondev_loc(5) = b_epsilondev_loc_matrix(i,j,k,5)
 
               ! kernel for shear modulus, see e.g. Tromp et al. (2005), equation (17)
               ! note: multiplication with 2*mu(x) will be done after the time loop
@@ -205,7 +213,6 @@
                   + 2 * (epsilondev_loc(3)*b_epsilondev_loc(3) + epsilondev_loc(4)*b_epsilondev_loc(4) + &
                   epsilondev_loc(5)*b_epsilondev_loc(5)) )
 
-
               ! kernel for bulk modulus, see e.g. Tromp et al. (2005), equation (18)
               ! note: multiplication with kappa(x) will be done after the time loop
               alpha_kl_crust_mantle(i,j,k,ispec) = alpha_kl_crust_mantle(i,j,k,ispec) &
@@ -251,106 +258,153 @@
   real(kind=CUSTOM_REAL) :: tempz1l,tempz2l,tempz3l
   real(kind=CUSTOM_REAL), dimension(5) :: b_epsilondev_loc
   real(kind=CUSTOM_REAL), dimension(5) :: epsilondev_loc
-  real(kind=CUSTOM_REAL), dimension(3) :: vector_accel
   real(kind=CUSTOM_REAL) :: div_displ,b_div_displ
+  real(kind=CUSTOM_REAL) :: gradx,grady,gradz
   integer :: i,j,k,l,ispec,iglob
+  logical,dimension(NGLOB_OUTER_CORE) :: mask_ibool
 
   ! outer_core -- compute the actual displacement and acceleration (NDIM,NGLOBMAX_OUTER_CORE)
 
   if( .not. GPU_MODE ) then
     ! on CPU
+
+    ! pre-calculates gradients in outer core on CPU
+    mask_ibool(:) = .false.
     do ispec = 1, NSPEC_OUTER_CORE
       do k = 1, NGLLZ
         do j = 1, NGLLY
           do i = 1, NGLLX
+
+            ! global index
             iglob = ibool_outer_core(i,j,k,ispec)
 
-            xixl = xix_outer_core(i,j,k,ispec)
-            xiyl = xiy_outer_core(i,j,k,ispec)
-            xizl = xiz_outer_core(i,j,k,ispec)
-            etaxl = etax_outer_core(i,j,k,ispec)
-            etayl = etay_outer_core(i,j,k,ispec)
-            etazl = etaz_outer_core(i,j,k,ispec)
-            gammaxl = gammax_outer_core(i,j,k,ispec)
-            gammayl = gammay_outer_core(i,j,k,ispec)
-            gammazl = gammaz_outer_core(i,j,k,ispec)
+            ! only calculate the gradients once for shared nodes
+            if( .not. mask_ibool(iglob) ) then
 
-            ! calculates gradient grad(b_displ)
-            tempx1l = 0._CUSTOM_REAL
-            tempx2l = 0._CUSTOM_REAL
-            tempx3l = 0._CUSTOM_REAL
-            do l=1,NGLLX
-              tempx1l = tempx1l + b_displ_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
-            enddo
-            do l=1,NGLLY
-              tempx2l = tempx2l + b_displ_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
-            enddo
-            do l=1,NGLLZ
-              tempx3l = tempx3l +  b_displ_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
-            enddo
-            b_vector_displ_outer_core(1,iglob) = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-            b_vector_displ_outer_core(2,iglob) = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-            b_vector_displ_outer_core(3,iglob) = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+              ! masks this global point
+              mask_ibool(iglob) = .true.
 
-            ! calculates gradient grad(accel)
-            tempx1l = 0._CUSTOM_REAL
-            tempx2l = 0._CUSTOM_REAL
-            tempx3l = 0._CUSTOM_REAL
-            do l=1,NGLLX
-              tempx1l = tempx1l + accel_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
-            enddo
-            do l=1,NGLLY
-              tempx2l = tempx2l + accel_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
-            enddo
-            do l=1,NGLLZ
-              tempx3l = tempx3l + accel_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
-            enddo
-            vector_accel(1) = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-            vector_accel(2) = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-            vector_accel(3) = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+              xixl = xix_outer_core(i,j,k,ispec)
+              xiyl = xiy_outer_core(i,j,k,ispec)
+              xizl = xiz_outer_core(i,j,k,ispec)
+              etaxl = etax_outer_core(i,j,k,ispec)
+              etayl = etay_outer_core(i,j,k,ispec)
+              etazl = etaz_outer_core(i,j,k,ispec)
+              gammaxl = gammax_outer_core(i,j,k,ispec)
+              gammayl = gammay_outer_core(i,j,k,ispec)
+              gammazl = gammaz_outer_core(i,j,k,ispec)
 
-            ! density kernel
-!            rho_kl_outer_core(i,j,k,ispec) = rho_kl_outer_core(i,j,k,ispec) &
-!              + deltat * dot_product(vector_accel(:), b_vector_displ_outer_core(:,iglob))
+              ! calculates gradient grad(b_displ)
+              tempx1l = 0._CUSTOM_REAL
+              tempx2l = 0._CUSTOM_REAL
+              tempx3l = 0._CUSTOM_REAL
+              do l=1,NGLLX
+                tempx1l = tempx1l + b_displ_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
+              enddo
+              do l=1,NGLLY
+                tempx2l = tempx2l + b_displ_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
+              enddo
+              do l=1,NGLLZ
+                tempx3l = tempx3l + b_displ_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
+              enddo
+              gradx = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+              grady = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+              gradz = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
 
-!! DK DK July 2013: replaces dot_product() with an unrolled expression, otherwise most compilers
-!! DK DK July 2013: will try to vectorize this rather than the outer loop, resulting in a much slower code
-            rho_kl_outer_core(i,j,k,ispec) = rho_kl_outer_core(i,j,k,ispec) &
-              + deltat * (  vector_accel(1) * b_vector_displ_outer_core(1,iglob) &
-                          + vector_accel(2) * b_vector_displ_outer_core(2,iglob) &
-                          + vector_accel(3) * b_vector_displ_outer_core(3,iglob) )
+              ! assigns gradient field on global points
+              b_vector_displ_outer_core(1,iglob) = gradx
+              b_vector_displ_outer_core(2,iglob) = grady
+              b_vector_displ_outer_core(3,iglob) = gradz
 
-            ! bulk modulus kernel
-            kappal = rhostore_outer_core(i,j,k,ispec)/kappavstore_outer_core(i,j,k,ispec)
-
-            div_displ =  kappal * accel_outer_core(iglob)
-            b_div_displ =  kappal * b_accel_outer_core(iglob)
-
-            alpha_kl_outer_core(i,j,k,ispec) = alpha_kl_outer_core(i,j,k,ispec) &
-               + deltat * div_displ * b_div_displ
-
-            ! calculates gradient grad(displ) (also needed for boundary kernels)
-            if(SAVE_BOUNDARY_MESH .or. deviatoric_outercore) then
+              ! calculates gradient grad(accel)
               tempx1l = 0._CUSTOM_REAL
               tempx2l = 0._CUSTOM_REAL
               tempx3l = 0._CUSTOM_REAL
               do l=1,NGLLX
-                tempx1l = tempx1l + displ_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
+                tempx1l = tempx1l + accel_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
               enddo
               do l=1,NGLLY
-                tempx2l = tempx2l + displ_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
+                tempx2l = tempx2l + accel_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
               enddo
               do l=1,NGLLZ
-                tempx3l = tempx3l + displ_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
+                tempx3l = tempx3l + accel_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
               enddo
-              vector_displ_outer_core(1,iglob) = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-              vector_displ_outer_core(2,iglob) = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-              vector_displ_outer_core(3,iglob) = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
-            endif
+              gradx = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+              grady = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+              gradz = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
 
-            !deviatoric kernel check
-            if( deviatoric_outercore ) then
+              ! assigns gradient field on global points
+              vector_accel_outer_core(1,iglob) = gradx
+              vector_accel_outer_core(2,iglob) = grady
+              vector_accel_outer_core(3,iglob) = gradz
 
+              ! calculates gradient grad(displ) (also needed for boundary kernels)
+              if( SAVE_BOUNDARY_MESH .or. deviatoric_outercore ) then
+                tempx1l = 0._CUSTOM_REAL
+                tempx2l = 0._CUSTOM_REAL
+                tempx3l = 0._CUSTOM_REAL
+                do l=1,NGLLX
+                  tempx1l = tempx1l + displ_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
+                enddo
+                do l=1,NGLLY
+                  tempx2l = tempx2l + displ_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
+                enddo
+                do l=1,NGLLZ
+                  tempx3l = tempx3l + displ_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
+                enddo
+                gradx = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+                grady = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+                gradz = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+
+                vector_displ_outer_core(1,iglob) = gradx
+                vector_displ_outer_core(2,iglob) = grady
+                vector_displ_outer_core(3,iglob) = gradz
+              endif
+
+            endif ! mask_ibool
+
+          enddo
+        enddo
+      enddo
+    enddo
+
+    ! acoustic kernels
+    do ispec = 1, NSPEC_OUTER_CORE
+      do k = 1, NGLLZ
+        do j = 1, NGLLY
+          do i = 1, NGLLX
+
+            ! global index
+            iglob = ibool_outer_core(i,j,k,ispec)
+
+            gradx = vector_accel_outer_core(1,iglob) * b_vector_displ_outer_core(1,iglob)
+            grady = vector_accel_outer_core(2,iglob) * b_vector_displ_outer_core(2,iglob)
+            gradz = vector_accel_outer_core(3,iglob) * b_vector_displ_outer_core(3,iglob)
+
+            ! density kernel
+            ! note: we replace dot_product() with an unrolled expression, otherwise most compilers
+            !       will try to vectorize this rather than the outer loop, resulting in a much slower code
+            rho_kl_outer_core(i,j,k,ispec) = rho_kl_outer_core(i,j,k,ispec) + deltat * (gradx + grady + gradz)
+
+            ! bulk modulus kernel
+            kappal = rhostore_outer_core(i,j,k,ispec)/kappavstore_outer_core(i,j,k,ispec)
+            div_displ =  kappal * accel_outer_core(iglob)
+            b_div_displ =  kappal * b_accel_outer_core(iglob)
+
+            alpha_kl_outer_core(i,j,k,ispec) = alpha_kl_outer_core(i,j,k,ispec) + deltat * div_displ * b_div_displ
+
+          enddo
+        enddo
+      enddo
+    enddo
+
+    !deviatoric kernel check
+    if( deviatoric_outercore ) then
+
+      do ispec = 1, NSPEC_OUTER_CORE
+        do k = 1, NGLLZ
+          do j = 1, NGLLY
+            do i = 1, NGLLX
               tempx1l = 0._CUSTOM_REAL
               tempx2l = 0._CUSTOM_REAL
               tempx3l = 0._CUSTOM_REAL
@@ -469,13 +523,13 @@
                  + 2 * (epsilondev_loc(3)*b_epsilondev_loc(3) + epsilondev_loc(4)*b_epsilondev_loc(4) + &
                   epsilondev_loc(5)*b_epsilondev_loc(5)) )
 
-            endif !deviatoric kernel check
-
+            enddo
           enddo
         enddo
       enddo
-    enddo
 
+    endif !deviatoric kernel check
+
   else
     ! updates kernel contribution on GPU
     if( deviatoric_outercore ) call exit_mpi(myrank,'deviatoric kernel on GPU not supported yet')
@@ -506,7 +560,7 @@
   real(kind=CUSTOM_REAL), dimension(5) :: b_epsilondev_loc
   real(kind=CUSTOM_REAL), dimension(5) :: epsilondev_loc
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: b_epsilondev_loc_matrix
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: b_epsilondev_loc_matrix
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: b_eps_trace_over_3_loc_matrix
 
   integer :: i,j,k,ispec,iglob
@@ -538,11 +592,11 @@
       else
         ! backward/reconstructed strain arrays
         b_eps_trace_over_3_loc_matrix(:,:,:) = b_eps_trace_over_3_inner_core(:,:,:,ispec)
-        b_epsilondev_loc_matrix(1,:,:,:) = b_epsilondev_xx_inner_core(:,:,:,ispec)
-        b_epsilondev_loc_matrix(2,:,:,:) = b_epsilondev_yy_inner_core(:,:,:,ispec)
-        b_epsilondev_loc_matrix(3,:,:,:) = b_epsilondev_xy_inner_core(:,:,:,ispec)
-        b_epsilondev_loc_matrix(4,:,:,:) = b_epsilondev_xz_inner_core(:,:,:,ispec)
-        b_epsilondev_loc_matrix(5,:,:,:) = b_epsilondev_yz_inner_core(:,:,:,ispec)
+        b_epsilondev_loc_matrix(:,:,:,1) = b_epsilondev_xx_inner_core(:,:,:,ispec)
+        b_epsilondev_loc_matrix(:,:,:,2) = b_epsilondev_yy_inner_core(:,:,:,ispec)
+        b_epsilondev_loc_matrix(:,:,:,3) = b_epsilondev_xy_inner_core(:,:,:,ispec)
+        b_epsilondev_loc_matrix(:,:,:,4) = b_epsilondev_xz_inner_core(:,:,:,ispec)
+        b_epsilondev_loc_matrix(:,:,:,5) = b_epsilondev_yz_inner_core(:,:,:,ispec)
       endif
 
       do k = 1, NGLLZ
@@ -561,7 +615,11 @@
             epsilondev_loc(4) = epsilondev_xz_inner_core(i,j,k,ispec)
             epsilondev_loc(5) = epsilondev_yz_inner_core(i,j,k,ispec)
 
-            b_epsilondev_loc(:) = b_epsilondev_loc_matrix(:,i,j,k)
+            b_epsilondev_loc(1) = b_epsilondev_loc_matrix(i,j,k,1)
+            b_epsilondev_loc(2) = b_epsilondev_loc_matrix(i,j,k,2)
+            b_epsilondev_loc(3) = b_epsilondev_loc_matrix(i,j,k,3)
+            b_epsilondev_loc(4) = b_epsilondev_loc_matrix(i,j,k,4)
+            b_epsilondev_loc(5) = b_epsilondev_loc_matrix(i,j,k,5)
 
             beta_kl_inner_core(i,j,k,ispec) =  beta_kl_inner_core(i,j,k,ispec) &
                + deltat * (epsilondev_loc(1)*b_epsilondev_loc(1) + epsilondev_loc(2)*b_epsilondev_loc(2) &

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_seismograms.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_seismograms.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -167,7 +167,7 @@
 
   ! element strain
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: eps_trace_over_3_loc_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,5) :: epsilondev_loc_crust_mantle
 
   do irec_local = 1,nrec_local
 
@@ -201,11 +201,11 @@
     else
       ! element adjoint strain
       eps_trace_over_3_loc_crust_mantle(:,:,:) = eps_trace_over_3_crust_mantle(:,:,:,ispec)
-      epsilondev_loc_crust_mantle(1,:,:,:) = epsilondev_xx_crust_mantle(:,:,:,ispec)
-      epsilondev_loc_crust_mantle(2,:,:,:) = epsilondev_yy_crust_mantle(:,:,:,ispec)
-      epsilondev_loc_crust_mantle(3,:,:,:) = epsilondev_xy_crust_mantle(:,:,:,ispec)
-      epsilondev_loc_crust_mantle(4,:,:,:) = epsilondev_xz_crust_mantle(:,:,:,ispec)
-      epsilondev_loc_crust_mantle(5,:,:,:) = epsilondev_yz_crust_mantle(:,:,:,ispec)
+      epsilondev_loc_crust_mantle(:,:,:,1) = epsilondev_xx_crust_mantle(:,:,:,ispec)
+      epsilondev_loc_crust_mantle(:,:,:,2) = epsilondev_yy_crust_mantle(:,:,:,ispec)
+      epsilondev_loc_crust_mantle(:,:,:,3) = epsilondev_xy_crust_mantle(:,:,:,ispec)
+      epsilondev_loc_crust_mantle(:,:,:,4) = epsilondev_xz_crust_mantle(:,:,:,ispec)
+      epsilondev_loc_crust_mantle(:,:,:,5) = epsilondev_yz_crust_mantle(:,:,:,ispec)
     endif
 
     ! perform the general interpolation using Lagrange polynomials
@@ -223,11 +223,11 @@
 
           eps_trace = eps_trace + dble(eps_trace_over_3_loc_crust_mantle(i,j,k))*hlagrange
 
-          dxx = dxx + dble(epsilondev_loc_crust_mantle(1,i,j,k))*hlagrange
-          dyy = dyy + dble(epsilondev_loc_crust_mantle(2,i,j,k))*hlagrange
-          dxy = dxy + dble(epsilondev_loc_crust_mantle(3,i,j,k))*hlagrange
-          dxz = dxz + dble(epsilondev_loc_crust_mantle(4,i,j,k))*hlagrange
-          dyz = dyz + dble(epsilondev_loc_crust_mantle(5,i,j,k))*hlagrange
+          dxx = dxx + dble(epsilondev_loc_crust_mantle(i,j,k,1))*hlagrange
+          dyy = dyy + dble(epsilondev_loc_crust_mantle(i,j,k,2))*hlagrange
+          dxy = dxy + dble(epsilondev_loc_crust_mantle(i,j,k,3))*hlagrange
+          dxz = dxz + dble(epsilondev_loc_crust_mantle(i,j,k,4))*hlagrange
+          dyz = dyz + dble(epsilondev_loc_crust_mantle(i,j,k,5))*hlagrange
 
           displ_s(:,i,j,k) = displ_crust_mantle(:,iglob)
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -43,14 +43,14 @@
   !       this is better, it works fine and these arrays are really huge
   !       in the crust_mantle region, thus let us not double their size
   real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec),intent(out) :: one_minus_sum_beta, scale_factor
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec),intent(out) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,N_SLS,vnspec),intent(out) :: factor_common
 
   double precision, dimension(N_SLS),intent(out) :: tau_s
 
   integer :: iregion_code
 
   ! local parameters
-  integer :: i,j,k,ispec,ier
+  integer :: i,j,k,ispec,ier,i_sls
   double precision, dimension(N_SLS) :: tau_e, fc
   double precision :: omsb, Q_mu, sf, T_c_source, scale_t
   character(len=150) :: prname
@@ -85,21 +85,32 @@
   T_c_source               = 1000.0d0 / T_c_source
   T_c_source               = T_c_source / scale_t
 
+  ! loops over elements
   do ispec = 1, vnspec
+
+    ! loops over GLL points
     do k = 1, ATT3_VAL
       do j = 1, ATT2_VAL
         do i = 1, ATT1_VAL
-          tau_e(:) = factor_common(:,i,j,k,ispec)
+
+          ! gets relaxation times for each standard linear solid
+          do i_sls = 1,N_SLS
+            tau_e(i_sls) = factor_common(i,j,k,i_sls,ispec)
+          enddo
           Q_mu     = scale_factor(i,j,k,ispec)
 
           ! Determine the factor_common and one_minus_sum_beta from tau_s and tau_e
           call get_attenuation_property_values(tau_s, tau_e, fc, omsb)
 
           if( CUSTOM_REAL == SIZE_REAL ) then
-            factor_common(:,i,j,k,ispec)    = sngl(fc(:))
+            do i_sls = 1,N_SLS
+              factor_common(i,j,k,i_sls,ispec)    = sngl(fc(i_sls))
+            enddo
             one_minus_sum_beta(i,j,k,ispec) = sngl(omsb)
           else
-            factor_common(:,i,j,k,ispec)    = fc(:)
+            do i_sls = 1,N_SLS
+              factor_common(i,j,k,i_sls,ispec)    = fc(i_sls)
+            enddo
             one_minus_sum_beta(i,j,k,ispec) = omsb
           endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -1345,19 +1345,19 @@
 
     ! attenuation memory variables
     ! crust/mantle
-    allocate(R_xx_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION), &
-             R_yy_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION), &
-             R_xy_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION), &
-             R_xz_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION), &
-             R_yz_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION), &
+    allocate(R_xx_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION), &
+             R_yy_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION), &
+             R_xy_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION), &
+             R_xz_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION), &
+             R_yz_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION), &
              stat=ier)
     if(ier /= 0) stop 'error: not enough memory to allocate array R_memory_crust_mantle_lddrk'
     ! inner core
-    allocate(R_xx_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION), &
-             R_yy_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION), &
-             R_xy_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION), &
-             R_xz_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION), &
-             R_yz_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION), &
+    allocate(R_xx_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_ATTENUATION), &
+             R_yy_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_ATTENUATION), &
+             R_xy_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_ATTENUATION), &
+             R_xz_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_ATTENUATION), &
+             R_yz_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_ATTENUATION), &
              stat=ier)
     if(ier /= 0) stop 'error: not enough memory to allocate array R_memory_inner_core_lddrk'
     if(ATTENUATION_VAL) then
@@ -1375,19 +1375,19 @@
     endif
     if( SIMULATION_TYPE == 3 ) then
       ! crust/mantle
-      allocate(b_R_xx_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_AND_ATT), &
-               b_R_yy_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_AND_ATT), &
-               b_R_xy_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_AND_ATT), &
-               b_R_xz_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_AND_ATT), &
-               b_R_yz_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_AND_ATT), &
+      allocate(b_R_xx_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_STR_AND_ATT), &
+               b_R_yy_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_STR_AND_ATT), &
+               b_R_xy_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_STR_AND_ATT), &
+               b_R_xz_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_STR_AND_ATT), &
+               b_R_yz_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_STR_AND_ATT), &
                stat=ier)
       if(ier /= 0) stop 'error: not enough memory to allocate array R_memory_crust_mantle_lddrk'
       ! inner core
-      allocate(b_R_xx_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_AND_ATT), &
-               b_R_yy_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_AND_ATT), &
-               b_R_xy_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_AND_ATT), &
-               b_R_xz_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_AND_ATT), &
-               b_R_yz_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_AND_ATT), &
+      allocate(b_R_xx_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_STR_AND_ATT), &
+               b_R_yy_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_STR_AND_ATT), &
+               b_R_xy_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_STR_AND_ATT), &
+               b_R_xz_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_STR_AND_ATT), &
+               b_R_yz_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_STR_AND_ATT), &
                stat=ier)
       if(ier /= 0) stop 'error: not enough memory to allocate array R_memory_inner_core_lddrk'
       if(ATTENUATION_VAL) then
@@ -1417,18 +1417,18 @@
     if(ier /= 0) stop 'error: not enough memory to allocate array A_array_rotation_lddrk'
     allocate(B_array_rotation_lddrk(NGLLX,NGLLY,NGLLZ,1),stat=ier)
     if(ier /= 0) stop 'error: not enough memory to allocate array B_array_rotation_lddrk'
-    allocate(R_xx_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-             R_yy_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-             R_xy_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-             R_xz_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-             R_yz_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
+    allocate(R_xx_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+             R_yy_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+             R_xy_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+             R_xz_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+             R_yz_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
              stat=ier)
     if(ier /= 0) stop 'error: not enough memory to allocate array R_memory_crust_mantle_lddrk'
-    allocate(R_xx_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-             R_yy_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-             R_xy_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-             R_xz_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-             R_yz_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
+    allocate(R_xx_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+             R_yy_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+             R_xy_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+             R_xz_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+             R_yz_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
              stat=ier)
     if(ier /= 0) stop 'error: not enough memory to allocate array R_memory_inner_core_lddrk'
     if( SIMULATION_TYPE == 3 ) then
@@ -1436,18 +1436,18 @@
       if(ier /= 0) stop 'error: not enough memory to allocate array b_A_array_rotation_lddrk'
       allocate(b_B_array_rotation_lddrk(NGLLX,NGLLY,NGLLZ,1),stat=ier)
       if(ier /= 0) stop 'error: not enough memory to allocate array b_B_array_rotation_lddrk'
-      allocate(b_R_xx_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-               b_R_yy_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-               b_R_xy_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-               b_R_xz_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-               b_R_yz_crust_mantle_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
+      allocate(b_R_xx_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+               b_R_yy_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+               b_R_xy_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+               b_R_xz_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+               b_R_yz_crust_mantle_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
                stat=ier)
       if(ier /= 0) stop 'error: not enough memory to allocate array b_R_memory_crust_mantle_lddrk'
-      allocate(b_R_xx_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-               b_R_yy_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-               b_R_xy_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-               b_R_xz_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
-               b_R_yz_inner_core_lddrk(N_SLS,NGLLX,NGLLY,NGLLZ,1), &
+      allocate(b_R_xx_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+               b_R_yy_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+               b_R_xy_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+               b_R_xz_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
+               b_R_yz_inner_core_lddrk(NGLLX,NGLLY,NGLLZ,N_SLS,1), &
                stat=ier)
       if(ier /= 0) stop 'error: not enough memory to allocate array b_R_memory_inner_core_lddrk'
     endif

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/read_attenuation_adios.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/read_attenuation_adios.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/read_attenuation_adios.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -43,7 +43,7 @@
 
   integer :: vnspec
   real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec) :: scale_factor
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,N_SLS,vnspec) :: factor_common
   double precision, dimension(N_SLS)                 :: tau_s
 
   integer :: iregion_code

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -408,10 +408,14 @@
 !             thus it has longer latency to access variables than stack memory variables.
 !
 !             however, declaring the static arrays needed in compute_forces_crust_mantle_Dev()
-!             like e.g. sum_terms, tempx1,...B1_m1_m2_5points,... in this main routine and
+!             like e.g. sum_terms, tempx1,... in this main routine and
 !             passing them along as arguments to the routine makes the code slower.
-!             it seems that this stack/heap criterion is more complicated.
+!             it seems that this stack/heap criterion is more complicated, 
+!             and inlining functions is a performance criteria as well.
 !
+!             for vectorization, we asssume that arrays have contiguous memory allocated. this holds true for most compilers and
+!             static memory allocation. however, note that dynamically allocated memory could in principle be non-contiguous.
+!
 !             another reason why the use of modules is restricted is to make the code thread safe.
 !             having different threads access the same data structure and modifying it at the same time
 !             would lead to problems. passing arguments is a way to avoid such complications.

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D_par.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D_par.F90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D_par.F90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -365,9 +365,10 @@
   ! memory variables and standard linear solids for attenuation
   real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,ATT4_VAL) :: &
     one_minus_sum_beta_crust_mantle, factor_scale_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1_VAL,ATT2_VAL,ATT3_VAL,ATT4_VAL) :: &
+  real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,N_SLS,ATT4_VAL) :: &
     factor_common_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION) :: &
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_ATTENUATION) :: &
     R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle,R_xz_crust_mantle,R_yz_crust_mantle
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT),target :: &
@@ -378,7 +379,7 @@
     eps_trace_over_3_crust_mantle
 
   ! ADJOINT
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_AND_ATT) :: &
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_STR_AND_ATT) :: &
     b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle,b_R_xz_crust_mantle,b_R_yz_crust_mantle
 
   real(kind=CUSTOM_REAL), dimension(:,:,:,:),pointer :: &
@@ -537,9 +538,10 @@
   ! memory variables and standard linear solids for attenuation
   real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,ATT5_VAL) :: &
     one_minus_sum_beta_inner_core, factor_scale_inner_core
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1_VAL,ATT2_VAL,ATT3_VAL,ATT5_VAL) :: &
+  real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,N_SLS,ATT5_VAL) :: &
     factor_common_inner_core
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: &
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_ATTENUATION) :: &
     R_xx_inner_core,R_yy_inner_core,R_xy_inner_core,R_xz_inner_core,R_yz_inner_core
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT),target :: &
@@ -550,7 +552,7 @@
     eps_trace_over_3_inner_core
 
   ! ADJOINT
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_AND_ATT) :: &
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_INNER_CORE_STR_AND_ATT) :: &
     b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core,b_R_xz_inner_core,b_R_yz_inner_core
 
   real(kind=CUSTOM_REAL), dimension(:,:,:,:),pointer :: &

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/update_displacement_LDDRK.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/update_displacement_LDDRK.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/update_displacement_LDDRK.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -89,20 +89,18 @@
   use specfem_par_outercore
 
   implicit none
-
   ! local parameters
-  integer :: i
+  real(kind=CUSTOM_REAL) :: alpha,beta
 
-  do i=1,NGLOB_OUTER_CORE
-    veloc_outer_core_lddrk(i) =  ALPHA_LDDRK(istage) * veloc_outer_core_lddrk(i) + deltat * accel_outer_core(i)
+  ! current runge-kutta coefficients
+  alpha = ALPHA_LDDRK(istage)
+  beta = BETA_LDDRK(istage)
 
-    displ_outer_core_lddrk(i) =  ALPHA_LDDRK(istage) * displ_outer_core_lddrk(i) + deltat * veloc_outer_core(i)
+  ! forward wavefields
+  call update_acoustic_lddrk(NGLOB_OUTER_CORE,displ_outer_core,veloc_outer_core,accel_outer_core, &
+                             displ_outer_core_lddrk,veloc_outer_core_lddrk, &
+                             deltat,alpha,beta)
 
-    veloc_outer_core(i) = veloc_outer_core(i) + BETA_LDDRK(istage) * veloc_outer_core_lddrk(i)
-
-    displ_outer_core(i) = displ_outer_core(i) + BETA_LDDRK(istage) * displ_outer_core_lddrk(i)
-  enddo
-
   end subroutine update_veloc_acoustic_lddrk
 
 !
@@ -117,22 +115,63 @@
   use specfem_par_outercore
 
   implicit none
+  ! local parameters
+  real(kind=CUSTOM_REAL) :: alpha,beta
 
+  ! current runge-kutta coefficients
+  alpha = ALPHA_LDDRK(istage)
+  beta = BETA_LDDRK(istage)
+
+  ! backward/reconstructed wavefields
+  call update_acoustic_lddrk(NGLOB_OUTER_CORE_ADJOINT,b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core, &
+                             b_displ_outer_core_lddrk,b_veloc_outer_core_lddrk, &
+                             b_deltat,alpha,beta)
+
+  end subroutine update_veloc_acoustic_lddrk_backward
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine update_acoustic_lddrk(NGLOB,displ,veloc,accel,displ_lddrk,veloc_lddrk,deltat,alpha,beta)
+
+! updates acceleration and velocity in outer core
+
+  use constants,only: CUSTOM_REAL
+
+  implicit none
+
+  integer,intent(in) :: NGLOB
+  real(kind=CUSTOM_REAL),dimension(NGLOB),intent(inout) :: displ,veloc,accel
+  real(kind=CUSTOM_REAL),dimension(NGLOB),intent(inout) :: displ_lddrk,veloc_lddrk
+
+  real(kind=CUSTOM_REAL),intent(in) :: deltat
+
+  ! runge-kutta coefficients
+  real(kind=CUSTOM_REAL),intent(in) :: alpha,beta
+
   ! local parameters
   integer :: i
 
-  do i=1,NGLOB_OUTER_CORE
-    b_veloc_outer_core_lddrk(i) =  ALPHA_LDDRK(istage) * b_veloc_outer_core_lddrk(i) + b_deltat * b_accel_outer_core(i)
+  ! runge-kutta scheme update
 
-    b_displ_outer_core_lddrk(i) =  ALPHA_LDDRK(istage) * b_displ_outer_core_lddrk(i) + b_deltat * b_veloc_outer_core(i)
+  ! note: splitting the do-loops seems to be slightly more effective
 
-    b_veloc_outer_core(i) = b_veloc_outer_core(i) + BETA_LDDRK(istage) * b_veloc_outer_core_lddrk(i)
-
-    b_displ_outer_core(i) = b_displ_outer_core(i) + BETA_LDDRK(istage) * b_displ_outer_core_lddrk(i)
+  ! low-memory runge-kutta: intermediate storage wavefields
+  do i=1,NGLOB
+    veloc_lddrk(i) =  alpha * veloc_lddrk(i) + deltat * accel(i)
+    displ_lddrk(i) =  alpha * displ_lddrk(i) + deltat * veloc(i)
   enddo
+  ! updates wavefields
+  do i=1,NGLOB
+    veloc(i) = veloc(i) + beta * veloc_lddrk(i)
+    displ(i) = displ(i) + beta * displ_lddrk(i)
+  enddo
 
-  end subroutine update_veloc_acoustic_lddrk_backward
+  end subroutine update_acoustic_lddrk
 
+
 !
 !-------------------------------------------------------------------------------------------------
 !
@@ -150,36 +189,25 @@
   use specfem_par_innercore
 
   implicit none
-
   ! local parameters
-  integer :: i
+  real(kind=CUSTOM_REAL) :: alpha,beta
 
+  ! current runge-kutta coefficients
+  alpha = ALPHA_LDDRK(istage)
+  beta = BETA_LDDRK(istage)
+
+  ! forward wavefields
   ! crust/mantle
-  do i=1,NGLOB_CRUST_MANTLE
-    veloc_crust_mantle_lddrk(:,i) = ALPHA_LDDRK(istage) * veloc_crust_mantle_lddrk(:,i) &
-                                    + deltat * accel_crust_mantle(:,i)
+  call update_elastic_lddrk(NGLOB_CRUST_MANTLE,displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
+                            displ_crust_mantle_lddrk,veloc_crust_mantle_lddrk, &
+                            deltat,alpha,beta)
 
-    displ_crust_mantle_lddrk(:,i) = ALPHA_LDDRK(istage) * displ_crust_mantle_lddrk(:,i) &
-                                    + deltat * veloc_crust_mantle(:,i)
-
-    veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) + BETA_LDDRK(istage) * veloc_crust_mantle_lddrk(:,i)
-
-    displ_crust_mantle(:,i) = displ_crust_mantle(:,i) + BETA_LDDRK(istage) * displ_crust_mantle_lddrk(:,i)
-  enddo
-
   ! inner core
-  do i=1,NGLOB_INNER_CORE
-    veloc_inner_core_lddrk(:,i) = ALPHA_LDDRK(istage) * veloc_inner_core_lddrk(:,i) &
-                                    + deltat * accel_inner_core(:,i)
+  call update_elastic_lddrk(NGLOB_INNER_CORE,displ_inner_core,veloc_inner_core,accel_inner_core, &
+                            displ_inner_core_lddrk,veloc_inner_core_lddrk, &
+                            deltat,alpha,beta)
 
-    displ_inner_core_lddrk(:,i) = ALPHA_LDDRK(istage) * displ_inner_core_lddrk(:,i) &
-                                    + deltat * veloc_inner_core(:,i)
 
-    veloc_inner_core(:,i) = veloc_inner_core(:,i) + BETA_LDDRK(istage) * veloc_inner_core_lddrk(:,i)
-
-    displ_inner_core(:,i) = displ_inner_core(:,i) + BETA_LDDRK(istage) * displ_inner_core_lddrk(:,i)
-  enddo
-
   end subroutine update_veloc_elastic_lddrk
 
 !
@@ -195,35 +223,84 @@
   use specfem_par_innercore
 
   implicit none
+  ! local parameters
+  real(kind=CUSTOM_REAL) :: alpha,beta
 
+  ! current runge-kutta coefficients
+  alpha = ALPHA_LDDRK(istage)
+  beta = BETA_LDDRK(istage)
+
+  ! backward/reconstructed wavefields
+  ! crust/mantle
+  call update_elastic_lddrk(NGLOB_CRUST_MANTLE_ADJOINT,b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
+                            b_displ_crust_mantle_lddrk,b_veloc_crust_mantle_lddrk, &
+                            b_deltat,alpha,beta)
+
+  ! inner core
+  call update_elastic_lddrk(NGLOB_INNER_CORE_ADJOINT,b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
+                            b_displ_inner_core_lddrk,b_veloc_inner_core_lddrk, &
+                            b_deltat,alpha,beta)
+
+  end subroutine update_veloc_elastic_lddrk_backward
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine update_elastic_lddrk(NGLOB,displ,veloc,accel, &
+                                  displ_lddrk,veloc_lddrk, &
+                                  deltat,alpha,beta)
+
+
+  use constants_solver,only: CUSTOM_REAL,NDIM,FORCE_VECTORIZATION_VAL
+
+  implicit none
+
+  integer,intent(in) :: NGLOB
+
+  ! wavefields
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB),intent(inout) :: displ,veloc,accel
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB),intent(inout) :: displ_lddrk,veloc_lddrk
+
+  real(kind=CUSTOM_REAL),intent(in) :: deltat
+  ! runge-kutta coefficients
+  real(kind=CUSTOM_REAL),intent(in) :: alpha,beta
+
   ! local parameters
   integer :: i
 
-  ! crust/mantle
-  do i=1,NGLOB_CRUST_MANTLE
-    b_veloc_crust_mantle_lddrk(:,i) = ALPHA_LDDRK(istage) * b_veloc_crust_mantle_lddrk(:,i) &
-                                    + b_deltat * b_accel_crust_mantle(:,i)
+  ! low-memory runge-kutta scheme
 
-    b_displ_crust_mantle_lddrk(:,i) = ALPHA_LDDRK(istage) * b_displ_crust_mantle_lddrk(:,i) &
-                                    + b_deltat * b_veloc_crust_mantle(:,i)
+  if(FORCE_VECTORIZATION_VAL) then
 
-    b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) + BETA_LDDRK(istage) * b_veloc_crust_mantle_lddrk(:,i)
+    ! note: splitting the do-loops seems to be slightly more effective
 
-    b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) + BETA_LDDRK(istage) * b_displ_crust_mantle_lddrk(:,i)
-  enddo
+    ! low-memory runge-kutta: intermediate storage wavefields
+    do i=1,NGLOB * NDIM
+      veloc_lddrk(i,1) = alpha * veloc_lddrk(i,1) + deltat * accel(i,1)
+      displ_lddrk(i,1) = alpha * displ_lddrk(i,1) + deltat * veloc(i,1)
+    enddo
+    ! updates wavefields
+    do i=1,NGLOB * NDIM
+      veloc(i,1) = veloc(i,1) + beta * veloc_lddrk(i,1)
+      displ(i,1) = displ(i,1) + beta * displ_lddrk(i,1)
+    enddo
 
-  ! inner core
-  do i=1,NGLOB_INNER_CORE
-    b_veloc_inner_core_lddrk(:,i) = ALPHA_LDDRK(istage) * b_veloc_inner_core_lddrk(:,i) &
-                                    + b_deltat * b_accel_inner_core(:,i)
+  else
 
-    b_displ_inner_core_lddrk(:,i) = ALPHA_LDDRK(istage) * b_displ_inner_core_lddrk(:,i) &
-                                    + b_deltat * b_veloc_inner_core(:,i)
+    ! non-vectorized loops
 
-    b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) + BETA_LDDRK(istage) * b_veloc_inner_core_lddrk(:,i)
+    do i=1,NGLOB
+      ! low-memory runge-kutta: intermediate storage wavefields
+      veloc_lddrk(:,i) = alpha * veloc_lddrk(:,i) + deltat * accel(:,i)
+      displ_lddrk(:,i) = alpha * displ_lddrk(:,i) + deltat * veloc(:,i)
+      ! updates wavefields
+      veloc(:,i) = veloc(:,i) + beta * veloc_lddrk(:,i)
+      displ(:,i) = displ(:,i) + beta * displ_lddrk(:,i)
+    enddo
 
-    b_displ_inner_core(:,i) = b_displ_inner_core(:,i) + BETA_LDDRK(istage) * b_displ_inner_core_lddrk(:,i)
-  enddo
+  endif
 
-  end subroutine update_veloc_elastic_lddrk_backward
+  end subroutine update_elastic_lddrk
 
+

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/update_displacement_Newmark.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/update_displacement_Newmark.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/update_displacement_Newmark.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -371,15 +371,15 @@
 
   implicit none
 
-  integer :: NGLOB_CM,NGLOB_IC
+  integer,intent(in) :: NGLOB_CM,NGLOB_IC
 
   ! acceleration & velocity
   ! crust/mantle region
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CM) :: veloc_crust_mantle,accel_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CM),intent(inout) :: veloc_crust_mantle,accel_crust_mantle
   ! inner core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_IC) :: veloc_inner_core,accel_inner_core
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_IC),intent(inout) :: veloc_inner_core,accel_inner_core
 
-  real(kind=CUSTOM_REAL) :: deltatover2
+  real(kind=CUSTOM_REAL),intent(in) :: deltatover2
 
   ! local parameters
   integer :: i
@@ -393,30 +393,26 @@
   !   - inner core region
   !         needs both, acceleration update & velocity corrector terms
 
-  ! crust/mantle
   if(FORCE_VECTORIZATION_VAL) then
 
+    ! crust/mantle
     do i=1,NGLOB_CM * NDIM
       veloc_crust_mantle(i,1) = veloc_crust_mantle(i,1) + deltatover2*accel_crust_mantle(i,1)
     enddo
 
-  else
-
-    do i=1,NGLOB_CM
-      veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) + deltatover2*accel_crust_mantle(:,i)
-    enddo
-
-  endif
-
-  ! inner core
-  if(FORCE_VECTORIZATION_VAL) then
-
+    ! inner core
     do i=1,NGLOB_IC * NDIM
       veloc_inner_core(i,1) = veloc_inner_core(i,1) + deltatover2*accel_inner_core(i,1)
     enddo
 
   else
 
+    ! crust/mantle
+    do i=1,NGLOB_CM
+      veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) + deltatover2*accel_crust_mantle(:,i)
+    enddo
+
+    ! inner core
     do i=1,NGLOB_IC
       veloc_inner_core(:,i) = veloc_inner_core(:,i) + deltatover2*accel_inner_core(:,i)
     enddo

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/utils/attenuation/attenuation_output.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/utils/attenuation/attenuation_output.f90	2013-11-01 21:27:52 UTC (rev 22976)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/utils/attenuation/attenuation_output.f90	2013-11-04 08:16:42 UTC (rev 22977)
@@ -21,7 +21,7 @@
     integer myrank, vnspec, process, iregion
     character(len=150) prname, LOCAL_PATH
     double precision, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_AC)       :: one_minus_sum_beta, scale_factor
-    double precision, dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_AC) :: factor_common
+    double precision, dimension(NGLLX,NGLLY,NGLLZ,N_SLS,NSPEC_CRUST_MANTLE_AC) :: factor_common
     double precision, dimension(N_SLS)                                         :: tau_s
     double precision T_c_source
 
@@ -56,9 +56,9 @@
        do k = 1, NGLLZ
           do j = 1, NGLLY
              do i = 1, NGLLX
-                write(*,*)' tau_mu(1) = ', factor_common(1,i,j,k,ispec)
-                write(*,*)' tau_mu(2) = ', factor_common(2,i,j,k,ispec)
-                write(*,*)' tau_mu(3) = ', factor_common(3,i,j,k,ispec)
+                write(*,*)' tau_mu(1) = ', factor_common(i,j,k,1,ispec)
+                write(*,*)' tau_mu(2) = ', factor_common(i,j,k,2,ispec)
+                write(*,*)' tau_mu(3) = ', factor_common(i,j,k,3,ispec)
                 write(*,*)' Qmu = ', scale_factor(i,j,k,ispec)
                 write(*,*)
              enddo



More information about the CIG-COMMITS mailing list