[cig-commits] r13803 - seismo/3D/SPECFEM3D_SESAME/trunk

Sat Dec 27 14:34:47 PST 2008

Author: dkomati1
Date: 2008-12-27 14:34:47 -0800 (Sat, 27 Dec 2008)
New Revision: 13803

Added:
   seismo/3D/SPECFEM3D_SESAME/trunk/compute_forces_no_Deville.f90
Modified:
   seismo/3D/SPECFEM3D_SESAME/trunk/Makefile.in
   seismo/3D/SPECFEM3D_SESAME/trunk/compute_forces.f90
   seismo/3D/SPECFEM3D_SESAME/trunk/constants.h.in
   seismo/3D/SPECFEM3D_SESAME/trunk/flags.guess
   seismo/3D/SPECFEM3D_SESAME/trunk/specfem3D.f90
Log:
switched to optimized matrix multiplication routines from Deville et al. (2002) to improve performance


Modified: seismo/3D/SPECFEM3D_SESAME/trunk/Makefile.in
===================================================================

--- seismo/3D/SPECFEM3D_SESAME/trunk/Makefile.in	2008-12-27 21:20:45 UTC (rev 13802)
+++ seismo/3D/SPECFEM3D_SESAME/trunk/Makefile.in	2008-12-27 22:34:47 UTC (rev 13803)
@@ -487,7 +487,7 @@
 	${FCCOMPILE_CHECK} -c -o $O/compute_boundary_kernel.o compute_boundary_kernel.f90
 
 $O/compute_forces.o: constants.h compute_forces.f90
-	${FCCOMPILE_CHECK} -c -o $O/compute_forces.o compute_forces.f90
+	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces.o compute_forces.f90
 
 $O/combine_vol_data.o: constants.h combine_vol_data.f90
 	${FCCOMPILE_CHECK} -c -o $O/combine_vol_data.o combine_vol_data.f90

Modified: seismo/3D/SPECFEM3D_SESAME/trunk/compute_forces.f90
===================================================================
--- seismo/3D/SPECFEM3D_SESAME/trunk/compute_forces.f90	2008-12-27 21:20:45 UTC (rev 13802)
+++ seismo/3D/SPECFEM3D_SESAME/trunk/compute_forces.f90	2008-12-27 22:34:47 UTC (rev 13803)
@@ -24,10 +24,9 @@
 !=====================================================================
 
 subroutine compute_forces(NSPEC_AB,NGLOB_AB,displ,accel,xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-     hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+     hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
      kappastore,mustore,jacobian,ibool,ispec_is_inner,phase_is_inner, &
-     NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,xi_source,eta_source,gamma_source,nu_source,hdur,dt &
-     )
+     NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,xi_source,eta_source,gamma_source,nu_source,hdur,dt)
 
   implicit none
 
@@ -46,13 +45,14 @@
         kappastore,mustore,jacobian
 
 ! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
-  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
   real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
 
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+
 ! communication overlap
   logical, dimension(NSPEC_AB) :: ispec_is_inner
   logical :: phase_is_inner
@@ -69,7 +69,7 @@
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
 
   integer ispec,iglob
-  integer i,j,k,l
+  integer i,j,k
 
   real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
   real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
@@ -79,13 +79,8 @@
 
   real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
 
-  real(kind=CUSTOM_REAL) hp1,hp2,hp3
   real(kind=CUSTOM_REAL) fac1,fac2,fac3
 
-  real(kind=CUSTOM_REAL) tempx1l,tempx2l,tempx3l
-  real(kind=CUSTOM_REAL) tempy1l,tempy2l,tempy3l
-  real(kind=CUSTOM_REAL) tempz1l,tempz2l,tempz3l
-
   real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
   real(kind=CUSTOM_REAL) kappal
 
@@ -100,43 +95,30 @@
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
+            iglob = ibool(i,j,k,ispec)
+            dummyx_loc(i,j,k) = displ(1,iglob)
+            dummyy_loc(i,j,k) = displ(2,iglob)
+            dummyz_loc(i,j,k) = displ(3,iglob)
+        enddo
+      enddo
+    enddo
 
-          tempx1l = 0.
-          tempx2l = 0.
-          tempx3l = 0.
+!! DK DK subroutines adapted from Deville, Fischer and Mund, High-order methods
+!! DK DK for incompressible fluid flow, Cambridge University Press (2002),
+!! DK DK pages 386 and 389 and Figure 8.3.1
+  call mxm_m1_m2_5points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
 
-          tempy1l = 0.
-          tempy2l = 0.
-          tempy3l = 0.
+  do k = 1,NGLLX
+    call mxm_m1_m1_5points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+           hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+  enddo
 
-          tempz1l = 0.
-          tempz2l = 0.
-          tempz3l = 0.
+  call mxm_m2_m1_5points(dummyx_loc,dummyy_loc,dummyz_loc,hprime_xxT,tempx3,tempy3,tempz3)
 
-          do l=1,NGLLX
-            hp1 = hprime_xx(i,l)
-            iglob = ibool(l,j,k,ispec)
-            tempx1l = tempx1l + displ(1,iglob)*hp1
-            tempy1l = tempy1l + displ(2,iglob)*hp1
-            tempz1l = tempz1l + displ(3,iglob)*hp1
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
 
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            hp2 = hprime_yy(j,l)
-            iglob = ibool(i,l,k,ispec)
-            tempx2l = tempx2l + displ(1,iglob)*hp2
-            tempy2l = tempy2l + displ(2,iglob)*hp2
-            tempz2l = tempz2l + displ(3,iglob)*hp2
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            hp3 = hprime_zz(k,l)
-            iglob = ibool(i,j,l,ispec)
-            tempx3l = tempx3l + displ(1,iglob)*hp3
-            tempy3l = tempy3l + displ(2,iglob)*hp3
-            tempz3l = tempz3l + displ(3,iglob)*hp3
-          enddo
-
 !         get derivatives of ux, uy and uz with respect to x, y and z
           xixl = xix(i,j,k,ispec)
           xiyl = xiy(i,j,k,ispec)
@@ -149,17 +131,17 @@
           gammazl = gammaz(i,j,k,ispec)
           jacobianl = jacobian(i,j,k,ispec)
 
-          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+          duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+          duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+          duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
 
-          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
-          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
-          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
+          duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+          duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+          duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
 
-          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
-          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
-          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
+          duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+          duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+          duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
 
 ! precompute some sums to save CPU time
           duxdxl_plus_duydyl = duxdxl + duydyl
@@ -201,60 +183,37 @@
       enddo
     enddo
 
+!! DK DK subroutines adapted from Deville, Fischer and Mund, High-order methods
+!! DK DK for incompressible fluid flow, Cambridge University Press (2002),
+!! DK DK pages 386 and 389 and Figure 8.3.1
+  call mxm_m1_m2_5points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+
+  do k = 1,NGLLX
+    call mxm_m1_m1_5points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+          hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+  enddo
+
+  call mxm_m2_m1_5points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
 
-          tempx1l = 0.
-          tempy1l = 0.
-          tempz1l = 0.
-
-          tempx2l = 0.
-          tempy2l = 0.
-          tempz2l = 0.
-
-          tempx3l = 0.
-          tempy3l = 0.
-          tempz3l = 0.
-
-          do l=1,NGLLX
-            fac1 = hprimewgll_xx(l,i)
-            tempx1l = tempx1l + tempx1(l,j,k)*fac1
-            tempy1l = tempy1l + tempy1(l,j,k)*fac1
-            tempz1l = tempz1l + tempz1(l,j,k)*fac1
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            fac2 = hprimewgll_yy(l,j)
-            tempx2l = tempx2l + tempx2(i,l,k)*fac2
-            tempy2l = tempy2l + tempy2(i,l,k)*fac2
-            tempz2l = tempz2l + tempz2(i,l,k)*fac2
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            fac3 = hprimewgll_zz(l,k)
-            tempx3l = tempx3l + tempx3(i,j,l)*fac3
-            tempy3l = tempy3l + tempy3(i,j,l)*fac3
-            tempz3l = tempz3l + tempz3(i,j,l)*fac3
-          enddo
-
           fac1 = wgllwgll_yz(j,k)
           fac2 = wgllwgll_xz(i,k)
           fac3 = wgllwgll_xy(i,j)
 
-! sum contributions from each element to the global mesh
-
+! sum contributions from each element to the global mesh using indirect addressing
           iglob = ibool(i,j,k,ispec)
+          accel(1,iglob) = accel(1,iglob) - (fac1*newtempx1(i,j,k) + fac2*newtempx2(i,j,k) + fac3*newtempx3(i,j,k))
+          accel(2,iglob) = accel(2,iglob) - (fac1*newtempy1(i,j,k) + fac2*newtempy2(i,j,k) + fac3*newtempy3(i,j,k))
+          accel(3,iglob) = accel(3,iglob) - (fac1*newtempz1(i,j,k) + fac2*newtempz2(i,j,k) + fac3*newtempz3(i,j,k))
 
-          accel(1,iglob) = accel(1,iglob) - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
-          accel(2,iglob) = accel(2,iglob) - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
-          accel(3,iglob) = accel(3,iglob) - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
-
         enddo
       enddo
     enddo
 
-  endif ! if (ispec_is_inner(ispec) == phase_is_inner)
+  endif ! if (ispec_is_inner(ispec) .eqv. phase_is_inner)
 
   enddo  ! spectral element loop
 
@@ -274,20 +233,19 @@
            ispec_selected_source(isource))
       f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
       t0 = 1.2d0/f0
-if (it == 1 .and. myrank == 0) then
 
-print *,'using a source of dominant frequency ',f0
-print *,'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
-print *,'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
-endif
+  if (it == 1 .and. myrank == 0) then
+    print *,'using a source of dominant frequency ',f0
+    print *,'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
+    print *,'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
+  endif
 
-
       ! we use nu_source(:,3) here because we want a source normal to the surface.
       ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
       !accel(:,iglob) = accel(:,iglob) + &
       !     sngl(nu_source(:,3,isource) * 10000000.d0 * (1.d0-2.d0*PI*PI*f0*f0*(dble(it-1)*DT-t0)*(dble(it-1)*DT-t0)) * &
       !     exp(-PI*PI*f0*f0*(dble(it-1)*DT-t0)*(dble(it-1)*DT-t0)))
-accel(:,iglob) = accel(:,iglob) + &
+    accel(:,iglob) = accel(:,iglob) + &
            sngl(nu_source(:,3,isource) * 1.d10 * (1.d0-2.d0*PI*PI*f0*f0*(dble(it-1)*DT-t0)*(dble(it-1)*DT-t0)) * &
            exp(-PI*PI*f0*f0*(dble(it-1)*DT-t0)*(dble(it-1)*DT-t0)))
 
@@ -300,3 +258,126 @@
 
 end subroutine compute_forces
 
+
+!! DK DK subroutines adapted from Deville, Fischer and Mund, High-order methods
+!! DK DK for incompressible fluid flow, Cambridge University Press (2002),
+!! DK DK pages 386 and 389 and Figure 8.3.1
+
+  subroutine mxm_m1_m2_5points(A,B1,B2,B3,C1,C2,C3)
+
+  implicit none
+
+  include "constants.h"
+
+  real(kind=CUSTOM_REAL), dimension(m1,NGLLX) :: A
+  real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1,B2,B3
+  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1,C2,C3
+
+  integer :: i,j
+
+  do j=1,m2
+    do i=1,m1
+
+      C1(i,j) = A(i,1)*B1(1,j) + &
+                A(i,2)*B1(2,j) + &
+                A(i,3)*B1(3,j) + &
+                A(i,4)*B1(4,j) + &
+                A(i,5)*B1(5,j)
+
+      C2(i,j) = A(i,1)*B2(1,j) + &
+                A(i,2)*B2(2,j) + &
+                A(i,3)*B2(3,j) + &
+                A(i,4)*B2(4,j) + &
+                A(i,5)*B2(5,j)
+
+      C3(i,j) = A(i,1)*B3(1,j) + &
+                A(i,2)*B3(2,j) + &
+                A(i,3)*B3(3,j) + &
+                A(i,4)*B3(4,j) + &
+                A(i,5)*B3(5,j)
+
+    enddo
+  enddo
+
+  end subroutine mxm_m1_m2_5points
+
+!---------
+
+  subroutine mxm_m1_m1_5points(A1,A2,A3,B,C1,C2,C3)
+
+  implicit none
+
+  include "constants.h"
+
+  real(kind=CUSTOM_REAL), dimension(m1,NGLLX) :: A1,A2,A3
+  real(kind=CUSTOM_REAL), dimension(NGLLX,m1) :: B
+  real(kind=CUSTOM_REAL), dimension(m1,m1) :: C1,C2,C3
+
+  integer :: i,j
+
+  do j=1,m1
+    do i=1,m1
+
+      C1(i,j) = A1(i,1)*B(1,j) + &
+                A1(i,2)*B(2,j) + &
+                A1(i,3)*B(3,j) + &
+                A1(i,4)*B(4,j) + &
+                A1(i,5)*B(5,j)
+
+      C2(i,j) = A2(i,1)*B(1,j) + &
+                A2(i,2)*B(2,j) + &
+                A2(i,3)*B(3,j) + &
+                A2(i,4)*B(4,j) + &
+                A2(i,5)*B(5,j)
+
+      C3(i,j) = A3(i,1)*B(1,j) + &
+                A3(i,2)*B(2,j) + &
+                A3(i,3)*B(3,j) + &
+                A3(i,4)*B(4,j) + &
+                A3(i,5)*B(5,j)
+
+    enddo
+  enddo
+
+  end subroutine mxm_m1_m1_5points
+
+!---------
+
+  subroutine mxm_m2_m1_5points(A1,A2,A3,B,C1,C2,C3)
+
+  implicit none
+
+  include "constants.h"
+
+  real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: A1,A2,A3
+  real(kind=CUSTOM_REAL), dimension(NGLLX,m1) :: B
+  real(kind=CUSTOM_REAL), dimension(m2,m1) :: C1,C2,C3
+
+  integer :: i,j
+
+  do j=1,m1
+    do i=1,m2
+
+      C1(i,j) = A1(i,1)*B(1,j) + &
+                A1(i,2)*B(2,j) + &
+                A1(i,3)*B(3,j) + &
+                A1(i,4)*B(4,j) + &
+                A1(i,5)*B(5,j)
+
+      C2(i,j) = A2(i,1)*B(1,j) + &
+                A2(i,2)*B(2,j) + &
+                A2(i,3)*B(3,j) + &
+                A2(i,4)*B(4,j) + &
+                A2(i,5)*B(5,j)
+
+      C3(i,j) = A3(i,1)*B(1,j) + &
+                A3(i,2)*B(2,j) + &
+                A3(i,3)*B(3,j) + &
+                A3(i,4)*B(4,j) + &
+                A3(i,5)*B(5,j)
+
+    enddo
+  enddo
+
+  end subroutine mxm_m2_m1_5points
+

Added: seismo/3D/SPECFEM3D_SESAME/trunk/compute_forces_no_Deville.f90
===================================================================
--- seismo/3D/SPECFEM3D_SESAME/trunk/compute_forces_no_Deville.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_SESAME/trunk/compute_forces_no_Deville.f90	2008-12-27 22:34:47 UTC (rev 13803)
@@ -0,0 +1,300 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  1 . 4
+!               ---------------------------------------
+!
+!                 Dimitri Komatitsch and Jeroen Tromp
+!    Seismological Laboratory - California Institute of Technology
+!         (c) California Institute of Technology September 2006
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+subroutine compute_forces(NSPEC_AB,NGLOB_AB,displ,accel,xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+     hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+     kappastore,mustore,jacobian,ibool,ispec_is_inner,phase_is_inner, &
+     NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,xi_source,eta_source,gamma_source,nu_source,hdur,dt)
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
+  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! communication overlap
+  logical, dimension(NSPEC_AB) :: ispec_is_inner
+  logical :: phase_is_inner
+
+! source
+  integer :: NSOURCES,myrank,it
+  integer, dimension(NSOURCES) :: islice_selected_source,ispec_selected_source
+  double precision, dimension(NSOURCES) :: xi_source,eta_source,gamma_source
+  double precision, dimension(3,3,NSOURCES) :: nu_source
+  double precision, dimension(NSOURCES) :: hdur
+  double precision :: dt
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  integer ispec,iglob
+  integer i,j,k,l
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
+
+  real(kind=CUSTOM_REAL) hp1,hp2,hp3
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) tempx1l,tempx2l,tempx3l
+  real(kind=CUSTOM_REAL) tempy1l,tempy2l,tempy3l
+  real(kind=CUSTOM_REAL) tempz1l,tempz2l,tempz3l
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  integer :: isource
+  double precision :: t0,f0
+
+
+  do ispec = 1,NSPEC_AB
+
+  if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          tempx1l = 0.
+          tempx2l = 0.
+          tempx3l = 0.
+
+          tempy1l = 0.
+          tempy2l = 0.
+          tempy3l = 0.
+
+          tempz1l = 0.
+          tempz2l = 0.
+          tempz3l = 0.
+
+          do l=1,NGLLX
+            hp1 = hprime_xx(i,l)
+            iglob = ibool(l,j,k,ispec)
+            tempx1l = tempx1l + displ(1,iglob)*hp1
+            tempy1l = tempy1l + displ(2,iglob)*hp1
+            tempz1l = tempz1l + displ(3,iglob)*hp1
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            hp2 = hprime_yy(j,l)
+            iglob = ibool(i,l,k,ispec)
+            tempx2l = tempx2l + displ(1,iglob)*hp2
+            tempy2l = tempy2l + displ(2,iglob)*hp2
+            tempz2l = tempz2l + displ(3,iglob)*hp2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            hp3 = hprime_zz(k,l)
+            iglob = ibool(i,j,l,ispec)
+            tempx3l = tempx3l + displ(1,iglob)*hp3
+            tempy3l = tempy3l + displ(2,iglob)*hp3
+            tempz3l = tempz3l + displ(3,iglob)*hp3
+          enddo
+
+!         get derivatives of ux, uy and uz with respect to x, y and z
+          xixl = xix(i,j,k,ispec)
+          xiyl = xiy(i,j,k,ispec)
+          xizl = xiz(i,j,k,ispec)
+          etaxl = etax(i,j,k,ispec)
+          etayl = etay(i,j,k,ispec)
+          etazl = etaz(i,j,k,ispec)
+          gammaxl = gammax(i,j,k,ispec)
+          gammayl = gammay(i,j,k,ispec)
+          gammazl = gammaz(i,j,k,ispec)
+          jacobianl = jacobian(i,j,k,ispec)
+
+          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+
+          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
+          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
+          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
+
+          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
+          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
+          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
+
+! precompute some sums to save CPU time
+          duxdxl_plus_duydyl = duxdxl + duydyl
+          duxdxl_plus_duzdzl = duxdxl + duzdzl
+          duydyl_plus_duzdzl = duydyl + duzdzl
+          duxdyl_plus_duydxl = duxdyl + duydxl
+          duzdxl_plus_duxdzl = duzdxl + duxdzl
+          duzdyl_plus_duydzl = duzdyl + duydzl
+
+          kappal = kappastore(i,j,k,ispec)
+          mul = mustore(i,j,k,ispec)
+
+          lambdalplus2mul = kappal + FOUR_THIRDS * mul
+          lambdal = lambdalplus2mul - 2.*mul
+
+! compute stress sigma
+          sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+          sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+          sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+          sigma_xy = mul*duxdyl_plus_duydxl
+          sigma_xz = mul*duzdxl_plus_duxdzl
+          sigma_yz = mul*duzdyl_plus_duydzl
+
+! form dot product with test vector, symmetric form
+          tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl)
+          tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl)
+          tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
+
+          tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl)
+          tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl)
+          tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl)
+
+          tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl)
+          tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl)
+          tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl)
+
+        enddo
+      enddo
+    enddo
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          tempx1l = 0.
+          tempy1l = 0.
+          tempz1l = 0.
+
+          tempx2l = 0.
+          tempy2l = 0.
+          tempz2l = 0.
+
+          tempx3l = 0.
+          tempy3l = 0.
+          tempz3l = 0.
+
+          do l=1,NGLLX
+            fac1 = hprimewgll_xx(l,i)
+            tempx1l = tempx1l + tempx1(l,j,k)*fac1
+            tempy1l = tempy1l + tempy1(l,j,k)*fac1
+            tempz1l = tempz1l + tempz1(l,j,k)*fac1
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            fac2 = hprimewgll_yy(l,j)
+            tempx2l = tempx2l + tempx2(i,l,k)*fac2
+            tempy2l = tempy2l + tempy2(i,l,k)*fac2
+            tempz2l = tempz2l + tempz2(i,l,k)*fac2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            fac3 = hprimewgll_zz(l,k)
+            tempx3l = tempx3l + tempx3(i,j,l)*fac3
+            tempy3l = tempy3l + tempy3(i,j,l)*fac3
+            tempz3l = tempz3l + tempz3(i,j,l)*fac3
+          enddo
+
+          fac1 = wgllwgll_yz(j,k)
+          fac2 = wgllwgll_xz(i,k)
+          fac3 = wgllwgll_xy(i,j)
+
+! sum contributions from each element to the global mesh
+
+          iglob = ibool(i,j,k,ispec)
+
+          accel(1,iglob) = accel(1,iglob) - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
+          accel(2,iglob) = accel(2,iglob) - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
+          accel(3,iglob) = accel(3,iglob) - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
+
+        enddo
+      enddo
+    enddo
+
+  endif ! if (ispec_is_inner(ispec) .eqv. phase_is_inner)
+
+  enddo  ! spectral element loop
+
+! adding source
+  do isource = 1,NSOURCES
+
+  if (ispec_is_inner(ispec_selected_source(isource)) .eqv. phase_is_inner) then
+
+  if(USE_FORCE_POINT_SOURCE) then
+
+!   add the source (only if this proc carries the source)
+    if(myrank == islice_selected_source(isource)) then
+
+      iglob = ibool(nint(xi_source(isource)), &
+           nint(eta_source(isource)), &
+           nint(gamma_source(isource)), &
+           ispec_selected_source(isource))
+      f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
+      t0 = 1.2d0/f0
+
+  if (it == 1 .and. myrank == 0) then
+    print *,'using a source of dominant frequency ',f0
+    print *,'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
+    print *,'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
+  endif
+
+      ! we use nu_source(:,3) here because we want a source normal to the surface.
+      ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
+      !accel(:,iglob) = accel(:,iglob) + &
+      !     sngl(nu_source(:,3,isource) * 10000000.d0 * (1.d0-2.d0*PI*PI*f0*f0*(dble(it-1)*DT-t0)*(dble(it-1)*DT-t0)) * &
+      !     exp(-PI*PI*f0*f0*(dble(it-1)*DT-t0)*(dble(it-1)*DT-t0)))
+    accel(:,iglob) = accel(:,iglob) + &
+           sngl(nu_source(:,3,isource) * 1.d10 * (1.d0-2.d0*PI*PI*f0*f0*(dble(it-1)*DT-t0)*(dble(it-1)*DT-t0)) * &
+           exp(-PI*PI*f0*f0*(dble(it-1)*DT-t0)*(dble(it-1)*DT-t0)))
+
+    endif
+  endif
+
+  endif
+
+  enddo
+
+end subroutine compute_forces
+

Modified: seismo/3D/SPECFEM3D_SESAME/trunk/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D_SESAME/trunk/constants.h.in	2008-12-27 21:20:45 UTC (rev 13802)
+++ seismo/3D/SPECFEM3D_SESAME/trunk/constants.h.in	2008-12-27 22:34:47 UTC (rev 13803)
@@ -52,6 +52,9 @@
   integer, parameter :: NGLLY = NGLLX
   integer, parameter :: NGLLZ = NGLLX
 
+! for optimized routines by Deville et al. (2002)
+  integer, parameter :: m1 = NGLLX, m2 = NGLLX * NGLLY
+
 ! input, output and main MPI I/O files
   integer, parameter :: ISTANDARD_OUTPUT = 6
   integer, parameter :: IIN = 40,IOUT = 41

Modified: seismo/3D/SPECFEM3D_SESAME/trunk/flags.guess
===================================================================
--- seismo/3D/SPECFEM3D_SESAME/trunk/flags.guess	2008-12-27 21:20:45 UTC (rev 13802)
+++ seismo/3D/SPECFEM3D_SESAME/trunk/flags.guess	2008-12-27 22:34:47 UTC (rev 13803)
@@ -29,7 +29,9 @@
             FLAGS_CHECK="-O3 -vec-report0 -e03 -std03 -implicitnone -warn truncated_source -warn argument_checking -warn unused -warn declarations -warn alignments -warn ignore_loc -warn usage -check nobounds -align sequence -assume byterecl -fpe0 -ftz -traceback -ftrapuv" # -mcmodel=medium
         fi
         if test x"$FLAGS_NO_CHECK" = x; then
-            # standard options (leave option -ftz, which is *critical* for performance)
+# standard options (leave option -ftz, which is *critical* for performance)
+# add -Winline to get information about routines that are inlined
+# add -vec-report3 to get information about loops that are vectorized or not
             FLAGS_NO_CHECK="-O3 -xP -vec-report0 -e03 -std03 -implicitnone -warn truncated_source -warn argument_checking -warn unused -warn declarations -warn alignments -warn ignore_loc -warn usage -check nobounds -align sequence -assume byterecl -fpe3 -ftz" # -mcmodel=medium
         fi
         ;;

Modified: seismo/3D/SPECFEM3D_SESAME/trunk/specfem3D.f90
===================================================================
--- seismo/3D/SPECFEM3D_SESAME/trunk/specfem3D.f90	2008-12-27 21:20:45 UTC (rev 13802)
+++ seismo/3D/SPECFEM3D_SESAME/trunk/specfem3D.f90	2008-12-27 22:34:47 UTC (rev 13803)
@@ -288,7 +288,7 @@
   double precision, dimension(NGLLZ) :: zigll,wzgll
 
 ! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT
   real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
   real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
@@ -480,6 +480,10 @@
 ! and decomposed with METIS or SCOTCH)
   if (.not. USE_EXTERNAL_MESH) stop 'SPECFEM3D_SESAME is for external meshes only'
 
+! check that optimized routines from Deville et al. (2002) can be used
+  if(NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5) &
+    stop 'optimized routines from Deville et al. (2002) such as mxm_m1_m2_5points can only be used if NGLL = 5'
+
 ! info about external mesh simulation
 ! nlegoff -- should be put in compute_parameters and read_parameter_file for clarity
   NPROC = sizeprocs
@@ -655,6 +659,14 @@
          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz)
 
+! define transpose of derivation matrix
+  do j = 1,NGLLY
+    do i = 1,NGLLX
+      hprime_xxT(j,i) = hprime_xx(i,j)
+      hprimewgll_xxT(j,i) = hprimewgll_xx(i,j)
+    enddo
+  enddo
+
 ! allocate 1-D Lagrange interpolators and derivatives
   allocate(hxir(NGLLX))
   allocate(hpxir(NGLLX))
@@ -1784,7 +1796,7 @@
 
 ! assemble all the contributions between slices using MPI
     call compute_forces(NSPEC_AB,NGLOB_AB,displ,accel,xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-         hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+         hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
          kappastore,mustore,jacobian,ibool,ispec_is_inner_ext_mesh,.false., &
          NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,xi_source,eta_source,gamma_source,nu_source,hdur,dt)
 
@@ -1795,7 +1807,7 @@
          request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
 
     call compute_forces(NSPEC_AB,NGLOB_AB,displ,accel,xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-         hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+         hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
          kappastore,mustore,jacobian,ibool,ispec_is_inner_ext_mesh,.true., &
          NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,xi_source,eta_source,gamma_source,nu_source,hdur,dt)