[cig-commits] [commit] devel: turned -DFORCE_VECTORIZATION off by default in flags.guess to be safe (see https://github.com/geodynamics/specfem3d/issues/81 for more details). Also removed a few unused variables in src/shared/get_global.f90 (e1bd242)

Thu May 1 15:50:37 PDT 2014

Repository : ssh://geoshell/specfem3d

On branch  : devel
Link       : https://github.com/geodynamics/specfem3d/compare/50aa953c1db3f565d76415f5305410a529996b75...4027beef0c03478c027c29d6fa078f1ae2bf5ec9

>---------------------------------------------------------------

commit e1bd242421bb3efc1ef08e8afdd7444748f623f3
Author: Dimitri Komatitsch <komatitsch at lma.cnrs-mrs.fr>
Date:   Fri May 2 00:47:51 2014 +0200

    turned -DFORCE_VECTORIZATION off by default in flags.guess to be safe (see https://github.com/geodynamics/specfem3d/issues/81 for more details).
    Also removed a few unused variables in src/shared/get_global.f90


>---------------------------------------------------------------

e1bd242421bb3efc1ef08e8afdd7444748f623f3
 doc/USER_MANUAL/manual_SPECFEM3D_Cartesian.pdf | Bin 12643602 -> 12644494 bytes
 doc/USER_MANUAL/manual_SPECFEM3D_Cartesian.tex |   6 ++++++
 flags.guess                                    |  23 ++++++++++++++++-------
 src/shared/get_global.f90                      |  11 ++++++++---
 4 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/doc/USER_MANUAL/manual_SPECFEM3D_Cartesian.pdf b/doc/USER_MANUAL/manual_SPECFEM3D_Cartesian.pdf
index ba54322..7e571af 100644
Binary files a/doc/USER_MANUAL/manual_SPECFEM3D_Cartesian.pdf and b/doc/USER_MANUAL/manual_SPECFEM3D_Cartesian.pdf differ
diff --git a/doc/USER_MANUAL/manual_SPECFEM3D_Cartesian.tex b/doc/USER_MANUAL/manual_SPECFEM3D_Cartesian.tex
index d367c21..8256789 100644
--- a/doc/USER_MANUAL/manual_SPECFEM3D_Cartesian.tex
+++ b/doc/USER_MANUAL/manual_SPECFEM3D_Cartesian.tex
@@ -387,6 +387,12 @@ On SGI systems, \texttt{flags.guess} automatically informs \texttt{configure}
 to insert `\texttt{`TRAP\_FPE=OFF}'' into the generated \texttt{Makefile}
 in order to turn underflow trapping off.\\
 
+You can add -DFORCE\_VECTORIZATION to the compiler options in \texttt{flags.guess} (for all compilers except IBM, for which the syntax is -WF,-DFORCE\_VECTORIZATION)
+to speed up the code in the fluid (acoustic) parts only; FORCE\_VECTORIZATION support for the elastic parts has been discontinued in the source code.
+This option works correctly if (and only if) your computer always allocates a contiguous memory block for each allocatable array;
+this is the case for most machines and most compilers, but not all. For more details see \url{https://github.com/geodynamics/specfem3d/issues/81}.
+To check whether the option works correctly on your machine, run the code with and without it for a model containing a significant fluid layer
+(or an entirely fluid model) and make sure that the seismograms are identical.\\
 
 Note that we use CUBIT (now called Trelis) to create meshes of hexahedra, but other packages
 can be used as well, for instance GiD from \url{http://gid.cimne.upc.es}
diff --git a/flags.guess b/flags.guess
index 9152895..7b547a6 100644
--- a/flags.guess
+++ b/flags.guess
@@ -2,6 +2,15 @@
 
 # Attempt to guess suitable flags for the Fortran compiler.
 
+#
+# You can add -DFORCE_VECTORIZATION to the options below (for all compilers except IBM, for which the syntax is -WF,-DFORCE_VECTORIZATION)
+# to speed up the code in the fluid (acoustic) parts only; FORCE_VECTORIZATION support for the elastic parts has been discontinued in the source code.
+# This option works correctly if (and only if) your computer always allocates a contiguous memory block for each allocatable array;
+# this is the case for most machines and most compilers, but not all. For more details see https://github.com/geodynamics/specfem3d/issues/81 .
+# To check whether the option works correctly on your machine, run the code with and without it for a model containing a significant fluid layer
+# (or an entirely fluid model) and make sure that the seismograms are identical.
+#
+
 # can add -DUSE_SERIAL_CASCADE_FOR_IOs to the compiler options to make the mesher output mesh data
 # to the disk for one MPI slice after the other, and to make the solver do the same thing when reading the files back from disk.
 
@@ -42,14 +51,14 @@ case $my_FC in
         #
         # Cray Fortran
         #
-        DEF_FFLAGS="-O3 -Onoaggress -Oipa0 -hfp2 -Ovector3 -Oscalar3 -Ocache2 -Ounroll2 -Ofusion2 -DFORCE_VECTORIZATION -p \$O" # turn on optimization; -Oaggress -Oipa4 would make it even more aggressive
+        DEF_FFLAGS="-O3 -Onoaggress -Oipa0 -hfp2 -Ovector3 -Oscalar3 -Ocache2 -Ounroll2 -Ofusion2 -p \$O" # turn on optimization; -Oaggress -Oipa4 would make it even more aggressive
         # -eC -eD -ec -en -eI -ea -g -G0 -M 1193 -M 1438 # turn on full debugging and range checking
         ;;
     pgf95|*/pgf95|pgf90|*/pgf90)
         #
         # Portland PGI
         #
-        DEF_FFLAGS="-fast -Mnobounds -Minline -Mneginfo -Mdclchk -Knoieee -Minform=inform -Mdaz -Mflushz -Mvect -DFORCE_VECTORIZATION -mcmodel=medium"
+        DEF_FFLAGS="-fast -Mnobounds -Minline -Mneginfo -Mdclchk -Knoieee -Minform=inform -Mdaz -Mflushz -Mvect -mcmodel=medium"
         # -Mbounds
         # -fastsse -tp amd64e -Msmart
         ;;
@@ -62,16 +71,16 @@ case $my_FC in
 # parallel file systems like SFS 3.2 / Lustre 1.8. If omitted
 # I/O throughput lingers at 2.5 MB/s, with it it can increase to ~44 MB/s
 # However it does not make much of a difference on NFS mounted volumes or with SFS 3.1.1 / Lustre 1.6.7.1 
-        DEF_FFLAGS="-O3 -DFORCE_VECTORIZATION -check nobounds -xHost -fpe0 -ftz -assume buffered_io -assume byterecl -align sequence -vec-report0 -std03 -diag-disable 6477 -implicitnone -gen-interfaces -warn all"
+        DEF_FFLAGS="-O3 -check nobounds -xHost -fpe0 -ftz -assume buffered_io -assume byterecl -align sequence -vec-report0 -std03 -diag-disable 6477 -implicitnone -gen-interfaces -warn all"
         # useful for debugging...
-        # for debugging: change "-O3 -check nobounds -DFORCE_VECTORIZATION" to  "-check all -debug -g -O0 -fp-stack-check -traceback -ftrapuv"
+        # for debugging: change "-O3 -check nobounds" to  "-check all -debug -g -O0 -fp-stack-check -traceback -ftrapuv"
         ;;
     gfortran|*/gfortran|f95|*/f95)
         #
         # GNU gfortran
         #
-        DEF_FFLAGS="-std=f2003 -fimplicit-none -frange-check -O2 -DFORCE_VECTORIZATION -fmax-errors=10 -pedantic -pedantic-errors -Waliasing -Wampersand -Wcharacter-truncation -Wline-truncation -Wsurprising -Wno-tabs -Wunderflow -ffpe-trap=invalid,zero,overflow"
-        # for debugging, add -ggdb -fbacktrace -fbounds-check -ffpe-trap=overflow,zero  and suppress -DFORCE_VECTORIZATION
+        DEF_FFLAGS="-std=f2003 -fimplicit-none -frange-check -O2 -fmax-errors=10 -pedantic -pedantic-errors -Waliasing -Wampersand -Wcharacter-truncation -Wline-truncation -Wsurprising -Wno-tabs -Wunderflow -ffpe-trap=invalid,zero,overflow"
+        # for debugging, add -ggdb -fbacktrace -fbounds-check -ffpe-trap=overflow,zero
         ;;
     g95|*/g95)
         #
@@ -144,7 +153,7 @@ case $my_FC in
         # CFLAGS = -Wl,-relax
         #
 # deleted -qxflag=dvz because it requires handler function __xl_dzx and thus linking will fail 
-        DEF_FFLAGS="-O4 -qnostrict -qassert=contig -qhot -q64 -qtune=auto -qarch=auto -qcache=auto -qfree=f90 -qsuffix=f=f90 -qhalt=w -qlanglvl=2003std -g -qsuppress=1518-234 -qsuppress=1518-317 -qsuppress=1518-318 -qsuppress=1500-036 -Q -Q+rank,swap_all -Wl,-relax -WF,-DFORCE_VECTORIZATION"
+        DEF_FFLAGS="-O4 -qnostrict -qassert=contig -qhot -q64 -qtune=auto -qarch=auto -qcache=auto -qfree=f90 -qsuffix=f=f90 -qhalt=w -qlanglvl=2003std -g -qsuppress=1518-234 -qsuppress=1518-317 -qsuppress=1518-318 -qsuppress=1500-036 -Q -Q+rank,swap_all -Wl,-relax"
         # Options -qreport -qsource -qlist create a *.lst file containing detailed information about vectorization.
         # On IBM BlueGene at IDRIS (France) use:
         # -qtune=auto -qarch=450d -qsave     instead of -qtune=auto -qarch=auto
diff --git a/src/shared/get_global.f90 b/src/shared/get_global.f90
index d34713d..d47554d 100644
--- a/src/shared/get_global.f90
+++ b/src/shared/get_global.f90
@@ -45,8 +45,7 @@
   double precision xp(npointot),yp(npointot),zp(npointot)
   double precision UTM_X_MIN,UTM_X_MAX
 
-  integer ispec,i,j,ier
-  integer ieoff,ilocnum,nseg,ioff,iseg,ig
+  integer :: ier
 
   integer, dimension(:), allocatable :: ninseg,idummy
 
@@ -79,7 +78,12 @@
 
 !
 !- we can create a new indirect addressing to reduce cache misses
-! (put into this subroutine but compiler keeps on complaining that it can't vectorize loops...)
+! (put into this subroutine but compiler keeps on complaining that it cannot vectorize loops...)
+!! DK DK
+!! DK DK answer from Dimitri, April 2014: that is normal because the nested loops have a dependency
+!! DK DK (they can write to the same memory location through the mask_ibool() array) and thus
+!! DK DK they cannot be vectorized. Thus the compiler is right.
+!! DK DK
 
   implicit none
 
@@ -126,3 +130,4 @@
   deallocate(mask_ibool,stat=ier); if(ier /= 0) stop 'error in deallocate'
 
   end subroutine get_global_indirect_addressing
+