[cig-commits] [commit] devel: adds flop count to compute_forces_viscoelastic_Dev_5p() routine (d364e44)

Thu Sep 11 10:23:39 PDT 2014

Repository : https://github.com/geodynamics/specfem3d

On branch  : devel
Link       : https://github.com/geodynamics/specfem3d/compare/3704c888212d30d103ff64ed797e3904be39dd35...e84e7ba6f8959cb80449833335af69c684b5a5ee

>---------------------------------------------------------------

commit d364e4499b2e6dea9aaf5d01e1e2148ae6fc2ad7
Author: daniel peter <peterda at ethz.ch>
Date:   Tue Aug 26 10:34:43 2014 +0200

    adds flop count to compute_forces_viscoelastic_Dev_5p() routine


>---------------------------------------------------------------

d364e4499b2e6dea9aaf5d01e1e2148ae6fc2ad7
 src/specfem3D/compute_forces_viscoelastic_Dev.F90 | 110 +++++++++++++++++++++-
 1 file changed, 109 insertions(+), 1 deletion(-)

diff --git a/src/specfem3D/compute_forces_viscoelastic_Dev.F90 b/src/specfem3D/compute_forces_viscoelastic_Dev.F90
index 61d716b..b0f8420 100644
--- a/src/specfem3D/compute_forces_viscoelastic_Dev.F90
+++ b/src/specfem3D/compute_forces_viscoelastic_Dev.F90
@@ -259,6 +259,18 @@
 
   do ispec_p = 1,num_elements
 
+! arithmetic intensity: ratio of number-of-arithmetic-operations / number-of-bytes-accessed-on-DRAM
+!
+! hand-counts on floating-point operations: counts addition/subtraction/multiplication/division
+!                                           no counts for operations on indices in do-loops (?)
+!
+!                                           counts accesses to global memory, but not shared/cache memory accesses or register loads/stores
+!                                           float/real has 4 bytes
+
+! hand-counts: floating-point operations FLOP, DRAM accesses in BYTES
+!              for "simplest kernel" (isotropic without attenuation, dynamic fault, etc.)
+!              and for single element, assuming NGLLX == NGLLY == NGLLZ == 5
+
         ! returns element id from stored element list
         ispec = phase_ispec_inner_elastic(ispec_p,iphase)
 
@@ -271,6 +283,11 @@
           endif
         endif ! adjoint
 
+! counts:
+! 0 FLOP
+!
+! 1 float = 4 BYTE
+
        ! Kelvin Voigt damping: artificial viscosity around dynamic faults
 
         ! stores displacment values in local array
@@ -300,6 +317,12 @@
           enddo
         endif
 
+! counts:
+! + 0 FLOP
+! 
+! + NGLLX * NGLLY * NGLLZ * ( 1 + 3 ) float = 2000 BYTE
+
+
         ! use first order Taylor expansion of displacement for local storage of stresses
         ! at this current time step, to fix attenuation in a consistent way
         if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
@@ -354,6 +377,11 @@
           enddo
         enddo
 
+! counts:
+! + m1 * m2 * 3 * 9 = 5 * 25 * 3 * 9 = 3375 FLOP
+!
+! + m1 * 5 float = 100 BYTE  (hprime_xx once, assuming B3_** in cache)
+
         if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
            ! temporary variables used for fixing attenuation in a consistent way
            do j=1,m2
@@ -436,6 +464,12 @@
           enddo
         enddo
 
+! counts:
+! + m1 * m1 * NGLLX * 3 * 9 = 5 * 5 * 5 * 3 * 9 = 3375 FLOP
+!
+! + m1 * 5 float = 100 BYTE  (hprime_xxT once, assuming dummy*_** in cache)
+
+
         if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
            ! temporary variables used for fixing attenuation in a consistent way
            do j=1,m1
@@ -521,6 +555,12 @@
           enddo
         enddo
 
+! counts:
+! + m1 * m2 * 3 * 9 = 5 * 25 * 3 * 9 = 3375 FLOP
+!
+! + 0 BYTE  (assuming A3_**, hprime_xxT in cache)
+
+
         if(ATTENUATION .and. COMPUTE_AND_STORE_STRAIN) then
            ! temporary variables used for fixing attenuation in a consistent way
            do j=1,m1
@@ -594,6 +634,11 @@
               gammazl = gammaz(i,j,k,ispec)
               jacobianl = jacobian(i,j,k,ispec)
 
+! counts:
+! + 0 FLOP
+!
+! + NGLLX * NGLLY * NGLLZ * 10 float = 5000 BYTE  (loads of xix,..,gammaz derivatives & jacobian)
+
               duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
               duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
               duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
@@ -606,6 +651,11 @@
               duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
               duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
 
+! counts:
+! + NGLLX * NGLLY * NGLLZ * 9 * 5 = 5625 FLOP
+!
+! + 0 BYTE  (assuming temp*_** in cache)
+
               ! save strain on the Moho boundary
               if (SAVE_MOHO_MESH ) then
                 if (is_moho_top(ispec)) then
@@ -639,6 +689,11 @@
               duzdxl_plus_duxdzl = duzdxl + duxdzl
               duzdyl_plus_duydzl = duzdyl + duydzl
 
+! counts:
+! + NGLLX * NGLLY * NGLLZ * 6 * 1 = 750 FLOP
+!
+! + 0 BYTE  (assuming registers)
+
               if ( ATTENUATION .and. COMPUTE_AND_STORE_STRAIN ) then
                  ! temporary variables used for fixing attenuation in a consistent way
                  duxdxl_att = xixl*tempx1_att(i,j,k) + etaxl*tempx2_att(i,j,k) + gammaxl*tempx3_att(i,j,k)
@@ -721,6 +776,11 @@
               kappal = kappastore(i,j,k,ispec)
               mul = mustore(i,j,k,ispec)
 
+! counts:
+! + 0 FLOP
+!
+! + NGLLX * NGLLY * NGLLZ * 2 float = 1000 BYTE
+
               ! attenuation
               if(ATTENUATION) then
                 ! use unrelaxed parameters if attenuation
@@ -782,6 +842,11 @@
 
               endif ! ANISOTROPY
 
+! counts:
+! + NGLLX * NGLLY * NGLLZ * 16 =  2000 FLOP
+!
+! + 0 BYTE
+
               ! subtract memory variables if attenuation
               if(ATTENUATION) then
 ! way 1
@@ -863,7 +928,7 @@
                 enddo
               endif
 
-              endif
+            endif ! ATTENUATION
 
             ! define symmetric components of sigma
             sigma_yx = sigma_xy
@@ -883,6 +948,12 @@
             tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
             tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
 
+
+! counts:
+! + NGLLX * NGLLY * NGLLZ * 9 * 6 = 6750 FLOP
+!
+! + NGLLX * NGLLY * NGLLZ * 9 float = 4500 BYTE (temp* stores)
+
             enddo
           enddo
         enddo
@@ -931,6 +1002,11 @@
           enddo
         enddo
 
+! counts:
+! + m1 * m2 * 3 * 9 = 3375 FLOP
+!
+! + m1 * 5 float = 100 BYTE (hprimewgll_xxT once, assumes E3*, C1* in cache)
+
         !   call mxm_m1_m1_5points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
         !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
         do i=1,m1
@@ -956,6 +1032,11 @@
           enddo
         enddo
 
+! counts:
+! + m1 * m1 * NGLLX * 3 * 9 = 3375 FLOP
+!
+! + m1 * 5 float = 100 BYTE (hprimewgll_xx once, assumes E3*, C1* in cache)
+
         ! call mxm_m2_m1_5points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
         do j=1,m1
           do i=1,m2
@@ -977,6 +1058,11 @@
           enddo
         enddo
 
+! counts:
+! + m1 * m2 * 3 * 9 = 3375 FLOP
+!
+! + 0 BYTE (assumes E1*, C1*, hprime* in cache)
+
         do k=1,NGLLZ
           do j=1,NGLLY
             do i=1,NGLLX
@@ -994,6 +1080,13 @@
               accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
                                 fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
 
+
+! counts:
+! + NGLLX * NGLLY * NGLLZ * 3 * 6 = 2250 FLOP
+!
+! + NGLLX * NGLLY * 3 float = 300 BYTE (wgllwgll once)
+! + NGLLX * NGLLY * NGLLZ * (1 + 3 ) float = 2000 BYTE (ibool & accel, assumes newtemp* in cache)
+
               !  update memory variables based upon the Runge-Kutta scheme
               if(ATTENUATION) then
 
@@ -1079,6 +1172,21 @@
           epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
         endif
 
+! counts:
+! + 0 FLOP
+!
+! + 0 BYTE
+
+! counts:
+! -----------------
+! total of: 37625 FLOP per element
+!
+!           15204 BYTE DRAM accesses per element
+!
+! arithmetic intensity: 37625 FLOP / 15204 BYTES ~ 2.5 FLOP/BYTE
+! -----------------
+
+
   enddo  ! spectral element loop
 
   end subroutine compute_forces_viscoelastic_Dev_5p