[cig-commits] r21928 - seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D

dkomati1 at geodynamics.org dkomati1 at geodynamics.org
Wed Apr 24 16:33:11 PDT 2013


Author: dkomati1
Date: 2013-04-24 16:33:11 -0700 (Wed, 24 Apr 2013)
New Revision: 21928

Modified:
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90
Log:
done suppressing _HANDOPT, which was unused; we know how to vectorize the code better, we will do that once it is restructured to undo attenuation


Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90	2013-04-24 23:23:11 UTC (rev 21927)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90	2013-04-24 23:33:11 UTC (rev 21928)
@@ -25,21 +25,6 @@
 !
 !=====================================================================
 
-! preprocessing definition: #define _HANDOPT :  turns hand-optimized code on
-!                                         #undef _HANDOPT :  turns hand-optimized code off
-! or compile with: -D_HANDOPT
-!#define _HANDOPT
-
-! BEWARE:
-! BEWARE: we have observed that using _HANDOPT in combination with -O3 or higher can lead to problems on some machines;
-! BEWARE: thus, be careful when using it. At the very least, run the same test simulation once with _HANDOPT and once without
-! BEWARE: and make sure that all the seismograms you get are the same down to roundoff noise.
-! BEWARE:
-
-! note: these hand optimizations should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          the original routines are commented with "! way 1", the hand-optimized routines with  "! way 2"
-
   subroutine compute_element_iso(ispec, &
                     minus_gravity_table,density_table,minus_deriv_gravity_table, &
                     xstore,ystore,zstore, &
@@ -1286,82 +1271,20 @@
 ! local parameters
   real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1
   integer :: i_SLS
-#ifdef _HANDOPT
-  real(kind=CUSTOM_REAL) R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  integer :: imodulo_N_SLS
-  integer :: i_SLS1,i_SLS2
-#endif
 
-#ifdef _HANDOPT
-! way 2:
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  if(imodulo_N_SLS >= 1) then
-    do i_SLS = 1,imodulo_N_SLS
-      R_xx_val1 = R_memory_loc(1,i_SLS)
-      R_yy_val1 = R_memory_loc(2,i_SLS)
-      sigma_xx = sigma_xx - R_xx_val1
-      sigma_yy = sigma_yy - R_yy_val1
-      sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-      sigma_xy = sigma_xy - R_memory_loc(3,i_SLS)
-      sigma_xz = sigma_xz - R_memory_loc(4,i_SLS)
-      sigma_yz = sigma_yz - R_memory_loc(5,i_SLS)
-    enddo
-  endif
-  if(N_SLS >= imodulo_N_SLS+1) then
-    ! note: another possibility would be using a reduction example for this loop; was tested but it does not improve,
-    ! probably since N_SLS == 3 is too small for a loop benefit
-    do i_SLS = imodulo_N_SLS+1,N_SLS,3
-      R_xx_val1 = R_memory_loc(1,i_SLS)
-      R_yy_val1 = R_memory_loc(2,i_SLS)
-      sigma_xx = sigma_xx - R_xx_val1
-      sigma_yy = sigma_yy - R_yy_val1
-      sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-      sigma_xy = sigma_xy - R_memory_loc(3,i_SLS)
-      sigma_xz = sigma_xz - R_memory_loc(4,i_SLS)
-      sigma_yz = sigma_yz - R_memory_loc(5,i_SLS)
-
-      i_SLS1=i_SLS+1
-      R_xx_val2 = R_memory_loc(1,i_SLS1)
-      R_yy_val2 = R_memory_loc(2,i_SLS1)
-      sigma_xx = sigma_xx - R_xx_val2
-      sigma_yy = sigma_yy - R_yy_val2
-      sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-      sigma_xy = sigma_xy - R_memory_loc(3,i_SLS1)
-      sigma_xz = sigma_xz - R_memory_loc(4,i_SLS1)
-      sigma_yz = sigma_yz - R_memory_loc(5,i_SLS1)
-
-      i_SLS2 =i_SLS+2
-      R_xx_val3 = R_memory_loc(1,i_SLS2)
-      R_yy_val3 = R_memory_loc(2,i_SLS2)
-      sigma_xx = sigma_xx - R_xx_val3
-      sigma_yy = sigma_yy - R_yy_val3
-      sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-      sigma_xy = sigma_xy - R_memory_loc(3,i_SLS2)
-      sigma_xz = sigma_xz - R_memory_loc(4,i_SLS2)
-      sigma_yz = sigma_yz - R_memory_loc(5,i_SLS2)
-    enddo
-  endif
-#else
-! way 1:
   do i_SLS = 1,N_SLS
-    R_xx_val1 = R_memory_loc(1,i_SLS) ! R_memory(1,i_SLS,i,j,k,ispec)
-    R_yy_val1 = R_memory_loc(2,i_SLS) ! R_memory(2,i_SLS,i,j,k,ispec)
+    R_xx_val1 = R_memory_loc(1,i_SLS)
+    R_yy_val1 = R_memory_loc(2,i_SLS)
     sigma_xx = sigma_xx - R_xx_val1
     sigma_yy = sigma_yy - R_yy_val1
     sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-    sigma_xy = sigma_xy - R_memory_loc(3,i_SLS) ! R_memory(3,i_SLS,i,j,k,ispec)
-    sigma_xz = sigma_xz - R_memory_loc(4,i_SLS) ! R_memory(4,i_SLS,i,j,k,ispec)
-    sigma_yz = sigma_yz - R_memory_loc(5,i_SLS) ! R_memory(5,i_SLS,i,j,k,ispec)
+    sigma_xy = sigma_xy - R_memory_loc(3,i_SLS)
+    sigma_xz = sigma_xz - R_memory_loc(4,i_SLS)
+    sigma_yz = sigma_yz - R_memory_loc(5,i_SLS)
   enddo
-#endif
 
   end subroutine compute_element_att_stress
 
-
 !
 !--------------------------------------------------------------------------------------------
 !
@@ -1415,12 +1338,7 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX, NGLLY, NGLLZ) :: factor_common_c44_muv
   integer :: i_SLS
 
-#ifdef _HANDOPT
-  real(kind=CUSTOM_REAL) :: alphal,betal,gammal
-  integer :: i,j,k
-#else
   integer :: i_memory
-#endif
 
   ! use Runge-Kutta scheme to march in time
 
@@ -1428,14 +1346,7 @@
   ! IMPROVE we use mu_v here even if there is some anisotropy
   ! IMPROVE we should probably use an average value instead
 
-#ifdef _HANDOPT
-! way 2:
   do i_SLS = 1,N_SLS
-
-    alphal = alphaval(i_SLS)
-    betal = betaval(i_SLS)
-    gammal = gammaval(i_SLS)
-
     ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
     factor_common_c44_muv(:,:,:) = factor_common(i_SLS,:,:,:,ispec)
 
@@ -1445,38 +1356,13 @@
       factor_common_c44_muv(:,:,:) = factor_common_c44_muv(:,:,:) * muvstore(:,:,:,ispec)
     endif
 
-    ! this helps to vectorize the inner most loop
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-          R_memory(:,i_SLS,i,j,k,ispec) = alphal * R_memory(:,i_SLS,i,j,k,ispec) &
-                  + factor_common_c44_muv(i,j,k) &
-                  *( betal * epsilondev(:,i,j,k,ispec) + gammal * epsilondev_loc(:,i,j,k))
-        enddo
-      enddo
-    enddo
-  enddo ! i_SLS
-#else
-! way 1:
-  do i_SLS = 1,N_SLS
-    ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
-    factor_common_c44_muv(:,:,:) = factor_common(i_SLS,:,:,:,ispec)
-
-    if(ANISOTROPIC_3D_MANTLE_VAL) then
-      factor_common_c44_muv(:,:,:) = factor_common_c44_muv(:,:,:) * c44store(:,:,:,ispec)
-    else
-      factor_common_c44_muv(:,:,:) = factor_common_c44_muv(:,:,:) * muvstore(:,:,:,ispec)
-    endif
-
     do i_memory = 1,5
       R_memory(i_memory,i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_memory(i_memory,i_SLS,:,:,:,ispec) &
                 + factor_common_c44_muv(:,:,:) &
                 * (betaval(i_SLS) * epsilondev(i_memory,:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(i_memory,:,:,:))
     enddo
   enddo ! i_SLS
-#endif
 
-
   end subroutine compute_element_att_memory_cr
 
 !
@@ -1532,12 +1418,7 @@
 
   integer :: i_SLS
 
-#ifdef _HANDOPT
-  real(kind=CUSTOM_REAL) :: alphal,betal,gammal
-  integer :: i,j,k
-#else
   integer :: i_memory
-#endif
 
   ! use Runge-Kutta scheme to march in time
 
@@ -1545,43 +1426,14 @@
   ! IMPROVE we use mu_v here even if there is some anisotropy
   ! IMPROVE we should probably use an average value instead
 
-#ifdef _HANDOPT
-! way 2:
   do i_SLS = 1,N_SLS
-
-    alphal = alphaval(i_SLS)
-    betal = betaval(i_SLS)
-    gammal = gammaval(i_SLS)
-
-    ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
     factor_common_use(:,:,:) = factor_common(i_SLS,:,:,:,ispec)
-
-    factor_common_use(:,:,:) = factor_common_use(:,:,:) * muvstore(:,:,:,ispec)
-
-    ! this helps to vectorize the inner most loop
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-          R_memory(:,i_SLS,i,j,k,ispec) = alphal * R_memory(:,i_SLS,i,j,k,ispec) &
-                  + factor_common_use(i,j,k) &
-                  *( betal * epsilondev(:,i,j,k,ispec) + gammal * epsilondev_loc(:,i,j,k))
-        enddo
-      enddo
-    enddo
-
-  enddo ! i_SLS
-#else
-! way 1:
-  do i_SLS = 1,N_SLS
-    factor_common_use(:,:,:) = factor_common(i_SLS,:,:,:,ispec)
     do i_memory = 1,5
        R_memory(i_memory,i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_memory(i_memory,i_SLS,:,:,:,ispec) &
             + muvstore(:,:,:,ispec) * factor_common_use(:,:,:) * &
             (betaval(i_SLS) * epsilondev(i_memory,:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(i_memory,:,:,:))
     enddo
   enddo
-#endif
 
   end subroutine compute_element_att_memory_ic
 
-

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2013-04-24 23:23:11 UTC (rev 21927)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2013-04-24 23:33:11 UTC (rev 21928)
@@ -25,25 +25,9 @@
 !
 !=====================================================================
 
-! preprocessing definition: #define _HANDOPT :  turns hand-optimized code on
-!                                         #undef _HANDOPT :  turns hand-optimized code off
-! or compile with: -D_HANDOPT
-!#define _HANDOPT
-
-! BEWARE:
-! BEWARE: we have observed that using _HANDOPT in combination with -O3 or higher can lead to problems on some machines;
-! BEWARE: thus, be careful when using it. At the very least, run the same test simulation once with _HANDOPT and once without
-! BEWARE: and make sure that all the seismograms you get are the same down to roundoff noise.
-! BEWARE:
-
-! note: these hand optimizations should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          the original routines are commented with "! way 1", the hand-optimized routines with  "! way 2"
-
   subroutine compute_forces_crust_mantle_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
           displ_crust_mantle,accel_crust_mantle,xstore,ystore,zstore, &
           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-!----------------------
             is_on_a_slice_edge_crust_mantle,icall, &
             accel_inner_core,ibool_inner_core,idoubling_inner_core, &
             myrank,iproc_xi,iproc_eta,ichunk,addressing, &
@@ -60,7 +44,6 @@
             nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
             npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
             receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
           hprime_xx,hprime_xxT, &
           hprimewgll_xx,hprimewgll_xxT, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
@@ -69,7 +52,6 @@
           c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
           c36store,c44store,c45store,c46store,c55store,c56store,c66store, &
           ibool,ispec_is_tiso, &
-        ! --idoubling,
           R_memory,epsilondev,epsilon_trace_over_3,one_minus_sum_beta, &
           alphaval,betaval,gammaval,factor_common,vx,vy,vz,vnspec)
 
@@ -245,10 +227,6 @@
   integer NSPEC2D_BOTTOM_INNER_CORE,iend,ispec_glob
   integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
 
-#ifdef _HANDOPT
-  integer, dimension(5) :: iglobv5
-#endif
-
 ! ****************************************************
 !   big loop over all spectral elements in the solid
 ! ****************************************************
@@ -283,9 +261,6 @@
 !$OMP tempx1,tempx2,tempx3, &
 !$OMP newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3, &
 !$OMP dummyx_loc,dummyy_loc,dummyz_loc,rho_s_H,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
-#ifdef _HANDOPT
-!$OMP iglobv5, &
-#endif
 !$OMP iglob1,epsilondev_loc)
 
   do ispec_glob = 1,NSPEC_CRUST_MANTLE,ELEMENTS_NONBLOCKING_CM_IC
@@ -344,43 +319,15 @@
     ! pages 386 and 389 and Figure 8.3.1
     do k=1,NGLLZ
       do j=1,NGLLY
-
-#ifdef _HANDOPT
-! way 2:
-        ! since we know that NGLLX = 5, this should help pipelining
-        iglobv5(:) = ibool(:,j,k,ispec)
-
-        dummyx_loc(1,j,k) = displ_crust_mantle(1,iglobv5(1))
-        dummyy_loc(1,j,k) = displ_crust_mantle(2,iglobv5(1))
-        dummyz_loc(1,j,k) = displ_crust_mantle(3,iglobv5(1))
-
-        dummyx_loc(2,j,k) = displ_crust_mantle(1,iglobv5(2))
-        dummyy_loc(2,j,k) = displ_crust_mantle(2,iglobv5(2))
-        dummyz_loc(2,j,k) = displ_crust_mantle(3,iglobv5(2))
-
-        dummyx_loc(3,j,k) = displ_crust_mantle(1,iglobv5(3))
-        dummyy_loc(3,j,k) = displ_crust_mantle(2,iglobv5(3))
-        dummyz_loc(3,j,k) = displ_crust_mantle(3,iglobv5(3))
-
-        dummyx_loc(4,j,k) = displ_crust_mantle(1,iglobv5(4))
-        dummyy_loc(4,j,k) = displ_crust_mantle(2,iglobv5(4))
-        dummyz_loc(4,j,k) = displ_crust_mantle(3,iglobv5(4))
-
-        dummyx_loc(5,j,k) = displ_crust_mantle(1,iglobv5(5))
-        dummyy_loc(5,j,k) = displ_crust_mantle(2,iglobv5(5))
-        dummyz_loc(5,j,k) = displ_crust_mantle(3,iglobv5(5))
-
-#else
-! way 1:
         do i=1,NGLLX
             iglob1 = ibool(i,j,k,ispec)
             dummyx_loc(i,j,k) = displ_crust_mantle(1,iglob1)
             dummyy_loc(i,j,k) = displ_crust_mantle(2,iglob1)
             dummyz_loc(i,j,k) = displ_crust_mantle(3,iglob1)
         enddo
-#endif
       enddo
     enddo
+
     do j=1,m2
       do i=1,m1
         C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
@@ -402,6 +349,7 @@
                               hprime_xx(i,5)*B3_m1_m2_5points(5,j)
       enddo
     enddo
+
     do j=1,m1
       do i=1,m1
         ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
@@ -426,6 +374,7 @@
         enddo
       enddo
     enddo
+
     do j=1,m1
       do i=1,m2
         C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
@@ -520,6 +469,7 @@
                               hprimewgll_xxT(i,5)*C3_m1_m2_5points(5,j)
       enddo
     enddo
+
     do i=1,m1
       do j=1,m1
         ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
@@ -544,6 +494,7 @@
         enddo
       enddo
     enddo
+
     do j=1,m1
       do i=1,m2
         E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
@@ -581,31 +532,16 @@
           if(GRAVITY_VAL) sum_terms(:,i,j,k) = sum_terms(:,i,j,k) + rho_s_H(:,i,j,k)
 
         enddo ! NGLLX
-
       enddo ! NGLLY
     enddo ! NGLLZ
 
     ! sum contributions from each element to the global mesh and add gravity terms
     do k=1,NGLLZ
       do j=1,NGLLY
-
-#ifdef _HANDOPT
-! way 2:
-        iglobv5(:) = ibool(:,j,k,ispec)
-
-        accel_crust_mantle(:,iglobv5(1)) = accel_crust_mantle(:,iglobv5(1)) + sum_terms(:,1,j,k)
-        accel_crust_mantle(:,iglobv5(2)) = accel_crust_mantle(:,iglobv5(2)) + sum_terms(:,2,j,k)
-        accel_crust_mantle(:,iglobv5(3)) = accel_crust_mantle(:,iglobv5(3)) + sum_terms(:,3,j,k)
-        accel_crust_mantle(:,iglobv5(4)) = accel_crust_mantle(:,iglobv5(4)) + sum_terms(:,4,j,k)
-        accel_crust_mantle(:,iglobv5(5)) = accel_crust_mantle(:,iglobv5(5)) + sum_terms(:,5,j,k)
-
-#else
-! way 1:
         do i=1,NGLLX
           iglob1 = ibool(i,j,k,ispec)
           accel_crust_mantle(:,iglob1) = accel_crust_mantle(:,iglob1) + sum_terms(:,i,j,k)
         enddo
-#endif
       enddo
     enddo
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90	2013-04-24 23:23:11 UTC (rev 21927)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90	2013-04-24 23:33:11 UTC (rev 21928)
@@ -25,25 +25,9 @@
 !
 !=====================================================================
 
-! preprocessing definition: #define _HANDOPT :  turns hand-optimized code on
-!                                         #undef _HANDOPT :  turns hand-optimized code off
-! or compile with: -D_HANDOPT
-!#define _HANDOPT
-
-! BEWARE:
-! BEWARE: we have observed that using _HANDOPT in combination with -O3 or higher can lead to problems on some machines;
-! BEWARE: thus, be careful when using it. At the very least, run the same test simulation once with _HANDOPT and once without
-! BEWARE: and make sure that all the seismograms you get are the same down to roundoff noise.
-! BEWARE:
-
-! note: these hand optimizations should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          the original routines are commented with "! way 1", the hand-optimized routines with  "! way 2"
-
   subroutine compute_forces_inner_core_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
           displ_inner_core,accel_inner_core,xstore,ystore,zstore, &
           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-!----------------------
             is_on_a_slice_edge_inner_core,icall, &
             accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
             myrank,iproc_xi,iproc_eta,ichunk,addressing, &
@@ -60,7 +44,6 @@
             nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
             npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
             receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
           hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
           kappavstore,muvstore,ibool,idoubling, &
@@ -254,10 +237,6 @@
   integer NSPEC2D_BOTTOM_INNER_CORE
   integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
 
-#ifdef _HANDOPT
-  integer, dimension(5) :: iglobv5
-#endif
-
 ! ****************************************************
 !   big loop over all spectral elements in the solid
 ! ****************************************************
@@ -309,40 +288,12 @@
 
       do k=1,NGLLZ
         do j=1,NGLLY
-#ifdef _HANDOPT
-! way 2:
-        ! since we know that NGLLX = 5, this should help pipelining
-        iglobv5(:) = ibool(:,j,k,ispec)
-
-        dummyx_loc(1,j,k) = displ_inner_core(1,iglobv5(1))
-        dummyy_loc(1,j,k) = displ_inner_core(2,iglobv5(1))
-        dummyz_loc(1,j,k) = displ_inner_core(3,iglobv5(1))
-
-        dummyx_loc(2,j,k) = displ_inner_core(1,iglobv5(2))
-        dummyy_loc(2,j,k) = displ_inner_core(2,iglobv5(2))
-        dummyz_loc(2,j,k) = displ_inner_core(3,iglobv5(2))
-
-        dummyx_loc(3,j,k) = displ_inner_core(1,iglobv5(3))
-        dummyy_loc(3,j,k) = displ_inner_core(2,iglobv5(3))
-        dummyz_loc(3,j,k) = displ_inner_core(3,iglobv5(3))
-
-        dummyx_loc(4,j,k) = displ_inner_core(1,iglobv5(4))
-        dummyy_loc(4,j,k) = displ_inner_core(2,iglobv5(4))
-        dummyz_loc(4,j,k) = displ_inner_core(3,iglobv5(4))
-
-        dummyx_loc(5,j,k) = displ_inner_core(1,iglobv5(5))
-        dummyy_loc(5,j,k) = displ_inner_core(2,iglobv5(5))
-        dummyz_loc(5,j,k) = displ_inner_core(3,iglobv5(5))
-
-#else
-! way 1:
           do i=1,NGLLX
             iglob1 = ibool(i,j,k,ispec)
             dummyx_loc(i,j,k) = displ_inner_core(1,iglob1)
             dummyy_loc(i,j,k) = displ_inner_core(2,iglob1)
             dummyz_loc(i,j,k) = displ_inner_core(3,iglob1)
           enddo
-#endif
         enddo
       enddo
 
@@ -367,6 +318,7 @@
                                 hprime_xx(i,5)*B3_m1_m2_5points(5,j)
         enddo
       enddo
+
       do j=1,m1
         do i=1,m1
           ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
@@ -391,6 +343,7 @@
           enddo
         enddo
       enddo
+
       do j=1,m1
         do i=1,m2
           C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
@@ -704,6 +657,7 @@
                                 hprimewgll_xxT(i,5)*C3_m1_m2_5points(5,j)
         enddo
       enddo
+
       do i=1,m1
         do j=1,m1
           ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
@@ -728,6 +682,7 @@
           enddo
         enddo
       enddo
+
       do j=1,m1
         do i=1,m2
           E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
@@ -771,23 +726,10 @@
       ! sum contributions from each element to the global mesh and add gravity terms
       do k=1,NGLLZ
         do j=1,NGLLY
-#ifdef _HANDOPT
-! way 2:
-          iglobv5(:) = ibool(:,j,k,ispec)
-
-          accel_inner_core(:,iglobv5(1)) = accel_inner_core(:,iglobv5(1)) + sum_terms(:,1,j,k)
-          accel_inner_core(:,iglobv5(2)) = accel_inner_core(:,iglobv5(2)) + sum_terms(:,2,j,k)
-          accel_inner_core(:,iglobv5(3)) = accel_inner_core(:,iglobv5(3)) + sum_terms(:,3,j,k)
-          accel_inner_core(:,iglobv5(4)) = accel_inner_core(:,iglobv5(4)) + sum_terms(:,4,j,k)
-          accel_inner_core(:,iglobv5(5)) = accel_inner_core(:,iglobv5(5)) + sum_terms(:,5,j,k)
-
-#else
-! way 1:
           do i=1,NGLLX
             iglob1 = ibool(i,j,k,ispec)
             accel_inner_core(:,iglob1) = accel_inner_core(:,iglob1) + sum_terms(:,i,j,k)
           enddo
-#endif
         enddo
       enddo
 



More information about the CIG-COMMITS mailing list