[cig-commits] r19657 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER: DATA EXAMPLES/regional_Greece_small EXAMPLES/regional_Greece_small/DATA src/cuda src/specfem3D

Mon Feb 20 16:33:57 PST 2012

Author: danielpeter
Date: 2012-02-20 16:33:56 -0800 (Mon, 20 Feb 2012)
New Revision: 19657

Removed:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
Modified:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/CMTSOLUTION
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/Par_file
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/STATIONS
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/process.sh
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/Makefile.in
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
Log:
updates stacey and sources routines

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/CMTSOLUTION
===================================================================

--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/CMTSOLUTION	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/CMTSOLUTION	2012-02-21 00:33:56 UTC (rev 19657)
@@ -1,13 +1,13 @@
-PDE 1994  6  9  0 33 16.40 -13.8300  -67.5600 637.0 6.9 6.8 NORTHERN BOLIVIA                        
-event name:     060994A        
-time shift:     29.0000
-half duration:  20.0000
-latitude:      -13.8200
-longitude:     -67.2500
-depth:         647.1000
-Mrr:      -7.590000e+27
-Mtt:       7.750000e+27
-Mpp:      -1.600000e+26
-Mrt:      -2.503000e+28
-Mrp:       4.200000e+26
-Mtp:      -2.480000e+27
+PDEW 2008  1  6  5 14 23.70  37.2200   22.6900  75.0 6.1 6.2 SOUTHERN GREECE
+event name:     200801060514A
+time shift:      0.000
+half duration:   3.0000
+latitude:       36.9800
+longitude:      22.8700
+depth:          92.3900
+Mrr:       7.740000e+24
+Mtt:      -1.830000e+25
+Mpp:       1.060000e+25
+Mrt:       1.290000e+25
+Mrp:      -8.450000e+24
+Mtp:      -4.430000e+24

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/Par_file
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/Par_file	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/Par_file	2012-02-21 00:33:56 UTC (rev 19657)
@@ -1,11 +1,11 @@
 
 # forward or adjoint simulation
-SIMULATION_TYPE                 = 1
+SIMULATION_TYPE                 = 1 
 NOISE_TOMOGRAPHY                = 0        # flag of noise tomography, three steps (1,2,3). If earthquake simulation, set it to 0.
 SAVE_FORWARD                    = .false.  # save last frame of forward simulation or not
 
 # number of chunks (1,2,3 or 6)
-NCHUNKS                         = 1
+NCHUNKS                         = 1 
 
 # angular width of the first chunk (not used if full sphere with six chunks)
 ANGULAR_WIDTH_XI_IN_DEGREES   = 20.d0      # angular size of a chunk
@@ -16,37 +16,32 @@
 
 # number of elements at the surface along the two sides of the first chunk
 # (must be multiple of 16 and 8 * multiple of NPROC below)
-NEX_XI                          = 64
+NEX_XI                          = 64 
 NEX_ETA                         = 64
 
 # number of MPI processors along the two sides of the first chunk
-NPROC_XI                        = 2
+NPROC_XI                        = 2 
 NPROC_ETA                       = 2
 
 # 1D models with real structure:
-# 1D_isotropic_prem, 1D_transversely_isotropic_prem, 1D_iasp91, 1D_1066a, 1D_ak135, 1D_ref, 1D_ref_iso, 1D_jp3d,1D_sea99
+# 1D_isotropic_prem, 1D_transversely_isotropic_prem, 1D_iasp91, 1D_1066a, 1D_ak135, 1D_ref, 1D_ref_iso
 #
 # 1D models with only one fictitious averaged crustal layer:
 # 1D_isotropic_prem_onecrust, 1D_transversely_isotropic_prem_onecrust, 1D_iasp91_onecrust, 1D_1066a_onecrust, 1D_ak135_onecrust
 #
 # fully 3D models:
 # transversely_isotropic_prem_plus_3D_crust_2.0, 3D_anisotropic, 3D_attenuation,
-# s20rts, s40rts, s362ani, s362iso, s362wmani, s362ani_prem, s362ani_3DQ, s362iso_3DQ,
+# s20rts, s362ani, s362iso, s362wmani, s362ani_prem, s362ani_3DQ, s362iso_3DQ,
 # s29ea, s29ea,sea99_jp3d1994,sea99,jp3d1994,heterogen
-#
-# 3D models with 1D crust: append "_1Dcrust" the the 3D model name
-#                          to take the 1D crustal model from the
-#                          associated reference model rather than the default 3D crustal model 
-# e.g. s20rts_1Dcrust, s362ani_1Dcrust, etc.
 MODEL                           = s362ani
 
 # parameters describing the Earth model
-OCEANS                          = .true.
-ELLIPTICITY                     = .true.
-TOPOGRAPHY                      = .true.
-GRAVITY                         = .true.
-ROTATION                        = .true.
-ATTENUATION                     = .true.
+OCEANS                          = .false.
+ELLIPTICITY                     = .false.
+TOPOGRAPHY                      = .false.
+GRAVITY                         = .false.
+ROTATION                        = .false.
+ATTENUATION                     = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .true.
@@ -55,12 +50,10 @@
 RECORD_LENGTH_IN_MINUTES        = 2.5d0
 
 # save AVS or OpenDX movies
-#MOVIE_COARSE saves movie only at corners of elements (SURFACE OR VOLUME) 
-#MOVIE_COARSE does not work with create_movie_AVS_DX
 MOVIE_SURFACE                   = .false.
 MOVIE_VOLUME                    = .false.
-MOVIE_COARSE                    = .false.
-NTSTEP_BETWEEN_FRAMES           = 100
+MOVIE_COARSE                    = .true.
+NTSTEP_BETWEEN_FRAMES           = 50
 HDUR_MOVIE                      = 0.d0
 
 # save movie in volume.  Will save element if center of element is in prescribed volume
@@ -69,6 +62,7 @@
 # start/stop: frames will be stored at MOVIE_START + i*NSTEP_BETWEEN_FRAMES, where i=(0,1,2..) and iNSTEP_BETWEEN_FRAMES <= MOVIE_STOP
 # movie_volume_type: 1=strain, 2=time integral of strain, 3=\mu*time integral of strain
 # type 4 saves the trace and deviatoric stress in the whole volume, 5=displacement, 6=velocity
+#MOVIE_VOLUME_COARSE saves movie only at corners of elements
 MOVIE_VOLUME_TYPE               = 2
 MOVIE_TOP_KM                    = -100.0
 MOVIE_BOTTOM_KM                 = 1000.0
@@ -80,7 +74,7 @@
 MOVIE_STOP                      = 40000
 
 # save mesh files to check the mesh
-SAVE_MESH_FILES                 = .false.
+SAVE_MESH_FILES                 = .true.
 
 # restart files (number of runs can be 1, 2 or 3, choose 1 for no restart files)
 NUMBER_OF_RUNS                  = 1
@@ -90,7 +84,7 @@
 LOCAL_PATH                      = ./DATABASES_MPI
 
 # interval at which we output time step info and max of norm of displacement
-NTSTEP_BETWEEN_OUTPUT_INFO      = 100
+NTSTEP_BETWEEN_OUTPUT_INFO      = 50
 
 # interval in time steps for temporary writing of seismograms
 NTSTEP_BETWEEN_OUTPUT_SEISMOS   = 5000000
@@ -105,7 +99,7 @@
 ROTATE_SEISMOGRAMS_RT           = .false.
 
 # decide if master process writes all the seismograms or if all processes do it in parallel
-WRITE_SEISMOGRAMS_BY_MASTER     = .true.
+WRITE_SEISMOGRAMS_BY_MASTER     = .false.
 
 # save all seismograms in one large combined file instead of one file per seismogram
 # to avoid overloading shared non-local file systems such as GPFS for instance
@@ -120,3 +114,4 @@
 
 # set to true to use GPUs 
 GPU_MODE                        = .false.
+

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/STATIONS
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/STATIONS	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/STATIONS	2012-02-21 00:33:56 UTC (rev 19657)
@@ -1,129 +1,6 @@
-AAK        II       42.6390     74.4940    1645.0    30.0
-ABKT       II       37.9304     58.1189     678.0     7.0
-ABPO       II      -19.0180     47.2290    1528.0     5.3
-ALE        II       82.5033    -62.3500      60.0     0.0
-ARU        II       56.4302     58.5625     250.0     0.0
-ASCN       II       -7.9327    -14.3601     173.0   100.0
-BFO        II       48.3319      8.3311     589.0     0.0
-BORG       II       64.7474    -21.3268     110.0    95.0
-BRVK       II       53.0581     70.2828     330.0    15.0
-CMLA       II       37.7637    -25.5243     429.0     7.0
-COCO       II      -12.1901     96.8349       1.0    70.0
-DGAR       II       -7.4121     72.4525       1.0     2.0
-EFI        II      -51.6753    -58.0637     110.0    80.0
-ERM        II       42.0150    143.1572      40.0     0.0
-ESK        II       55.3167     -3.2050     242.0     0.0
-FFC        II       54.7250   -101.9783     338.0     0.0
-GAR        II       39.0000     70.3167    1300.0     0.0
-HOPE       II      -54.2836    -36.4879      20.0     0.0
-JTS        II       10.2908    -84.9525     340.0     0.0
-KAPI       II       -5.0142    119.7517     300.0   100.0
-KDAK       II       57.7828   -152.5835     152.0     5.5
-KIV        II       43.9562     42.6888    1210.0     0.0
-KURK       II       50.7154     78.6202     184.0    25.0
-KWAJ       II        8.8019    167.6130       0.0     0.0
-LVZ        II       67.8979     34.6514     630.0   200.0
-MBAR       II       -0.6019     30.7382    1390.0   100.0
-MSEY       II       -4.6737     55.4792     475.0    91.0
-MSVF       II      -17.7448    178.0528     801.1   100.0
-NIL        II       33.6506     73.2686     629.0    68.0
-NNA        II      -11.9875    -76.8422     575.0    40.0
-NRIL       II       69.5049     88.4414      92.0   506.0
-NVS        II       54.8404     83.2346     150.0     0.0
-OBN        II       55.1146     36.5674     160.0    30.0
-PALK       II        7.2728     80.7022     460.0    90.0
-RAYN       II       23.5225     45.5032     631.0     2.0
-RPN        II      -27.1267   -109.3344     110.0     0.0
-SACV       II       14.9702    -23.6085     387.0    97.0
-SHEL       II      -15.9588     -5.7457     537.0    60.0
-SUR        II      -32.3797     20.8117    1770.0     0.0
-TAU        II      -42.9099    147.3204     132.0     0.0
-TLY        II       51.6807    103.6438     579.0    20.0
-UOSS       II       24.9453     56.2042     284.4     0.0
-WRAB       II      -19.9336    134.3600     366.0   100.0
-XPFO       II       33.6107   -116.4555    1280.0     0.0
-AAE        IU        9.0292     38.7656    2442.0     0.0
-ADK        IU       51.8823   -176.6842     130.0     0.0
-AFI        IU      -13.9093   -171.7773     705.0     1.0
-ANMO       IU       34.9459   -106.4572    1720.0   100.0
-ANTO       IU       39.8680     32.7934     895.0   195.0
-BBSR       IU       32.3713    -64.6963      -1.3    31.4
-BILL       IU       68.0653    166.4531     320.0     0.0
-BOCO       IU        4.5869    -74.0432    3137.0    23.0
-CASY       IU      -66.2792    110.5354       5.0     5.0
-CCM        IU       38.0557    -91.2446     171.0    51.0
-CHTO       IU       18.8141     98.9443     420.0     0.0
-COLA       IU       64.8736   -147.8616     200.0     0.0
-COR        IU       44.5855   -123.3046     110.0     0.0
-CTAO       IU      -20.0882    146.2545     320.0    37.0
-DAV        IU        7.0697    125.5791     149.0     1.0
-DWPF       IU       28.1103    -81.4327    -132.0   162.0
-FUNA       IU       -8.5259    179.1966      19.0     1.0
-FURI       IU        8.8952     38.6798    2565.0     5.0
-GNI        IU       40.1480     44.7410    1509.0   100.0
 GRFO       IU       49.6909     11.2203     384.0   116.0
-GUMO       IU       13.5893    144.8684      61.0   109.0
-HKT        IU       29.9618    -95.8384    -413.0   450.0
-HNR        IU       -9.4387    159.9475       0.0   100.0
-HRV        IU       42.5064    -71.5583     200.0     0.0
-INCN       IU       37.4776    126.6239      79.0     1.0
-JOHN       IU       16.7329   -169.5292     -37.0    39.0
-KBS        IU       78.9154     11.9385      90.0     3.0
-KEV        IU       69.7565     27.0035      85.0    15.0
-KIEV       IU       50.7012     29.2242     140.0    40.0
-KIP        IU       21.4233   -158.0150      37.0    33.0
-KMBO       IU       -1.1271     37.2525    1930.0    20.0
-KNTN       IU       -2.7744   -171.7186      18.0     2.0
-KONO       IU       59.6491      9.5982     216.0   340.0
-KOWA       IU       14.4967     -4.0140     316.0     5.0
-LCO        IU      -29.0110    -70.7004    2300.0     0.0
-LSZ        IU      -15.2779     28.1882    1200.0     0.0
-LVC        IU      -22.6127    -68.9111    2930.0    30.0
-MA2        IU       59.5756    150.7700     337.0     2.0
-MAJO       IU       36.5457    138.2041     405.0     0.0
-MAKZ       IU       46.8080     81.9770     590.0    10.0
-MBWA       IU      -21.1590    119.7313     181.0     9.0
-MIDW       IU       28.2155   -177.3697      17.8     1.0
-MSKU       IU       -1.6557     13.6116     287.0    25.0
-NAI        IU       -1.2739     36.8037    1692.0     0.0
-NWAO       IU      -32.9277    117.2390     370.9     9.1
-OTAV       IU        0.2376    -78.4508    3495.0    15.0
-PAB        IU       39.5446     -4.3499     950.0     0.0
-PAYG       IU       -0.6742    -90.2861     170.0   100.0
-PET        IU       53.0233    158.6499     105.0     5.0
-PMG        IU       -9.4047    147.1597      90.0     0.0
-PMSA       IU      -64.7744    -64.0489      40.0     0.0
-POHA       IU       19.7573   -155.5326    1910.0    80.0
-PTCN       IU      -25.0713   -130.0953     218.0     2.0
-PTGA       IU       -0.7308    -59.9666     141.0    96.0
-QSPA       IU      -89.9289    144.4382    2847.0     3.0
-RAIO       IU       46.0403   -122.8851      11.0     0.0
-RAO        IU      -29.2450   -177.9290      59.5     0.5
-RAR        IU      -21.2125   -159.7733     -72.0   100.0
-RCBR       IU       -5.8274    -35.9014     291.0   109.0
-RSSD       IU       44.1212   -104.0359    2022.7    67.3
-SAML       IU       -8.9489    -63.1831     120.0     0.0
-SBA        IU      -77.8492    166.7572      48.0     2.0
-SDV        IU        8.8839    -70.6340    1588.0    32.0
-SFJD       IU       66.9961    -50.6208     329.0     1.0
-SJG        IU       18.1091    -66.1500     420.0     0.0
-SLBS       IU       23.6858   -109.9443     825.0     0.0
-SNZO       IU      -41.3087    174.7043      20.0   100.0
-SSPA       IU       40.6358    -77.8876     170.0   100.0
-TARA       IU        1.3549    172.9229      19.0     1.0
-TATO       IU       24.9735    121.4971      77.1    82.9
-TBT        IU       28.6794    -17.9145     180.0    40.0
-TEIG       IU       20.2263    -88.2763      15.0    25.0
-TIXI       IU       71.6341    128.8667      40.0     0.0
-TOL        IU       39.8814     -4.0485     480.0     0.0
-TRIS       IU      -37.0681    -12.3152      58.0     2.0
-TRQA       IU      -38.0568    -61.9787     439.0   101.0
-TSUM       IU      -19.2022     17.5838    1260.0     0.0
-TUC        IU       32.3098   -110.7847     909.0     1.0
-ULN        IU       47.8651    107.0532    1610.0     0.0
-WAKE       IU       19.2834    166.6520      19.0     1.0
-WCI        IU       38.2289    -86.2939      78.0   132.0
-WVT        IU       36.1297    -87.8300     170.0     0.0
-XMAS       IU        2.0448   -157.4457      19.0     1.0
-YAK        IU       62.0310    129.6805     110.0    14.0
-YSS        IU       46.9587    142.7604     148.0     2.0
+BEKI       YL       41.31500    34.26300   1415.0     0.0
+BGIO       SR       31.72200    35.08780    752.0   100.0
+LIT        HT       40.10080    22.49000    480.0     0.0
+ZKR        GE       35.11470    26.21700    270.0     0.0
+S001       XS       37.28300    21.71800    156.0     0.0

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file	2012-02-21 00:33:56 UTC (rev 19657)
@@ -33,15 +33,15 @@
 # transversely_isotropic_prem_plus_3D_crust_2.0, 3D_anisotropic, 3D_attenuation,
 # s20rts, s362ani, s362iso, s362wmani, s362ani_prem, s362ani_3DQ, s362iso_3DQ,
 # s29ea, s29ea,sea99_jp3d1994,sea99,jp3d1994,heterogen
-MODEL                           = 1D_transversely_isotropic_prem
+MODEL                           = s362ani
 
 # parameters describing the Earth model
-OCEANS                          = .true.
-ELLIPTICITY                     = .true.
-TOPOGRAPHY                      = .true.
-GRAVITY                         = .true.
-ROTATION                        = .true.
-ATTENUATION                     = .true.
+OCEANS                          = .false.
+ELLIPTICITY                     = .false.
+TOPOGRAPHY                      = .false.
+GRAVITY                         = .false.
+ROTATION                        = .false.
+ATTENUATION                     = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .true.

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/process.sh
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/process.sh	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/process.sh	2012-02-21 00:33:56 UTC (rev 19657)
@@ -32,15 +32,15 @@
 # using default configuration
 cd ../../
 # configures package with ifort compiler
-./configure F90=ifort MPIF90=/usr/local/openmpi-ifort/bin/mpif90 FLAGS_CHECK="-O3 -assume byterecl" FLAGS_NO_CHECK="-O3 -assume byterecl" > tmp.log
+#./configure F90=ifort MPIF90=/usr/local/openmpi-ifort/bin/mpif90 FLAGS_CHECK="-O3 -assume byterecl" FLAGS_NO_CHECK="-O3 -assume byterecl" > tmp.log
 # configures package with gfortran compiler
 #./configure F90=gfortran MPIF90=mpif90 FLAGS_CHECK="-O3" FLAGS_NO_CHECK="-O3"
 
 # compiles for a forward simulation
 cp $currentdir/DATA/Par_file DATA/Par_file
 make clean
-make >& $currentdir/tmp_make_output.log
-make xcombine_vol_data >> $currentdir/tmp_make_output.log
+make #>& $currentdir/tmp_make_output.log
+make xcombine_vol_data #>> $currentdir/tmp_make_output.log
 
 # backup of constants setup
 cp setup/* $currentdir/OUTPUT_FILES/

Deleted: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu	2012-02-21 00:33:56 UTC (rev 19657)
@@ -1,369 +0,0 @@
-/*
- !=====================================================================
- !
- !               S p e c f e m 3 D  V e r s i o n  2 . 0
- !               ---------------------------------------
- !
- !          Main authors: Dimitri Komatitsch and Jeroen Tromp
- !    Princeton University, USA and University of Pau / CNRS / INRIA
- ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
- !                            April 2011
- !
- ! This program is free software; you can redistribute it and/or modify
- ! it under the terms of the GNU General Public License as published by
- ! the Free Software Foundation; either version 2 of the License, or
- ! (at your option) any later version.
- !
- ! This program is distributed in the hope that it will be useful,
- ! but WITHOUT ANY WARRANTY; without even the implied warranty of
- ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- ! GNU General Public License for more details.
- !
- ! You should have received a copy of the GNU General Public License along
- ! with this program; if not, write to the Free Software Foundation, Inc.,
- ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- !
- !=====================================================================
- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <cublas.h>
-
-#include <sys/types.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-
-#include "config.h"
-#include "mesh_constants_cuda.h"
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// acoustic sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void compute_add_sources_acoustic_kernel(realw* potential_dot_dot_acoustic,
-                                                    int* ibool,
-                                                    int* ispec_is_inner,
-                                                    int phase_is_inner,
-                                                    realw* sourcearrays,
-                                                    double* stf_pre_compute,
-                                                    int myrank,
-                                                    int* islice_selected_source,
-                                                    int* ispec_selected_source,
-                                                    int* ispec_is_acoustic,
-                                                    realw* kappastore,
-                                                    int NSOURCES) {
-  int i = threadIdx.x;
-  int j = threadIdx.y;
-  int k = threadIdx.z;
-
-  int isource  = blockIdx.x + gridDim.x*blockIdx.y; // bx
-
-  int ispec;
-  int iglob;
-  realw stf;
-  realw kappal;
-
-  if( isource < NSOURCES ){
-
-    if(myrank == islice_selected_source[isource]) {
-
-      ispec = ispec_selected_source[isource]-1;
-
-      if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) {
-
-        stf = (realw) stf_pre_compute[isource];
-        iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-        kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
-
-        atomicAdd(&potential_dot_dot_acoustic[iglob],
-                  -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal);
-
-        //      potential_dot_dot_acoustic[iglob] +=
-        //                -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal;
-      }
-    }
-  }
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(compute_add_sources_ac_cuda,
-              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
-                                                 int* phase_is_innerf,
-                                                 int* NSOURCESf,
-                                                 int* SIMULATION_TYPEf,
-                                                 double* h_stf_pre_compute,
-                                                 int* myrankf) {
-
-TRACE("compute_add_sources_ac_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  // check if anything to do
-  if( mp->nsources_local == 0 ) return;
-
-  int phase_is_inner = *phase_is_innerf;
-  int NSOURCES = *NSOURCESf;
-  int myrank = *myrankf;
-
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  // copies pre-computed source time factors onto GPU
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
-                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,5);
-
-  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-                                                        mp->d_ibool,
-                                                        mp->d_ispec_is_inner,
-                                                        phase_is_inner,
-                                                        mp->d_sourcearrays,
-                                                        mp->d_stf_pre_compute,
-                                                        myrank,
-                                                        mp->d_islice_selected_source,
-                                                        mp->d_ispec_selected_source,
-                                                        mp->d_ispec_is_acoustic,
-                                                        mp->d_kappastore,
-                                                        NSOURCES);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_ac_cuda");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(compute_add_sources_ac_s3_cuda,
-              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
-                                                      int* phase_is_innerf,
-                                                      int* NSOURCESf,
-                                                      int* SIMULATION_TYPEf,
-                                                      double* h_stf_pre_compute,
-                                                      int* myrankf) {
-
-TRACE("compute_add_sources_ac_s3_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  // check if anything to do
-  if( mp->nsources_local == 0 ) return;
-
-  int phase_is_inner = *phase_is_innerf;
-  int NSOURCES = *NSOURCESf;
-  int myrank = *myrankf;
-
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  // copies source time factors onto GPU
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
-                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,5);
-
-  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
-                                                        mp->d_ibool,
-                                                        mp->d_ispec_is_inner,
-                                                        phase_is_inner,
-                                                        mp->d_sourcearrays,
-                                                        mp->d_stf_pre_compute,
-                                                        myrank,
-                                                        mp->d_islice_selected_source,
-                                                        mp->d_ispec_selected_source,
-                                                        mp->d_ispec_is_acoustic,
-                                                        mp->d_kappastore,
-                                                        NSOURCES);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_ac_s3_cuda");
-#endif
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// acoustic adjoint sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_acoustic,
-                                                      int nrec,
-                                                      realw* adj_sourcearrays,
-                                                      int* ibool,
-                                                      int* ispec_is_inner,
-                                                      int* ispec_is_acoustic,
-                                                      int* ispec_selected_rec,
-                                                      int phase_is_inner,
-                                                      int* pre_computed_irec,
-                                                      int nadj_rec_local,
-                                                      realw* kappastore) {
-
-  int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
-
-  // because of grid shape, irec_local can be too big
-  if(irec_local < nadj_rec_local) {
-
-    int irec = pre_computed_irec[irec_local];
-
-    int ispec = ispec_selected_rec[irec]-1;
-    if( ispec_is_acoustic[ispec] ){
-
-      // checks if element is in phase_is_inner run
-      if(ispec_is_inner[ispec] == phase_is_inner) {
-        int i = threadIdx.x;
-        int j = threadIdx.y;
-        int k = threadIdx.z;
-
-        int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-
-        //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
-
-        //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
-        //                                            pre_computed_irec_local_index[irec],
-        //                                            pre_computed_index,
-        //                                            0,
-        //                                            i,j,k)]/kappal;
-
-        // beware, for acoustic medium, a pressure source would be taking the negative
-        // and divide by Kappa of the fluid;
-        // this would have to be done when constructing the adjoint source.
-        //
-        // note: we take the first component of the adj_sourcearrays
-        //          the idea is to have e.g. a pressure source, where all 3 components would be the same
-        realw stf = adj_sourcearrays[INDEX5(5,5,5,3,i,j,k,0,irec_local)]; // / kappal
-
-        atomicAdd(&potential_dot_dot_acoustic[iglob],stf);
-
-                  //+adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
-                  //                         pre_computed_irec_local_index[irec],pre_computed_index-1,
-                  //                         0,i,j,k)] // / kappal
-                  //                         );
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-extern "C"
-void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
-              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
-                                               realw* h_adj_sourcearrays,
-                                               int* phase_is_inner,
-                                               int* h_ispec_is_inner,
-                                               int* h_ispec_is_acoustic,
-                                               int* h_ispec_selected_rec,
-                                               int* myrank,
-                                               int* nrec,
-                                               int* time_index,
-                                               int* h_islice_selected_rec,
-                                               int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
-
-TRACE("add_sources_ac_sim_2_or_3_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-
-  // checks
-  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
-
-  // make sure grid dimension is less than 65535 in x dimension
-  int num_blocks_x = mp->nadj_rec_local;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y,1);
-  dim3 threads(5,5,5);
-
-  // build slice of adj_sourcearrays because full array is *very* large.
-  // note: this extracts array values for local adjoint sources at given time step "time_index"
-  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
-  int ispec,i,j,k;
-  int irec_local = 0;
-  for(int irec = 0; irec < *nrec; irec++) {
-    if(*myrank == h_islice_selected_rec[irec]) {
-      irec_local++;
-
-      // takes only acoustic sources
-      ispec = h_ispec_selected_rec[irec]-1;
-      if( h_ispec_is_acoustic[ispec] ){
-
-        if( h_ispec_is_inner[ispec] == *phase_is_inner) {
-          for(k=0;k<5;k++) {
-            for(j=0;j<5;j++) {
-              for(i=0;i<5;i++) {
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,irec_local-1)]
-                  = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
-                                            *NTSTEP_BETWEEN_READ_ADJSRC,
-                                            3,5,5,
-                                            irec_local-1,(*time_index)-1,
-                                            0,i,j,k)];
-
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,irec_local-1)]
-                  = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
-                                            *NTSTEP_BETWEEN_READ_ADJSRC,
-                                            3,5,5,
-                                            irec_local-1,(*time_index)-1,
-                                            1,i,j,k)];
-
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,irec_local-1)]
-                  = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
-                                            *NTSTEP_BETWEEN_READ_ADJSRC,
-                                            3,5,5,
-                                            irec_local-1,(*time_index)-1,
-                                            2,i,j,k)];
-              }
-            }
-          }
-        } // phase_is_inner
-      } // h_ispec_is_acoustic
-    }
-  }
-  // check all local sources were added
-  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
-
-  // copies extracted array values onto GPU
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
-                              (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
-
-  // launches cuda kernel for acoustic adjoint sources
-  add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-                                                          *nrec,
-                                                          mp->d_adj_sourcearrays,
-                                                          mp->d_ibool,
-                                                          mp->d_ispec_is_inner,
-                                                          mp->d_ispec_is_acoustic,
-                                                          mp->d_ispec_selected_rec,
-                                                          *phase_is_inner,
-                                                          mp->d_pre_computed_irec,
-                                                          mp->nadj_rec_local,
-                                                          mp->d_kappastore);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");
-#endif
-}

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2012-02-21 00:33:56 UTC (rev 19657)
@@ -47,23 +47,20 @@
 
 __global__ void compute_add_sources_kernel(realw* accel,
                                            int* ibool,
-                                           int* ispec_is_inner,
-                                           int phase_is_inner,
                                            realw* sourcearrays,
                                            double* stf_pre_compute,
                                            int myrank,
                                            int* islice_selected_source,
                                            int* ispec_selected_source,
-                                           int* ispec_is_elastic,
                                            int NSOURCES) {
+  int ispec,iglob;
+  realw stf;
+
   int i = threadIdx.x;
   int j = threadIdx.y;
   int k = threadIdx.z;
 
   int isource  = blockIdx.x + gridDim.x*blockIdx.y; // bx
-  int ispec;
-  int iglob;
-  realw stf;
 
   if(isource < NSOURCES) { // when NSOURCES > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
 
@@ -71,21 +68,19 @@
 
       ispec = ispec_selected_source[isource]-1;
 
-      if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) {
+      stf = (realw) stf_pre_compute[isource];
+      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
 
-        stf = (realw) stf_pre_compute[isource];
-        iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-
-        atomicAdd(&accel[iglob*3],
-                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf);
-        atomicAdd(&accel[iglob*3+1],
-                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 1, i,j,k)]*stf);
-        atomicAdd(&accel[iglob*3+2],
-                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 2, i,j,k)]*stf);
-      }
+      // note: for global version, sourcearrays has dimensions 
+      //            sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES)
+      atomicAdd(&accel[iglob*3],
+                sourcearrays[INDEX5(3,5,5,5, 0,i,j,k,isource)]*stf);
+      atomicAdd(&accel[iglob*3+1],
+                sourcearrays[INDEX5(3,5,5,5, 1,i,j,k,isource)]*stf);
+      atomicAdd(&accel[iglob*3+2],
+                sourcearrays[INDEX5(3,5,5,5, 2,i,j,k,isource)]*stf);
     }
   }
-
 }
 
 
@@ -94,21 +89,17 @@
 extern "C"
 void FC_FUNC_(compute_add_sources_el_cuda,
               COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
-                                            int* phase_is_innerf,
-                                            int* NSOURCESf,
-                                            double* h_stf_pre_compute,
-                                            int* myrankf) {
+                                           int* NSOURCESf,
+                                           double* h_stf_pre_compute) {
 
-TRACE("compute_add_sources_el_cuda");
+  TRACE("compute_add_sources_el_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  // check if anything to do
+  // checks if anything to do
   if( mp->nsources_local == 0 ) return;
 
-  int phase_is_inner = *phase_is_innerf;
   int NSOURCES = *NSOURCESf;
-  int myrank = *myrankf;
 
   int num_blocks_x = NSOURCES;
   int num_blocks_y = 1;
@@ -116,24 +107,21 @@
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5);
 
-  //double* d_stf_pre_compute;
+  // copies source time function buffer values to GPU
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
                                      NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
 
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,5);
-
-  compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel,
-                                               mp->d_ibool,
-                                               mp->d_ispec_is_inner,
-                                               phase_is_inner,
+  // adds source contributions
+  compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
+                                               mp->d_ibool_crust_mantle,
                                                mp->d_sourcearrays,
                                                mp->d_stf_pre_compute,
-                                               myrank,
+                                               mp->myrank,
                                                mp->d_islice_selected_source,
                                                mp->d_ispec_selected_source,
-                                               mp->d_ispec_is_elastic,
                                                NSOURCES);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -145,42 +133,38 @@
 
 extern "C"
 void FC_FUNC_(compute_add_sources_el_s3_cuda,
-              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
-                                              double* h_stf_pre_compute,
+              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer_f,
                                               int* NSOURCESf,
-                                              int* phase_is_inner,
-                                              int* myrank) {
+                                              double* h_stf_pre_compute) {
   TRACE("compute_add_sources_el_s3_cuda");
-  // EPIK_TRACER("compute_add_sources_el_s3_cuda");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
+  // checks if anything to do
+  if( mp->nsources_local == 0 ) return;
+
   int NSOURCES = *NSOURCESf;
 
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
-                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
-#endif
-
   int num_blocks_x = NSOURCES;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
-
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(5,5,5);
 
-  compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel,mp->d_ibool,
-                                               mp->d_ispec_is_inner, *phase_is_inner,
+  // copies source time function buffer values to GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),19);
+  
+  compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+                                               mp->d_ibool_crust_mantle,
                                                mp->d_sourcearrays,
                                                mp->d_stf_pre_compute,
-                                               *myrank,
-                                               mp->d_islice_selected_source,mp->d_ispec_selected_source,
-                                               mp->d_ispec_is_elastic,
+                                               mp->myrank,
+                                               mp->d_islice_selected_source,
+                                               mp->d_ispec_selected_source,
                                                NSOURCES);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -219,7 +203,6 @@
 extern "C"
 void FC_FUNC_(add_source_master_rec_noise_cu,
               ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
-                                                int* myrank_f,
                                                 int* it_f,
                                                 int* irec_master_noise_f,
                                                 int* islice_selected_rec) {
@@ -230,12 +213,11 @@
 
   int it = *it_f-1; // -1 for Fortran -> C indexing differences
   int irec_master_noise = *irec_master_noise_f;
-  int myrank = *myrank_f;
 
   dim3 grid(1,1,1);
   dim3 threads(NGLL3,1,1);
 
-  if(myrank == islice_selected_rec[irec_master_noise-1]) {
+  if(mp->myrank == islice_selected_rec[irec_master_noise-1]) {
     add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool,
                                                               mp->d_ispec_selected_rec,
                                                               irec_master_noise,
@@ -259,47 +241,35 @@
                                                      int nrec,
                                                      realw* adj_sourcearrays,
                                                      int* ibool,
-                                                     int* ispec_is_inner,
-                                                     int* ispec_is_elastic,
                                                      int* ispec_selected_rec,
-                                                     int phase_is_inner,
                                                      int* pre_computed_irec,
                                                      int nadj_rec_local) {
 
+  int ispec,iglob;
+  int irec,i,j,k;
+  
   int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
 
   if(irec_local < nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
 
-    int irec = pre_computed_irec[irec_local];
+    irec = pre_computed_irec[irec_local];
+    ispec = ispec_selected_rec[irec]-1;
 
-    int ispec = ispec_selected_rec[irec]-1;
-    if( ispec_is_elastic[ispec] ){
+    i = threadIdx.x;
+    j = threadIdx.y;
+    k = threadIdx.z;
+    iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
 
-      if(ispec_is_inner[ispec] == phase_is_inner) {
-        int i = threadIdx.x;
-        int j = threadIdx.y;
-        int k = threadIdx.z;
-        int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+    // atomic operations are absolutely necessary for correctness!
+    atomicAdd(&accel[3*iglob],adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
+                                                      0,i,j,k,irec_local)]);
 
-        // atomic operations are absolutely necessary for correctness!
-        atomicAdd(&accel[3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
-                                                              i,j,k,
-                                                              0,
-                                                              irec_local)]);
+    atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
+                                                      1,i,j,k,irec_local)]);
 
-        atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(5,5,5,3,
-                                                             i,j,k,
-                                                             1,
-                                                             irec_local)]);
-
-        atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
-                                                            i,j,k,
-                                                            2,
-                                                            irec_local)]);
-      }
-    } // ispec_is_elastic
+    atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
+                                                      2,i,j,k,irec_local)]);
   }
-
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -307,24 +277,18 @@
 extern "C"
 void FC_FUNC_(add_sources_el_sim_type_2_or_3,
               ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
-                                               realw* h_adj_sourcearrays,
-                                               int* phase_is_inner,
-                                               int* h_ispec_is_inner,
-                                               int* h_ispec_is_elastic,
-                                               int* h_ispec_selected_rec,
-                                               int* myrank,
-                                               int* nrec,
-                                               int* time_index,
-                                               int* h_islice_selected_rec,
-                                               int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
+                                              int* nrec,
+                                              realw* h_adj_sourcearrays,
+                                              int* h_islice_selected_rec,
+                                              int* h_ispec_selected_rec,
+                                              int* time_index) {
 
-TRACE("add_sources_el_sim_type_2_or_3");
+  TRACE("add_sources_el_sim_type_2_or_3");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  // checks
-  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n");
+  // check if anything to do
+  if( mp->nadj_rec_local == 0 ) return;
 
   // make sure grid dimension is less than 65535 in x dimension
   int num_blocks_x = mp->nadj_rec_local;
@@ -333,62 +297,42 @@
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
-
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(5,5,5);
 
   // build slice of adj_sourcearrays because full array is *very* large.
   // note: this extracts array values for local adjoint sources at given time step "time_index"
   //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
-  int ispec,i,j,k;
+  int i,j,k;
   int irec_local = 0;
   for(int irec = 0; irec < *nrec; irec++) {
-    if(*myrank == h_islice_selected_rec[irec]) {
+    if(mp->myrank == h_islice_selected_rec[irec]) {
       irec_local++;
 
-      // takes only elastic sources
-      ispec = h_ispec_selected_rec[irec]-1;
-      if( h_ispec_is_elastic[ispec] ){
+      // takes only local sources
+      for(k=0;k<NGLLX;k++) {
+        for(j=0;j<NGLLX;j++) {
+          for(i=0;i<NGLLX;i++) {
 
-        if( h_ispec_is_inner[ispec] == *phase_is_inner) {
-          for(k=0;k<5;k++) {
-            for(j=0;j<5;j++) {
-              for(i=0;i<5;i++) {
+            // note: global version uses dimensions
+            //          h_adj_sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC)
+            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
+                                            0,i,j,k,irec_local-1)]
+                    = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
+                                                0,i,j,k,irec_local-1,*time_index-1)];
 
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                i,j,k,0,
-                                                irec_local-1)]
-                        = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                                    *NTSTEP_BETWEEN_READ_ADJSRC,
-                                                    3,5,5,
-                                                    irec_local-1,
-                                                    *time_index-1,
-                                                    0,i,j,k)];
+            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
+                                            1,i,j,k,irec_local-1)]
+                    = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
+                                                1,i,j,k,irec_local-1,*time_index-1)];
 
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                i,j,k,1,
-                                                irec_local-1)]
-                        = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                                    *NTSTEP_BETWEEN_READ_ADJSRC,
-                                                    3,5,5,
-                                                    irec_local-1,
-                                                    *time_index-1,
-                                                    1,i,j,k)];
-
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                i,j,k,2,
-                                                irec_local-1)]
-                        = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                                    *NTSTEP_BETWEEN_READ_ADJSRC,
-                                                    3,5,5,
-                                                    irec_local-1,
-                                                    *time_index-1,
-                                                    2,i,j,k)];
-              }
-            }
+            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
+                                            2,i,j,k,irec_local-1)]
+                    = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
+                                                2,i,j,k,irec_local-1,*time_index-1)];
           }
-        } // phase_is_inner
-      } // h_ispec_is_elastic
+        }
+      }
     }
   }
   // check all local sources were added
@@ -402,15 +346,11 @@
   // the irec_local variable needs to be precomputed (as
   // h_pre_comp..), because normally it is in the loop updating accel,
   // and due to how it's incremented, it cannot be parallelized
-
-  add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel,
+  add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
                                                          *nrec,
                                                          mp->d_adj_sourcearrays,
-                                                         mp->d_ibool,
-                                                         mp->d_ispec_is_inner,
-                                                         mp->d_ispec_is_elastic,
+                                                         mp->d_ibool_crust_mantle,
                                                          mp->d_ispec_selected_rec,
-                                                         *phase_is_inner,
                                                          mp->d_pre_computed_irec,
                                                          mp->nadj_rec_local);
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2012-02-21 00:33:56 UTC (rev 19657)
@@ -41,29 +41,27 @@
 
 __global__ void compute_stacey_acoustic_kernel(realw* potential_dot_acoustic,
                                                realw* potential_dot_dot_acoustic,
+                                               int interface_type,
+                                               int num_abs_boundary_faces,
                                                int* abs_boundary_ispec,
-                                               int* abs_boundary_ijk,
-                                               realw* abs_boundary_jacobian2Dw,
+                                               int* nkmin_xi, int* nkmin_eta,
+                                               int* njmin, int* njmax,
+                                               int* nimin, int* nimax,                                               
+                                               realw* abs_boundary_jacobian2D,
+                                               realw* wgllwgll,
                                                int* ibool,
-                                               realw* rhostore,
-                                               realw* kappastore,
-                                               int* ispec_is_inner,
-                                               int* ispec_is_acoustic,
-                                               int phase_is_inner,
-                                               int SIMULATION_TYPE, int SAVE_FORWARD,
-                                               int num_abs_boundary_faces,
-                                               realw* b_potential_dot_acoustic,
+                                               realw* vpstore,
+                                               int SIMULATION_TYPE, 
+                                               int SAVE_FORWARD,
                                                realw* b_potential_dot_dot_acoustic,
-                                               realw* b_absorb_potential,
-                                               int gravity) {
+                                               realw* b_absorb_potential) {
 
   int igll = threadIdx.x;
   int iface = blockIdx.x + gridDim.x*blockIdx.y;
 
   int i,j,k,iglob,ispec;
-  realw rhol,kappal,cpl;
-  realw jacobianw;
-  realw vel;
+  realw sn;
+  realw jacobianw,fac1;
 
   // don't compute points outside NGLLSQUARE==NGLL2==25
   // way 2: no further check needed since blocksize = 25
@@ -74,46 +72,99 @@
     // "-1" from index values to convert from Fortran-> C indexing
     ispec = abs_boundary_ispec[iface]-1;
 
-    if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) {
+    // determines indices i,j,k depending on absorbing boundary type
+    switch( interface_type ){
+      case 4:
+        // xmin
+        if( nkmin_xi[INDEX2(2,0,iface)] == 0 || njmin[INDEX2(2,0,iface)] == 0 ) return;
+        
+        i = 0; // index -1
+        k = (igll/NGLLX);
+        j = (igll-k*NGLLX);
+        
+        if( k < nkmin_xi[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
+        if( j < njmin[INDEX2(2,0,iface)]-1 || j > njmax[INDEX2(2,0,iface)]-1 ) return;
+        
+        fac1 = wgllwgll[k*NGLLX+j];
+        break;
+        
+      case 5:
+        // xmax
+        if( nkmin_xi[INDEX2(2,1,iface)] == 0 || njmin[INDEX2(2,1,iface)] == 0 ) return;
+        
+        i = NGLLX-1;
+        k = (igll/NGLLX);
+        j = (igll-k*NGLLX);
+        
+        if( k < nkmin_xi[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
+        if( j < njmin[INDEX2(2,1,iface)]-1 || j > njmax[INDEX2(2,1,iface)]-1 ) return;
+        
+        fac1 = wgllwgll[k*NGLLX+j];
+        break;
+        
+      case 6:
+        // ymin
+        if( nkmin_eta[INDEX2(2,0,iface)] == 0 || nimin[INDEX2(2,0,iface)] == 0 ) return;
+        
+        j = 0;
+        k = (igll/NGLLX);        
+        i = (igll-k*NGLLX);
+        
+        if( k < nkmin_eta[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
+        if( i < nimin[INDEX2(2,0,iface)]-1 || i > nimax[INDEX2(2,0,iface)]-1 ) return;
+        
+        fac1 = wgllwgll[k*NGLLX+i];
+        break;
+        
+      case 7:
+        // ymax
+        if( nkmin_eta[INDEX2(2,1,iface)] == 0 || nimin[INDEX2(2,1,iface)] == 0 ) return;
+        
+        j = NGLLX-1;
+        k = (igll/NGLLX);        
+        i = (igll-k*NGLLX);
+        
+        if( k < nkmin_eta[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
+        if( i < nimin[INDEX2(2,1,iface)]-1 || i > nimax[INDEX2(2,1,iface)]-1 ) return;
+        
+        fac1 = wgllwgll[k*NGLLX+i];
+        break;
 
-      i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
-      j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
-      k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
-      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+      case 8:
+        // zmin        
+        k = 0;
+        j = (igll/NGLLX);        
+        i = (igll-j*NGLLX);
+        
+        if( j < 0 || j > NGLLX-1 ) return;
+        if( i < 0 || i > NGLLX-1 ) return;
+        
+        fac1 = wgllwgll[j*NGLLX+i];
+        break;
+        
+    }
+    
+    iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
 
-      // determines bulk sound speed
-      rhol = rhostore[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+    // determines bulk sound speed
+    // velocity
+    sn = potential_dot_acoustic[iglob] / vpstore[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)] ;
 
-      kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+    // gets associated, weighted jacobian
+    jacobianw = abs_boundary_jacobian2D[INDEX2(NGLL2,igll,iface)]*fac1;
 
-      cpl = sqrt( kappal / rhol );
+    // Sommerfeld condition
+    atomicAdd(&potential_dot_dot_acoustic[iglob],-sn*jacobianw);
 
-      // velocity
-      if( gravity ){
-        // daniel: TODO - check gravity and stacey condition here...
-        // uses a potential definition of: s = grad(chi)
-        vel = potential_dot_acoustic[iglob] / rhol ;
-      }else{
-        // uses a potential definition of: s = 1/rho grad(chi)
-        vel = potential_dot_acoustic[iglob] / rhol;
-      }
-
-      // gets associated, weighted jacobian
-      jacobianw = abs_boundary_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
-
+    // adjoint simulations
+    if( SIMULATION_TYPE == 3 ){
       // Sommerfeld condition
-      atomicAdd(&potential_dot_dot_acoustic[iglob],-vel*jacobianw/cpl);
+      atomicAdd(&b_potential_dot_dot_acoustic[iglob],-b_absorb_potential[INDEX2(NGLL2,igll,iface)]);
+    }else if( SIMULATION_TYPE == 1 && SAVE_FORWARD ){
+      // saves boundary values
+      b_absorb_potential[INDEX2(NGLL2,igll,iface)] = sn*jacobianw;
+    }
 
-      // adjoint simulations
-      if( SIMULATION_TYPE == 3 ){
-        // Sommerfeld condition
-        atomicAdd(&b_potential_dot_dot_acoustic[iglob],-b_absorb_potential[INDEX2(NGLL2,igll,iface)]);
-      }else if( SIMULATION_TYPE == 1 && SAVE_FORWARD ){
-        // saves boundary values
-        b_absorb_potential[INDEX2(NGLL2,igll,iface)] = vel*jacobianw/cpl;
-      }
-    }
-//  }
   }
 }
 
@@ -121,20 +172,76 @@
 
 extern "C"
 void FC_FUNC_(compute_stacey_acoustic_cuda,
-              COMPUTE_STACEY_ACOUSTIC_CUDA)(
-                                    long* Mesh_pointer_f,
-                                    int* phase_is_innerf,
-                                    int* SIMULATION_TYPEf,
-                                    int* SAVE_FORWARDf,
-                                    realw* h_b_absorb_potential) {
+              COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+                                            realw* absorb_potential,
+                                            int* itype) {
 TRACE("compute_stacey_acoustic_cuda");
   //double start_time = get_time();
 
+  int num_abs_boundary_faces;
+  int* d_abs_boundary_ispec;
+  realw* d_abs_boundary_jacobian2D;
+  realw* d_wgllwgll;
+  realw* d_b_absorb_potential;
+  
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  int phase_is_inner          = *phase_is_innerf;
-  int SIMULATION_TYPE         = *SIMULATION_TYPEf;
-  int SAVE_FORWARD            = *SAVE_FORWARDf;
+  
+  // absorbing boundary type
+  int interface_type = *itype;
+  switch( interface_type ){
+    case 4:
+      // xmin
+      num_abs_boundary_faces = mp->nspec2D_xmin_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_xmin_outer_core;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmin_outer_core;
+      d_b_absorb_potential = mp->d_absorb_xmin_outer_core;
+      d_wgllwgll = mp->d_wgllwgll_yz;
+      break;
+      
+    case 5:
+      // xmax
+      num_abs_boundary_faces = mp->nspec2D_xmax_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_xmax_outer_core;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmax_outer_core;
+      d_b_absorb_potential = mp->d_absorb_xmax_outer_core;
+      d_wgllwgll = mp->d_wgllwgll_yz;
+      break;
+      
+    case 6:
+      // ymin
+      num_abs_boundary_faces = mp->nspec2D_ymin_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_ymin_outer_core;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymin_outer_core;
+      d_b_absorb_potential = mp->d_absorb_ymin_outer_core; 
+      d_wgllwgll = mp->d_wgllwgll_xz;
+      break;
+      
+    case 7:
+      // ymax
+      num_abs_boundary_faces = mp->nspec2D_ymax_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_ymax_outer_core;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymax_outer_core;
+      d_b_absorb_potential = mp->d_absorb_ymax_outer_core; 
+      d_wgllwgll = mp->d_wgllwgll_xz;
+      break;
 
+    case 8:
+      // zmin
+      num_abs_boundary_faces = mp->nspec2D_zmin_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_zmin_outer_core;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_zmin_outer_core;
+      d_b_absorb_potential = mp->d_absorb_zmin_outer_core; 
+      d_wgllwgll = mp->d_wgllwgll_xy;
+      break;
+      
+    default:
+      exit_on_cuda_error("compute_stacey_acoustic_cuda: unknown interface type");
+      break;
+  }
+
+  // checks if anything to do  
+  if( num_abs_boundary_faces == 0 ) return;
+  
   // way 1: Elapsed time: 4.385948e-03
   // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
   //  int blocksize = 32;
@@ -143,46 +250,47 @@
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
   int blocksize = NGLL2;
 
-  int num_blocks_x = mp->d_num_abs_boundary_faces;
+  int num_blocks_x = num_abs_boundary_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
-
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  //  adjoint simulations: reads in absorbing boundary
-  if (SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0 ){
+  //  adjoint simulations: needs absorbing boundary buffer
+  if (mp->simulation_type == 3 && num_abs_boundary_faces > 0 ){
     // copies array to GPU
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,h_b_absorb_potential,
-                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),7700);
+    print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_potential,absorb_potential,
+                            NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
   }
 
-  compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_acoustic,
-                                                   mp->d_potential_dot_dot_acoustic,
-                                                   mp->d_abs_boundary_ispec,
-                                                   mp->d_abs_boundary_ijk,
-                                                   mp->d_abs_boundary_jacobian2Dw,
-                                                   mp->d_ibool,
-                                                   mp->d_rhostore,
-                                                   mp->d_kappastore,
-                                                   mp->d_ispec_is_inner,
-                                                   mp->d_ispec_is_acoustic,
-                                                   phase_is_inner,
-                                                   SIMULATION_TYPE,SAVE_FORWARD,
-                                                   mp->d_num_abs_boundary_faces,
-                                                   mp->d_b_potential_dot_acoustic,
-                                                   mp->d_b_potential_dot_dot_acoustic,
-                                                   mp->d_b_absorb_potential,
-                                                   mp->gravity);
+  compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_veloc_outer_core,
+                                                   mp->d_accel_outer_core,
+                                                   interface_type,
+                                                   num_abs_boundary_faces,
+                                                   d_abs_boundary_ispec,
+                                                   mp->d_nkmin_xi_outer_core,
+                                                   mp->d_nkmin_eta_outer_core,                                                  
+                                                   mp->d_njmin_outer_core,
+                                                   mp->d_njmax_outer_core,
+                                                   mp->d_nimin_outer_core,
+                                                   mp->d_nimax_outer_core,                                                  
+                                                   d_abs_boundary_jacobian2D,
+                                                   d_wgllwgll,
+                                                   mp->d_ibool_outer_core,
+                                                   mp->d_vp_outer_core,
+                                                   mp->simulation_type,
+                                                   mp->save_forward,
+                                                   mp->d_b_accel_outer_core,
+                                                   d_b_absorb_potential);
 
   //  adjoint simulations: stores absorbed wavefield part
-  if (SIMULATION_TYPE == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ){
+  if (mp->simulation_type == 1 && mp->save_forward && num_abs_boundary_faces > 0 ){
     // copies array to CPU
-    print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_potential,mp->d_b_absorb_potential,
-                                       mp->d_b_reclen_potential,cudaMemcpyDeviceToHost),7701);
+    print_CUDA_error_if_any(cudaMemcpy(absorb_potential,d_b_absorb_potential,
+                            NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyDeviceToHost),7701);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2012-02-21 00:33:56 UTC (rev 19657)
@@ -41,19 +41,20 @@
 
 __global__ void compute_stacey_elastic_kernel(realw* veloc,
                                               realw* accel,
+                                              int interface_type,
+                                              int num_abs_boundary_faces,
                                               int* abs_boundary_ispec,
-                                              int* abs_boundary_ijk,
+                                              int* nkmin_xi, int* nkmin_eta,
+                                              int* njmin, int* njmax,
+                                              int* nimin, int* nimax,
                                               realw* abs_boundary_normal,
-                                              realw* abs_boundary_jacobian2Dw,
+                                              realw* abs_boundary_jacobian2D,
+                                              realw* wgllwgll,
                                               int* ibool,
                                               realw* rho_vp,
                                               realw* rho_vs,
-                                              int* ispec_is_inner,
-                                              int* ispec_is_elastic,
-                                              int phase_is_inner,
                                               int SIMULATION_TYPE,
                                               int SAVE_FORWARD,
-                                              int num_abs_boundary_faces,
                                               realw* b_accel,
                                               realw* b_absorb_field) {
 
@@ -66,63 +67,117 @@
   realw rho_vp_temp,rho_vs_temp;
   realw tx,ty,tz;
   realw jacobianw;
+  realw fac1;
 
-  // don't compute points outside NGLLSQUARE==NGLL2==25
-  // way 2: no further check needed since blocksize = 25
+  // don't compute surface faces outside of range
+  // and don't compute points outside NGLLSQUARE==NGLL2==25
+  //if(igll < NGLL2 && iface < num_abs_boundary_faces) {
+
+  // way 2: only check face, no further check needed since blocksize = 25  
   if( iface < num_abs_boundary_faces){
 
-  //if(igll < NGLL2 && iface < num_abs_boundary_faces) {
-
     // "-1" from index values to convert from Fortran-> C indexing
     ispec = abs_boundary_ispec[iface]-1;
+    
+    // determines indices i,j,k depending on absorbing boundary type
+    switch( interface_type ){
+      case 0:
+        // xmin
+        if( nkmin_xi[INDEX2(2,0,iface)] == 0 || njmin[INDEX2(2,0,iface)] == 0 ) return;
+        
+        i = 0; // index -1
+        k = (igll/NGLLX);
+        j = (igll-k*NGLLX);
+        
+        if( k < nkmin_xi[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
+        if( j < njmin[INDEX2(2,0,iface)]-1 || j > NGLLX-1 ) return;
+        
+        fac1 = wgllwgll[k*NGLLX+j];
+        break;
+        
+      case 1:
+        // xmax
+        if( nkmin_xi[INDEX2(2,1,iface)] == 0 || njmin[INDEX2(2,1,iface)] == 0 ) return;
+        
+        i = NGLLX-1;
+        k = (igll/NGLLX);
+        j = (igll-k*NGLLX);
+        
+        if( k < nkmin_xi[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
+        if( j < njmin[INDEX2(2,1,iface)]-1 || j > njmax[INDEX2(2,1,iface)]-1 ) return;
 
-    if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) {
+        fac1 = wgllwgll[k*NGLLX+j];
+        break;
+        
+      case 2:
+        // ymin
+        if( nkmin_eta[INDEX2(2,0,iface)] == 0 || nimin[INDEX2(2,0,iface)] == 0 ) return;
+        
+        j = 0;
+        k = (igll/NGLLX);        
+        i = (igll-k*NGLLX);
+        
+        if( k < nkmin_eta[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
+        if( i < nimin[INDEX2(2,0,iface)]-1 || i > nimax[INDEX2(2,0,iface)]-1 ) return;
+        
+        fac1 = wgllwgll[k*NGLLX+i];
+        break;
 
-      i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
-      j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
-      k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
-      iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
+      case 3:
+        // ymax
+        if( nkmin_eta[INDEX2(2,1,iface)] == 0 || nimin[INDEX2(2,1,iface)] == 0 ) return;
+        
+        j = NGLLX-1;
+        k = (igll/NGLLX);        
+        i = (igll-k*NGLLX);
+        
+        if( k < nkmin_eta[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
+        if( i < nimin[INDEX2(2,1,iface)]-1 || i > nimax[INDEX2(2,1,iface)]-1 ) return;
+        
+        fac1 = wgllwgll[k*NGLLX+i];
+        break;
+    }
+    
+    iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
 
-      // gets associated velocity
+    // gets associated velocity
+    vx = veloc[iglob*3+0];
+    vy = veloc[iglob*3+1];
+    vz = veloc[iglob*3+2];
 
-      vx = veloc[iglob*3+0];
-      vy = veloc[iglob*3+1];
-      vz = veloc[iglob*3+2];
+    // gets associated normal
+    nx = abs_boundary_normal[INDEX3(NDIM,NGLL2,0,igll,iface)];
+    ny = abs_boundary_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
+    nz = abs_boundary_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];
 
-      // gets associated normal
-      nx = abs_boundary_normal[INDEX3(NDIM,NGLL2,0,igll,iface)];
-      ny = abs_boundary_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
-      nz = abs_boundary_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];
+    // // velocity component in normal direction (normal points out of element)
+    vn = vx*nx + vy*ny + vz*nz;
 
-      // // velocity component in normal direction (normal points out of element)
-      vn = vx*nx + vy*ny + vz*nz;
+    rho_vp_temp = rho_vp[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+    rho_vs_temp = rho_vs[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
 
-      rho_vp_temp = rho_vp[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
-      rho_vs_temp = rho_vs[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+    tx = rho_vp_temp*vn*nx + rho_vs_temp*(vx-vn*nx);
+    ty = rho_vp_temp*vn*ny + rho_vs_temp*(vy-vn*ny);
+    tz = rho_vp_temp*vn*nz + rho_vs_temp*(vz-vn*nz);
 
-      tx = rho_vp_temp*vn*nx + rho_vs_temp*(vx-vn*nx);
-      ty = rho_vp_temp*vn*ny + rho_vs_temp*(vy-vn*ny);
-      tz = rho_vp_temp*vn*nz + rho_vs_temp*(vz-vn*nz);
+    jacobianw = abs_boundary_jacobian2D[INDEX2(NGLL2,igll,iface)]*fac1;
 
-      jacobianw = abs_boundary_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
+    atomicAdd(&accel[iglob*3],-tx*jacobianw);
+    atomicAdd(&accel[iglob*3+1],-ty*jacobianw);
+    atomicAdd(&accel[iglob*3+2],-tz*jacobianw);
 
-      atomicAdd(&accel[iglob*3],-tx*jacobianw);
-      atomicAdd(&accel[iglob*3+1],-ty*jacobianw);
-      atomicAdd(&accel[iglob*3+2],-tz*jacobianw);
-
-      if(SIMULATION_TYPE == 3) {
-        atomicAdd(&b_accel[iglob*3  ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
-        atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
-        atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
-      }
-      else if(SAVE_FORWARD && SIMULATION_TYPE == 1) {
-        b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)] = tx*jacobianw;
-        b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)] = ty*jacobianw;
-        b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)] = tz*jacobianw;
-      } // SIMULATION_TYPE
+    if(SIMULATION_TYPE == 3) {
+      atomicAdd(&b_accel[iglob*3  ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
+      atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
+      atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
     }
+    else if(SAVE_FORWARD && SIMULATION_TYPE == 1) {
+      b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)] = tx*jacobianw;
+      b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)] = ty*jacobianw;
+      b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)] = tz*jacobianw;
+    } // SIMULATION_TYPE
+    
   } // num_abs_boundary_faces
-
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -131,22 +186,71 @@
 extern "C"
 void FC_FUNC_(compute_stacey_elastic_cuda,
               COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
-                                           int* phase_is_innerf,
-                                           int* SIMULATION_TYPEf,
-                                           int* SAVE_FORWARDf,
-                                           realw* h_b_absorb_field) {
+                                                realw* absorb_field,
+                                                int* itype) {
 
 TRACE("compute_stacey_elastic_cuda");
 
+  int num_abs_boundary_faces;
+  int* d_abs_boundary_ispec;
+  realw* d_abs_boundary_normal;
+  realw* d_abs_boundary_jacobian2D;
+  realw* d_wgllwgll;
+  realw* d_b_absorb_field;
+  
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  // check
-  if( mp->d_num_abs_boundary_faces == 0 ) return;
+  // absorbing boundary type
+  int interface_type = *itype;
+  switch( interface_type ){
+    case 0:
+      // xmin
+      num_abs_boundary_faces = mp->nspec2D_xmin_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_xmin_crust_mantle;
+      d_abs_boundary_normal = mp->d_normal_xmin_crust_mantle;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmin_crust_mantle;
+      d_b_absorb_field = mp->d_absorb_xmin_crust_mantle;
+      d_wgllwgll = mp->d_wgllwgll_yz;
+      break;
+      
+    case 1:
+      // xmax
+      num_abs_boundary_faces = mp->nspec2D_xmax_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_xmax_crust_mantle;
+      d_abs_boundary_normal = mp->d_normal_xmax_crust_mantle;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmax_crust_mantle;
+      d_b_absorb_field = mp->d_absorb_xmax_crust_mantle;
+      d_wgllwgll = mp->d_wgllwgll_yz;
+      break;
+      
+    case 2:
+      // ymin
+      num_abs_boundary_faces = mp->nspec2D_ymin_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_ymin_crust_mantle;
+      d_abs_boundary_normal = mp->d_normal_ymin_crust_mantle;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymin_crust_mantle;
+      d_b_absorb_field = mp->d_absorb_ymin_crust_mantle; 
+      d_wgllwgll = mp->d_wgllwgll_xz;
+      break;
+      
+    case 3:
+      // ymax
+      num_abs_boundary_faces = mp->nspec2D_ymax_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_ymax_crust_mantle;
+      d_abs_boundary_normal = mp->d_normal_ymax_crust_mantle;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymax_crust_mantle;
+      d_b_absorb_field = mp->d_absorb_ymax_crust_mantle; 
+      d_wgllwgll = mp->d_wgllwgll_xz;
+      break;
+      
+    default:
+      exit_on_cuda_error("compute_stacey_elastic_cuda: unknown interface type");
+      break;
+  }
+  
+  // checks if anything to do  
+  if( num_abs_boundary_faces == 0 ) return;
 
-  int phase_is_inner    = *phase_is_innerf;
-  int SIMULATION_TYPE   = *SIMULATION_TYPEf;
-  int SAVE_FORWARD      = *SAVE_FORWARDf;
-
   // way 1
   // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
   //int blocksize = 32;
@@ -154,61 +258,56 @@
   // way 2: seems sligthly faster
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
   int blocksize = NGLL2;
-
-  int num_blocks_x = mp->d_num_abs_boundary_faces;
+  
+  int num_blocks_x = num_abs_boundary_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
-
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  if(SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0) {
-    // The read is done in fortran
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,h_b_absorb_field,
-                                       mp->d_b_reclen_field,cudaMemcpyHostToDevice),7700);
+  // adjoint simulations: needs absorbing boundary buffer
+  if(mp->simulation_type == 3 && num_abs_boundary_faces > 0) {
+    // copies array to GPU  
+    print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_field,absorb_field,
+                            NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
   }
 
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("between cudamemcpy and compute_stacey_elastic_kernel");
-#endif
+  // absorbing boundary contributions
+  compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc_crust_mantle,
+                                                  mp->d_accel_crust_mantle,
+                                                  interface_type,
+                                                  num_abs_boundary_faces,                                                  
+                                                  d_abs_boundary_ispec,
+                                                  mp->d_nkmin_xi_crust_mantle,
+                                                  mp->d_nkmin_eta_crust_mantle,                                                  
+                                                  mp->d_njmin_crust_mantle,
+                                                  mp->d_njmax_crust_mantle,
+                                                  mp->d_nimin_crust_mantle,
+                                                  mp->d_nimax_crust_mantle,                                                  
+                                                  d_abs_boundary_normal,
+                                                  d_abs_boundary_jacobian2D,
+                                                  d_wgllwgll,
+                                                  mp->d_ibool_crust_mantle,
+                                                  mp->d_rho_vp_crust_mantle,
+                                                  mp->d_rho_vs_crust_mantle,
+                                                  mp->simulation_type,
+                                                  mp->save_forward,
+                                                  mp->d_b_accel_crust_mantle,
+                                                  d_b_absorb_field);
 
-  compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc,
-                                                  mp->d_accel,
-                                                  mp->d_abs_boundary_ispec,
-                                                  mp->d_abs_boundary_ijk,
-                                                  mp->d_abs_boundary_normal,
-                                                  mp->d_abs_boundary_jacobian2Dw,
-                                                  mp->d_ibool,
-                                                  mp->d_rho_vp,
-                                                  mp->d_rho_vs,
-                                                  mp->d_ispec_is_inner,
-                                                  mp->d_ispec_is_elastic,
-                                                  phase_is_inner,
-                                                  SIMULATION_TYPE,SAVE_FORWARD,
-                                                  mp->d_num_abs_boundary_faces,
-                                                  mp->d_b_accel,
-                                                  mp->d_b_absorb_field);
 
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_stacey_elastic_kernel");
-#endif
-
-  // ! adjoint simulations: stores absorbed wavefield part
-  // if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) &
-  //   write(IOABS,rec=it) b_reclen_field,b_absorb_field,b_reclen_field
-
-  if(SIMULATION_TYPE == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ) {
-    print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_field,mp->d_b_absorb_field,
-                                       mp->d_b_reclen_field,cudaMemcpyDeviceToHost),7701);
-    // The write is done in fortran
-    // write_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&it);
+  // adjoint simulations: stores absorbed wavefield part
+  if(mp->simulation_type == 1 && mp->save_forward && num_abs_boundary_faces > 0 ) {
+    // copies array to CPU  
+    print_CUDA_error_if_any(cudaMemcpy(absorb_field,d_b_absorb_field,
+                            NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyDeviceToHost),7701);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("after compute_stacey_elastic after cudamemcpy");
+  exit_on_cuda_error("compute_stacey_elastic_cuda");
 #endif
 }
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-02-21 00:33:56 UTC (rev 19657)
@@ -418,6 +418,8 @@
   int anisotropic_inner_core;
   int save_boundary_mesh;
 
+  int myrank;
+  
   // ------------------------------------------------------------------ //
   // gravity
   // ------------------------------------------------------------------ //
@@ -490,7 +492,55 @@
   int* d_ibool_interfaces_outer_core;
   realw* d_send_accel_buffer_outer_core;
   
+  // ------------------------------------------------------------------ //
+  // absorbing boundaries
+  // ------------------------------------------------------------------ //
+  
+  int nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle;
+  int nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle;      
 
+  int* d_nimin_crust_mantle, *d_nimax_crust_mantle;
+  int* d_njmin_crust_mantle, *d_njmax_crust_mantle;
+  int* d_nkmin_xi_crust_mantle, *d_nkmin_eta_crust_mantle;
+
+  int* d_ibelm_xmin_crust_mantle, *d_ibelm_xmax_crust_mantle;
+  int* d_ibelm_ymin_crust_mantle, *d_ibelm_ymax_crust_mantle;  
+  
+  realw* d_normal_xmin_crust_mantle, *d_normal_xmax_crust_mantle;
+  realw* d_normal_ymin_crust_mantle, *d_normal_ymax_crust_mantle;
+  
+  realw* d_jacobian2D_xmin_crust_mantle, *d_jacobian2D_xmax_crust_mantle;
+  realw* d_jacobian2D_ymin_crust_mantle, *d_jacobian2D_ymax_crust_mantle;
+      
+  realw* d_absorb_xmin_crust_mantle, *d_absorb_xmax_crust_mantle;
+  realw* d_absorb_ymin_crust_mantle, *d_absorb_ymax_crust_mantle;
+  
+  realw* d_rho_vp_crust_mantle;
+  realw* d_rho_vs_crust_mantle;
+
+  int nspec2D_xmin_outer_core,nspec2D_xmax_outer_core;
+  int nspec2D_ymin_outer_core,nspec2D_ymax_outer_core;      
+  int nspec2D_zmin_outer_core;
+  
+  int* d_nimin_outer_core, *d_nimax_outer_core;
+  int* d_njmin_outer_core, *d_njmax_outer_core;
+  int* d_nkmin_xi_outer_core, *d_nkmin_eta_outer_core;
+  
+  int* d_ibelm_xmin_outer_core, *d_ibelm_xmax_outer_core;
+  int* d_ibelm_ymin_outer_core, *d_ibelm_ymax_outer_core;  
+  int* d_ibelm_zmin_outer_core;
+  
+  realw* d_jacobian2D_xmin_outer_core, *d_jacobian2D_xmax_outer_core;
+  realw* d_jacobian2D_ymin_outer_core, *d_jacobian2D_ymax_outer_core;
+  realw* d_jacobian2D_zmin_outer_core;
+  
+  realw* d_absorb_xmin_outer_core, *d_absorb_xmax_outer_core;
+  realw* d_absorb_ymin_outer_core, *d_absorb_ymax_outer_core;
+  realw* d_absorb_zmin_outer_core;
+  
+  realw* d_vp_outer_core;
+  
+
 // ------------------------------------------------------------------ //
 //daniel: TODO - former code...
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-02-21 00:33:56 UTC (rev 19657)
@@ -168,7 +168,7 @@
     // outputs initial memory infos via cudaMemGetInfo()
     double free_db,used_db,total_db;
     get_free_memory(&free_db,&used_db,&total_db);
-    fprintf(fp,"%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank,
+    fprintf(fp,"%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",myrank,
             used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
 
     fclose(fp);
@@ -183,18 +183,18 @@
 
 extern "C"
 void FC_FUNC_(prepare_constants_device,
-              PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
+              PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer, 
+                                        int* myrank_f,
                                         int* h_NGLLX,
                                         realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
                                         realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
                                         realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
                                         int* NSOURCES,int* nsources_local,
                                         realw* h_sourcearrays,
-                                        int* h_islice_selected_source,
-                                        int* h_ispec_selected_source,
+                                        int* h_islice_selected_source,int* h_ispec_selected_source,
                                         int* h_number_receiver_global,
-                                        int* h_ispec_selected_rec,
-                                        int* nrec,int* nrec_local,
+                                        int* h_islice_selected_rec,int* h_ispec_selected_rec,
+                                        int* nrec,int* nrec_local, int* nadj_rec_local,
                                         int* NSPEC_CRUST_MANTLE, int* NGLOB_CRUST_MANTLE,
                                         int* NSPEC_OUTER_CORE, int* NGLOB_OUTER_CORE,
                                         int* NSPEC_INNER_CORE, int* NGLOB_INNER_CORE,
@@ -257,6 +257,7 @@
   mp->anisotropic_inner_core = *ANISOTROPIC_INNER_CORE_f;
   mp->save_boundary_mesh = *SAVE_BOUNDARY_MESH_f;
 
+  mp->myrank = *myrank_f;
 
   // mesh coloring flag
 #ifdef USE_MESH_COLORING_GPU
@@ -277,7 +278,7 @@
                                        sizeof(realw)* *NSOURCES*3*NGLL3),1301);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_sourcearrays, h_sourcearrays,
                                        sizeof(realw)* *NSOURCES*3*NGLL3,cudaMemcpyHostToDevice),1302);
-
+    // buffer for source time function values
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_stf_pre_compute,
                                        *NSOURCES*sizeof(double)),1303);
   }
@@ -306,8 +307,42 @@
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_selected_rec),(*nrec)*sizeof(int)),1513);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_rec,h_ispec_selected_rec,
                                      (*nrec)*sizeof(int),cudaMemcpyHostToDevice),1514);
-
-
+                                     
+  // receiver adjoint source arrays only used for noise and adjoint simulations
+  // adjoint source arrays
+  mp->nadj_rec_local = *nadj_rec_local;
+  if( mp->nadj_rec_local > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
+                                       (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),6003);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
+                                       (mp->nadj_rec_local)*sizeof(int)),6004);
+    
+    // prepares local irec array:
+    // the irec_local variable needs to be precomputed (as
+    // h_pre_comp..), because normally it is in the loop updating accel,
+    // and due to how it's incremented, it cannot be parallelized
+    int* h_pre_computed_irec = (int*) malloc( (mp->nadj_rec_local)*sizeof(int) );
+    if( h_pre_computed_irec == NULL ) exit_on_error("h_pre_computed_irec not allocated\n");
+    
+    int irec_local = 0;
+    for(int irec = 0; irec < *nrec; irec++) {
+      if(mp->myrank == h_islice_selected_rec[irec]) {
+        irec_local++;
+        h_pre_computed_irec[irec_local-1] = irec;
+      }
+    }
+    if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n");
+    // copies values onto GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
+                                       (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),6010);
+    free(h_pre_computed_irec);
+    
+    // temporary array to prepare extracted source array values
+    mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
+    if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");    
+  }
+  
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_constants_device");
 #endif
@@ -761,9 +796,344 @@
   }
 }
 
+/* ----------------------------------------------------------------------------------------------- */
 
+// STRAIN simulations
+
 /* ----------------------------------------------------------------------------------------------- */
 
+extern "C"
+void FC_FUNC_(prepare_fields_absorb_device,
+              PREPARE_FIELDS_ABSORB_DEVICE)(long* Mesh_pointer_f,
+                                            int* nspec2D_xmin_crust_mantle,int* nspec2D_xmax_crust_mantle,
+                                            int* nspec2D_ymin_crust_mantle,int* nspec2D_ymax_crust_mantle,
+                                            int* NSPEC2DMAX_XMIN_XMAX_CM,int* NSPEC2DMAX_YMIN_YMAX_CM,
+                                            int* nimin_crust_mantle,int* nimax_crust_mantle,
+                                            int* njmin_crust_mantle,int* njmax_crust_mantle,
+                                            int* nkmin_xi_crust_mantle,int* nkmin_eta_crust_mantle,                                     
+                                            int* ibelm_xmin_crust_mantle,int* ibelm_xmax_crust_mantle,
+                                            int* ibelm_ymin_crust_mantle,int* ibelm_ymax_crust_mantle,                                       
+                                            realw* normal_xmin_crust_mantle,realw* normal_xmax_crust_mantle,
+                                            realw* normal_ymin_crust_mantle,realw* normal_ymax_crust_mantle,                                     
+                                            realw* jacobian2D_xmin_crust_mantle, realw* jacobian2D_xmax_crust_mantle,
+                                            realw* jacobian2D_ymin_crust_mantle, realw* jacobian2D_ymax_crust_mantle,
+                                            realw* rho_vp_crust_mantle,
+                                            realw* rho_vs_crust_mantle,
+                                            int* nspec2D_xmin_outer_core,int* nspec2D_xmax_outer_core,
+                                            int* nspec2D_ymin_outer_core,int* nspec2D_ymax_outer_core,
+                                            int* nspec2D_zmin_outer_core,
+                                            int* NSPEC2DMAX_XMIN_XMAX_OC,int* NSPEC2DMAX_YMIN_YMAX_OC,
+                                            int* nimin_outer_core,int* nimax_outer_core,
+                                            int* njmin_outer_core,int* njmax_outer_core,
+                                            int* nkmin_xi_outer_core,int* nkmin_eta_outer_core,                                     
+                                            int* ibelm_xmin_outer_core,int* ibelm_xmax_outer_core,
+                                            int* ibelm_ymin_outer_core,int* ibelm_ymax_outer_core,
+                                            int* ibelm_bottom_outer_core,
+                                            realw* jacobian2D_xmin_outer_core, realw* jacobian2D_xmax_outer_core,
+                                            realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
+                                            realw* jacobian2D_bottom_outer_core,                                            
+                                            realw* vp_outer_core
+                                            ) {
+  
+  TRACE("prepare_fields_absorb_device");
+  int size;
+  
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  
+  // checks flag
+  if( ! mp->absorbing_conditions ){ exit_on_cuda_error("prepare_fields_absorb_device absorbing_conditions not properly initialized"); }
+  
+  // crust_mantle
+  mp->nspec2D_xmin_crust_mantle = *nspec2D_xmin_crust_mantle;
+  mp->nspec2D_xmax_crust_mantle = *nspec2D_xmax_crust_mantle;
+  mp->nspec2D_ymin_crust_mantle = *nspec2D_ymin_crust_mantle;
+  mp->nspec2D_ymax_crust_mantle = *nspec2D_ymax_crust_mantle;
+
+  // vp & vs 
+  size = NGLL3*(mp->NSPEC_CRUST_MANTLE);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rho_vp_crust_mantle,
+                                     size*sizeof(realw)),2201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp_crust_mantle,rho_vp_crust_mantle,
+                                     size*sizeof(realw),cudaMemcpyHostToDevice),2202);
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rho_vs_crust_mantle,
+                                     size*sizeof(realw)),2201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs_crust_mantle,rho_vs_crust_mantle,
+                                     size*sizeof(realw),cudaMemcpyHostToDevice),2202);
+
+  // ijk index arrays
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nkmin_xi_crust_mantle,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_nkmin_xi_crust_mantle,nkmin_xi_crust_mantle,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nkmin_eta_crust_mantle,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_nkmin_eta_crust_mantle,nkmin_eta_crust_mantle,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
+  
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmin_crust_mantle,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_njmin_crust_mantle,njmin_crust_mantle,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
+  
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmax_crust_mantle,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_njmax_crust_mantle,njmax_crust_mantle,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nimin_crust_mantle,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_nimin_crust_mantle,nimin_crust_mantle,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nimax_crust_mantle,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_nimax_crust_mantle,nimax_crust_mantle,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
+  
+  
+  // xmin  
+  if( mp->nspec2D_xmin_crust_mantle > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_xmin_crust_mantle,
+                                       (mp->nspec2D_xmin_crust_mantle)*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmin_crust_mantle,ibelm_xmin_crust_mantle,
+                                       (mp->nspec2D_xmin_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_xmin_crust_mantle,
+                            NDIM*NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_xmin_crust_mantle,normal_xmin_crust_mantle,
+                            NDIM*NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_xmin_crust_mantle,
+                            NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmin_crust_mantle,jacobian2D_xmin_crust_mantle,
+                            NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    // boundary buffer
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+      print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmin_crust_mantle,
+                              NDIM*NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw)),1202);
+    }    
+  }
+
+  // xmax
+  if( mp->nspec2D_xmax_crust_mantle > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_xmax_crust_mantle,
+                                       (mp->nspec2D_xmax_crust_mantle)*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmax_crust_mantle,ibelm_xmax_crust_mantle,
+                                       (mp->nspec2D_xmax_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
+        
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_xmax_crust_mantle,
+                                       NDIM*NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_xmax_crust_mantle,normal_xmax_crust_mantle,
+                                       NDIM*NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_xmax_crust_mantle,
+                                       NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmax_crust_mantle,jacobian2D_xmax_crust_mantle,
+                                       NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    // boundary buffer
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+      print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmax_crust_mantle,
+                                         NDIM*NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw)),1202);
+    }    
+  }
+
+  // ymin
+  if( mp->nspec2D_ymin_crust_mantle > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_ymin_crust_mantle,
+                                       (mp->nspec2D_ymin_crust_mantle)*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymin_crust_mantle,ibelm_ymin_crust_mantle,
+                                       (mp->nspec2D_ymin_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_ymin_crust_mantle,
+                                       NDIM*NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_ymin_crust_mantle,normal_ymin_crust_mantle,
+                                       NDIM*NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymin_crust_mantle,
+                                       NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymin_crust_mantle,jacobian2D_ymin_crust_mantle,
+                                       NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    // boundary buffer
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+      print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymin_crust_mantle,
+                                         NDIM*NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw)),1202);
+    }    
+  }
+
+  // ymax
+  if( mp->nspec2D_ymax_crust_mantle > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_ymax_crust_mantle,
+                                       (mp->nspec2D_ymax_crust_mantle)*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymax_crust_mantle,ibelm_ymax_crust_mantle,
+                                       (mp->nspec2D_ymax_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_ymax_crust_mantle,
+                                       NDIM*NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_ymax_crust_mantle,normal_ymax_crust_mantle,
+                                       NDIM*NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymax_crust_mantle,
+                                       NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymax_crust_mantle,jacobian2D_ymax_crust_mantle,
+                                       NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    // boundary buffer
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+      print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymax_crust_mantle,
+                                         NDIM*NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw)),1202);
+    }    
+  }
+  
+
+  // outer_core
+  mp->nspec2D_xmin_outer_core = *nspec2D_xmin_outer_core;
+  mp->nspec2D_xmax_outer_core = *nspec2D_xmax_outer_core;
+  mp->nspec2D_ymin_outer_core = *nspec2D_ymin_outer_core;
+  mp->nspec2D_ymax_outer_core = *nspec2D_ymax_outer_core;
+  mp->nspec2D_zmin_outer_core = *nspec2D_zmin_outer_core;
+  
+  // vp 
+  size = NGLL3*(mp->NSPEC_OUTER_CORE);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_vp_outer_core,
+                                     size*sizeof(realw)),2201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_vp_outer_core,vp_outer_core,
+                                     size*sizeof(realw),cudaMemcpyHostToDevice),2202);
+  
+  // ijk index arrays
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nkmin_xi_outer_core,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_nkmin_xi_outer_core,nkmin_xi_outer_core,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
+  
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nkmin_eta_outer_core,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_nkmin_eta_outer_core,nkmin_eta_outer_core,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
+  
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmin_outer_core,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_njmin_outer_core,njmin_outer_core,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
+  
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmax_outer_core,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_njmax_outer_core,njmax_outer_core,
+                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
+  
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nimin_outer_core,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_nimin_outer_core,nimin_outer_core,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
+  
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nimax_outer_core,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_nimax_outer_core,nimax_outer_core,
+                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
+  
+  // xmin  
+  if( mp->nspec2D_xmin_outer_core > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_xmin_outer_core,
+                                       (mp->nspec2D_xmin_outer_core)*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmin_outer_core,ibelm_xmin_outer_core,
+                                       (mp->nspec2D_xmin_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_xmin_outer_core,
+                                       NGLL2*(mp->nspec2D_xmin_outer_core)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmin_outer_core,jacobian2D_xmin_outer_core,
+                                       NGLL2*(mp->nspec2D_xmin_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    // boundary buffer
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+      print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmin_outer_core,
+                                         NGLL2*(mp->nspec2D_xmin_outer_core)*sizeof(realw)),1202);
+    }    
+  }
+
+  // xmax
+  if( mp->nspec2D_xmax_outer_core > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_xmax_outer_core,
+                                       (mp->nspec2D_xmax_outer_core)*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmax_outer_core,ibelm_xmax_outer_core,
+                                       (mp->nspec2D_xmax_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_xmax_outer_core,
+                                       NGLL2*(mp->nspec2D_xmax_outer_core)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmax_outer_core,jacobian2D_xmax_outer_core,
+                                       NGLL2*(mp->nspec2D_xmax_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    // boundary buffer
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+      print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmax_outer_core,
+                                         NGLL2*(mp->nspec2D_xmax_outer_core)*sizeof(realw)),1202);
+    }    
+  }
+  
+  // ymin
+  if( mp->nspec2D_ymin_outer_core > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_ymin_outer_core,
+                                       (mp->nspec2D_ymin_outer_core)*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymin_outer_core,ibelm_ymin_outer_core,
+                                       (mp->nspec2D_ymin_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymin_outer_core,
+                                       NGLL2*(mp->nspec2D_ymin_outer_core)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymin_outer_core,jacobian2D_ymin_outer_core,
+                                       NGLL2*(mp->nspec2D_ymin_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    // boundary buffer
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+      print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymin_outer_core,
+                                         NGLL2*(mp->nspec2D_ymin_outer_core)*sizeof(realw)),1202);
+    }    
+  }
+  
+  // ymax
+  if( mp->nspec2D_ymax_outer_core > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_ymax_outer_core,
+                                       (mp->nspec2D_ymax_outer_core)*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymax_outer_core,ibelm_ymax_outer_core,
+                                       (mp->nspec2D_ymax_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymax_outer_core,
+                                       NGLL2*(mp->nspec2D_ymax_outer_core)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymax_outer_core,jacobian2D_ymax_outer_core,
+                                       NGLL2*(mp->nspec2D_ymax_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    // boundary buffer
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+      print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymax_outer_core,
+                                         NGLL2*(mp->nspec2D_ymax_outer_core)*sizeof(realw)),1202);
+    }    
+  }
+
+  // zmin
+  if( mp->nspec2D_zmin_outer_core > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_zmin_outer_core,
+                                       (mp->nspec2D_zmin_outer_core)*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_zmin_outer_core,ibelm_bottom_outer_core,
+                                       (mp->nspec2D_zmin_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
+    
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_zmin_outer_core,
+                                       NGLL2*(mp->nspec2D_zmin_outer_core)*sizeof(realw)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_zmin_outer_core,jacobian2D_bottom_outer_core,
+                                       NGLL2*(mp->nspec2D_zmin_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
+    
+    // boundary buffer
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+      print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_zmin_outer_core,
+                                         NGLL2*(mp->nspec2D_zmin_outer_core)*sizeof(realw)),1202);
+    }    
+  }
+  
+}  
+
+/* ----------------------------------------------------------------------------------------------- */
+
 // MPI interfaces
 
 /* ----------------------------------------------------------------------------------------------- */

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-02-21 00:33:56 UTC (rev 19657)
@@ -122,78 +122,32 @@
 
 
 //
-// src/cuda/compute_add_sources_acoustic_cuda.cu
-//
-
-void FC_FUNC_(compute_add_sources_ac_cuda,
-              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
-                                                 int* phase_is_innerf,
-                                                 int* NSOURCESf,
-                                                 int* SIMULATION_TYPEf,
-                                                 double* h_stf_pre_compute,
-                                                 int* myrankf) {} 
-
-void FC_FUNC_(compute_add_sources_ac_s3_cuda,
-              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
-                                                      int* phase_is_innerf,
-                                                      int* NSOURCESf,
-                                                      int* SIMULATION_TYPEf,
-                                                      double* h_stf_pre_compute,
-                                                      int* myrankf) {} 
-
-void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
-              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
-                                               realw* h_adj_sourcearrays,
-                                               int* phase_is_inner,
-                                               int* h_ispec_is_inner,
-                                               int* h_ispec_is_acoustic,
-                                               int* h_ispec_selected_rec,
-                                               int* myrank,
-                                               int* nrec,
-                                               int* time_index,
-                                               int* h_islice_selected_rec,
-                                               int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {} 
-
-
-//
 // src/cuda/compute_add_sources_elastic_cuda.cu
 //
 
 void FC_FUNC_(compute_add_sources_el_cuda,
               COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
-                                            int* phase_is_innerf,
-                                            int* NSOURCESf,
-                                            double* h_stf_pre_compute,
-                                            int* myrankf) {} 
+                                           int* NSOURCESf,
+                                           double* h_stf_pre_compute) {} 
 
 void FC_FUNC_(compute_add_sources_el_s3_cuda,
-              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
-                                              double* h_stf_pre_compute,
+              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer_f,
                                               int* NSOURCESf,
-                                              int* phase_is_inner,
-                                              int* myrank) {} 
+                                              double* h_stf_pre_compute) {} 
 
 void FC_FUNC_(add_source_master_rec_noise_cu,
               ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
-                                                int* myrank_f,
                                                 int* it_f,
                                                 int* irec_master_noise_f,
                                                 int* islice_selected_rec) {} 
 
 void FC_FUNC_(add_sources_el_sim_type_2_or_3,
               ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
-                                               realw* h_adj_sourcearrays,
-                                               int* phase_is_inner,
-                                               int* h_ispec_is_inner,
-                                               int* h_ispec_is_elastic,
-                                               int* h_ispec_selected_rec,
-                                               int* myrank,
-                                               int* nrec,
-                                               int* time_index,
-                                               int* h_islice_selected_rec,
-                                               int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {} 
+                                              int* nrec,
+                                              realw* h_adj_sourcearrays,
+                                              int* h_islice_selected_rec,
+                                              int* h_ispec_selected_rec,
+                                              int* time_index) {} 
 
 
 //
@@ -278,12 +232,9 @@
 //
 
 void FC_FUNC_(compute_stacey_acoustic_cuda,
-              COMPUTE_STACEY_ACOUSTIC_CUDA)(
-                                    long* Mesh_pointer_f,
-                                    int* phase_is_innerf,
-                                    int* SIMULATION_TYPEf,
-                                    int* SAVE_FORWARDf,
-                                    realw* h_b_absorb_potential) {} 
+              COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+                                            realw* absorb_potential,
+                                            int* itype) {} 
 
 
 //
@@ -292,10 +243,8 @@
 
 void FC_FUNC_(compute_stacey_elastic_cuda,
               COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
-                                           int* phase_is_innerf,
-                                           int* SIMULATION_TYPEf,
-                                           int* SAVE_FORWARDf,
-                                           realw* h_b_absorb_field) {} 
+                                                realw* absorb_field,
+                                                int* itype) {} 
 
 
 //
@@ -381,17 +330,17 @@
 
 void FC_FUNC_(prepare_constants_device,
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
+                                        int* myrank_f,
                                         int* h_NGLLX,
                                         realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
                                         realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
                                         realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
                                         int* NSOURCES,int* nsources_local,
                                         realw* h_sourcearrays,
-                                        int* h_islice_selected_source,
-                                        int* h_ispec_selected_source,
+                                        int* h_islice_selected_source,int* h_ispec_selected_source,
                                         int* h_number_receiver_global,
-                                        int* h_ispec_selected_rec,
-                                        int* nrec,int* nrec_local,
+                                        int* h_islice_selected_rec,int* h_ispec_selected_rec,
+                                        int* nrec,int* nrec_local, int* nadj_rec_local,
                                         int* NSPEC_CRUST_MANTLE, int* NGLOB_CRUST_MANTLE,
                                         int* NSPEC_OUTER_CORE, int* NGLOB_OUTER_CORE,
                                         int* NSPEC_INNER_CORE, int* NGLOB_INNER_CORE,
@@ -480,6 +429,38 @@
                                             realw* b_eps_trace_over_3_inner_core
                                             ) {} 
 
+void FC_FUNC_(prepare_fields_absorb_device,
+              PREPARE_FIELDS_ABSORB_DEVICE)(long* Mesh_pointer_f,
+                                            int* nspec2D_xmin_crust_mantle,int* nspec2D_xmax_crust_mantle,
+                                            int* nspec2D_ymin_crust_mantle,int* nspec2D_ymax_crust_mantle,
+                                            int* NSPEC2DMAX_XMIN_XMAX_CM,int* NSPEC2DMAX_YMIN_YMAX_CM,
+                                            int* nimin_crust_mantle,int* nimax_crust_mantle,
+                                            int* njmin_crust_mantle,int* njmax_crust_mantle,
+                                            int* nkmin_xi_crust_mantle,int* nkmin_eta_crust_mantle,
+                                            int* ibelm_xmin_crust_mantle,int* ibelm_xmax_crust_mantle,
+                                            int* ibelm_ymin_crust_mantle,int* ibelm_ymax_crust_mantle,
+                                            realw* normal_xmin_crust_mantle,realw* normal_xmax_crust_mantle,
+                                            realw* normal_ymin_crust_mantle,realw* normal_ymax_crust_mantle,
+                                            realw* jacobian2D_xmin_crust_mantle, realw* jacobian2D_xmax_crust_mantle,
+                                            realw* jacobian2D_ymin_crust_mantle, realw* jacobian2D_ymax_crust_mantle,
+                                            realw* rho_vp_crust_mantle,
+                                            realw* rho_vs_crust_mantle,
+                                            int* nspec2D_xmin_outer_core,int* nspec2D_xmax_outer_core,
+                                            int* nspec2D_ymin_outer_core,int* nspec2D_ymax_outer_core,
+                                            int* nspec2D_zmin_outer_core,
+                                            int* NSPEC2DMAX_XMIN_XMAX_OC,int* NSPEC2DMAX_YMIN_YMAX_OC,
+                                            int* nimin_outer_core,int* nimax_outer_core,
+                                            int* njmin_outer_core,int* njmax_outer_core,
+                                            int* nkmin_xi_outer_core,int* nkmin_eta_outer_core,
+                                            int* ibelm_xmin_outer_core,int* ibelm_xmax_outer_core,
+                                            int* ibelm_ymin_outer_core,int* ibelm_ymax_outer_core,
+                                            int* ibelm_bottom_outer_core,
+                                            realw* jacobian2D_xmin_outer_core, realw* jacobian2D_xmax_outer_core,
+                                            realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
+                                            realw* jacobian2D_bottom_outer_core,
+                                            realw* vp_outer_core
+                                            ) {} 
+
 void FC_FUNC_(prepare_mpi_buffers_device,
               PREPARE_MPI_BUFFERS_DEVICE)(long* Mesh_pointer_f,
                                           int* num_interfaces_crust_mantle,

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/Makefile.in	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/Makefile.in	2012-02-21 00:33:56 UTC (rev 19657)
@@ -187,7 +187,6 @@
   $O/assemble_MPI_scalar_cuda.cuda.o \
 	$O/assemble_MPI_vector_cuda.cuda.o \
   $O/check_fields_cuda.cuda.o \
-	$O/compute_add_sources_acoustic_cuda.cuda.o \
 	$O/compute_add_sources_elastic_cuda.cuda.o \
 	$O/compute_coupling_cuda.cuda.o \
 	$O/compute_forces_crust_mantle_cuda.cuda.o \

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90	2012-02-21 00:33:56 UTC (rev 19657)
@@ -25,159 +25,123 @@
 !
 !=====================================================================
 
-  subroutine compute_add_sources(myrank,NSOURCES, &
-                                accel_crust_mantle,sourcearrays, &
-                                DT,t0,tshift_cmt,hdur_gaussian,ibool_crust_mantle, &
-                                islice_selected_source,ispec_selected_source,it, &
-                                hdur,xi_source,eta_source,gamma_source,nu_source)
+  subroutine compute_add_sources() 
 
+  use specfem_par
+  use specfem_par_crustmantle,only: accel_crust_mantle,ibool_crust_mantle  
   implicit none
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
-
-  integer myrank,NSOURCES
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: &
-    accel_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES) :: sourcearrays
-
-  double precision, dimension(NSOURCES) :: tshift_cmt,hdur_gaussian
-
-  double precision :: DT,t0
-
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
-
-  integer, dimension(NSOURCES) :: islice_selected_source,ispec_selected_source
-  integer :: it
-
-  ! needed for point force sources
-  double precision, dimension(NSOURCES) :: xi_source,eta_source,gamma_source
-  double precision, dimension(NDIM,NDIM,NSOURCES) :: nu_source
-  double precision, dimension(NSOURCES) :: hdur
-
   ! local parameters
   double precision :: stf
   real(kind=CUSTOM_REAL) :: stf_used
   integer :: isource,i,j,k,iglob,ispec
-  double precision, external :: comp_source_time_function
   double precision :: f0
+  double precision, dimension(NSOURCES) :: stf_pre_compute
+
+  double precision, external :: comp_source_time_function
   double precision, external :: comp_source_time_function_rickr
 
-  do isource = 1,NSOURCES
+  if( .not. GPU_MODE ) then
+    ! on CPU
+    do isource = 1,NSOURCES
 
+      ! add only if this proc carries the source
+      if(myrank == islice_selected_source(isource)) then
 
-    ! add only if this proc carries the source
-    if(myrank == islice_selected_source(isource)) then
+        if(USE_FORCE_POINT_SOURCE) then
 
-      if(USE_FORCE_POINT_SOURCE) then
+          ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
+          iglob = ibool_crust_mantle(nint(xi_source(isource)), &
+                         nint(eta_source(isource)), &
+                         nint(gamma_source(isource)), &
+                         ispec_selected_source(isource))
 
-        ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
-        iglob = ibool_crust_mantle(nint(xi_source(isource)), &
-                       nint(eta_source(isource)), &
-                       nint(gamma_source(isource)), &
-                       ispec_selected_source(isource))
+          f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
 
-        f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
+          !if (it == 1 .and. myrank == 0) then
+          !  write(IMAIN,*) 'using a source of dominant frequency ',f0
+          !  write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
+          !  write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
+          !endif
 
-        !if (it == 1 .and. myrank == 0) then
-        !  write(IMAIN,*) 'using a source of dominant frequency ',f0
-        !  write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
-        !  write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
-        !endif
+          ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
+          stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0)
 
-        ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
-        stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0)
+          ! we use a force in a single direction along one of the components:
+          !  x/y/z or E/N/Z-direction would correspond to 1/2/3 = COMPONENT_FORCE_SOURCE
+          ! e.g. nu_source(3,:) here would be a source normal to the surface (z-direction).
+          accel_crust_mantle(:,iglob) = accel_crust_mantle(:,iglob)  &
+                           + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used
 
-        ! we use a force in a single direction along one of the components:
-        !  x/y/z or E/N/Z-direction would correspond to 1/2/3 = COMPONENT_FORCE_SOURCE
-        ! e.g. nu_source(3,:) here would be a source normal to the surface (z-direction).
-        accel_crust_mantle(:,iglob) = accel_crust_mantle(:,iglob)  &
-                         + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used
+        else
 
-      else
+          stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
 
-        stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+          !     distinguish between single and double precision for reals
+          if(CUSTOM_REAL == SIZE_REAL) then
+            stf_used = sngl(stf)
+          else
+            stf_used = stf
+          endif
 
-        !     distinguish between single and double precision for reals
-        if(CUSTOM_REAL == SIZE_REAL) then
-          stf_used = sngl(stf)
-        else
-          stf_used = stf
-        endif
+          !     add source array
+          ispec = ispec_selected_source(isource)
+          do k=1,NGLLZ
+            do j=1,NGLLY
+              do i=1,NGLLX
+                iglob = ibool_crust_mantle(i,j,k,ispec)
 
-        !     add source array
-        ispec = ispec_selected_source(isource)
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              iglob = ibool_crust_mantle(i,j,k,ispec)
+                accel_crust_mantle(:,iglob) = accel_crust_mantle(:,iglob) &
+                  + sourcearrays(:,i,j,k,isource)*stf_used
 
-              accel_crust_mantle(:,iglob) = accel_crust_mantle(:,iglob) &
-                + sourcearrays(:,i,j,k,isource)*stf_used
-
+              enddo
             enddo
           enddo
-        enddo
 
-      endif ! USE_FORCE_POINT_SOURCE
+        endif ! USE_FORCE_POINT_SOURCE
 
+      endif
+
+    enddo
+
+  else
+    ! on GPU
+    call load_GPU_elastic()
+    
+    ! prepares buffer with source time function values, to be copied onto GPU
+    if(USE_FORCE_POINT_SOURCE) then
+      do isource = 1,NSOURCES
+        stf_pre_compute(isource) = &
+          FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0)
+      enddo
+    else
+      do isource = 1,NSOURCES
+        stf_pre_compute(isource) = &
+          comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))      
+      enddo
     endif
+    ! adds sources: only implements SIMTYPE=1 and NOISE_TOM=0
+    call compute_add_sources_el_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
 
-  enddo
+    call load_CPU_elastic()    
+  endif
 
+
   end subroutine compute_add_sources
 
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine compute_add_sources_adjoint(myrank,nrec, &
-                                nadj_rec_local,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC, &
-                                accel_crust_mantle,adj_sourcearrays, &
-                                nu,xi_receiver,eta_receiver,gamma_receiver, &
-                                xigll,yigll,zigll,ibool_crust_mantle, &
-                                islice_selected_rec,ispec_selected_rec, &
-                                NSTEP_SUB_ADJ,iadjsrc_len,iadjsrc,iadj_vec, &
-                                it,it_begin,station_name,network_name,DT)
+  subroutine compute_add_sources_adjoint()
 
+  use specfem_par
+  use specfem_par_crustmantle,only: accel_crust_mantle,ibool_crust_mantle  
   implicit none
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
-
-  integer myrank,nrec,nadj_rec_local,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC
-
-  real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_CRUST_MANTLE) :: &
-    accel_crust_mantle
-
-  real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC) :: &
-    adj_sourcearrays
-
-  double precision, dimension(NDIM,NDIM,nrec) :: nu
-  double precision, dimension(nrec) :: xi_receiver,eta_receiver,gamma_receiver
-  double precision, dimension(NGLLX) :: xigll
-  double precision, dimension(NGLLY) :: yigll
-  double precision, dimension(NGLLZ) :: zigll
-  double precision :: DT
-
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
-  integer, dimension(nrec) :: islice_selected_rec,ispec_selected_rec
-
-  integer NSTEP_SUB_ADJ
-  integer, dimension(NSTEP_SUB_ADJ) :: iadjsrc_len
-  integer, dimension(NSTEP_SUB_ADJ,2) :: iadjsrc ! to read input in chunks
-  integer, dimension(NSTEP) :: iadj_vec
-
-  integer :: it,it_begin,itime
-
-  character(len=MAX_LENGTH_STATION_NAME), dimension(nrec) :: station_name
-  character(len=MAX_LENGTH_NETWORK_NAME), dimension(nrec) :: network_name
-
   ! local parameters
   real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: adj_sourcearray
-  integer :: irec,irec_local,i,j,k,iglob,it_sub_adj
+  integer :: irec,irec_local,i,j,k,iglob,it_sub_adj,itime
   character(len=150) :: adj_source_file
   logical :: ibool_read_adj_arrays
 
@@ -228,88 +192,101 @@
 
   endif
 
-  irec_local = 0
-  do irec = 1,nrec
+  ! adds adjoint sources
+  if( .not. GPU_MODE ) then
+    ! on CPU
+    irec_local = 0
+    do irec = 1,nrec
 
-    ! adds source (only if this proc carries the source)
-    if(myrank == islice_selected_rec(irec)) then
-      irec_local = irec_local + 1
+      ! adds source (only if this proc carries the source)
+      if(myrank == islice_selected_rec(irec)) then
+        irec_local = irec_local + 1
 
-      ! adds source contributions
-      do k=1,NGLLZ
-        do j=1,NGLLY
-          do i=1,NGLLX
-            iglob = ibool_crust_mantle(i,j,k,ispec_selected_rec(irec))
+        ! adds source contributions
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              iglob = ibool_crust_mantle(i,j,k,ispec_selected_rec(irec))
 
 
-            ! adds adjoint source acting at this time step (it):
-            !
-            ! note: we use index iadj_vec(it) which is the corresponding time step
-            !          for the adjoint source acting at this time step (it)
-            !
-            ! see routine: setup_sources_receivers_adjindx() how this adjoint index array is set up
-            !
-            !           e.g. total length NSTEP = 3000, chunk length NTSTEP_BETWEEN_READ_ADJSRC= 1000
-            !           then for it=1,..1000, first block has iadjsrc(1,1) with start = 2001 and end = 3000;
-            !           corresponding iadj_vec(it) goes from
-            !           iadj_vec(1) = 1000, iadj_vec(2) = 999 to iadj_vec(1000) = 1,
-            !           that is, originally the idea was
-            !           adj_sourcearrays(.. iadj_vec(1) ) corresponds to adjoint source trace at time index 3000
-            !           adj_sourcearrays(.. iadj_vec(2) ) corresponds to adjoint source trace at time index 2999
-            !           ..
-            !           adj_sourcearrays(.. iadj_vec(1000) ) corresponds to adjoint source trace at time index 2001
-            !           then a new block will be read, etc, and it is going down till to adjoint source trace at time index 1
-            !
-            ! now comes the tricky part:
-            !           adjoint source traces are based on the seismograms from the forward run;
-            !           such seismograms have a time step index 1 which corresponds to time -t0
-            !           then time step index 2 which corresponds to -t0 + DT, and
-            !           the last time step in the file at time step NSTEP corresponds to time -t0 + (NSTEP-1)*DT
-            !           (see how we add the sources to the simulation in compute_add_sources() and
-            !             how we write/save the seismograms and wavefields at the end of the time loop).
-            !
-            !           then you use that seismogram and take e.g. the velocity of it for a travetime adjoint source
-            !
-            !           now we read it in again, and remember the last time step in
-            !           the file at NSTEP corresponds to -t0 + (NSTEP-1)*DT
-            !
-            !           the same time step is saved for the forward wavefields to reconstruct them;
-            !           however, the Newmark time scheme acts at the very beginning of this time loop
-            !           such that we have the backward/reconstructed wavefield updated by
-            !           a single time step into the direction -DT and b_displ(it=1) would  corresponds to -t0 + (NSTEP-1)*DT - DT
-            !           after the Newmark (predictor) time step update.
-            !           however, we will read the backward/reconstructed wavefield at the end of the first time loop,
-            !           such that b_displ(it=1) corresponds to -t0 + (NSTEP-1)*DT (which is the one saved in the files).
-            !
-            !           for the kernel calculations, we want:
-            !             adjoint wavefield at time t, starting from 0 to T
-            !             and forward wavefield at time T-t, starting from T down to 0
-            !           let's say time 0 corresponds to -t0 = -t0 + (it - 1)*DT at it=1
-            !             and time T corresponds to -t0 + (NSTEP-1)*DT  at it = NSTEP
-            !
-            !           as seen before, the time for the forward wavefield b_displ(it=1) would then
-            !           correspond to time -t0 + (NSTEP-1)*DT - DT, which is T - DT.
-            !           the corresponding time for the adjoint wavefield thus would be 0 + DT
-            !           and the adjoint source index would be iadj_vec(it+1)
-            !           however, iadj_vec(it+1) which would go from 999 down to 0. 0 is out of bounds.
-            !           we thus would have to read in the adjoint source trace beginning from 2999 down to 0.
-            !           index 0 is not defined in the adjoint source trace, and would be set to zero.
-            !
-            !           however, since this complicates things, we read the backward/reconstructed
-            !           wavefield at the end of the first time loop, such that b_displ(it=1) corresponds to -t0 + (NSTEP-1)*DT.
-            !           assuming that until that end the backward/reconstructed wavefield and adjoint fields
-            !           have a zero contribution to adjoint kernels.
-            accel_crust_mantle(:,iglob) = accel_crust_mantle(:,iglob) &
-                          + adj_sourcearrays(:,i,j,k,irec_local,iadj_vec(it))
+              ! adds adjoint source acting at this time step (it):
+              !
+              ! note: we use index iadj_vec(it) which is the corresponding time step
+              !          for the adjoint source acting at this time step (it)
+              !
+              ! see routine: setup_sources_receivers_adjindx() how this adjoint index array is set up
+              !
+              !           e.g. total length NSTEP = 3000, chunk length NTSTEP_BETWEEN_READ_ADJSRC= 1000
+              !           then for it=1,..1000, first block has iadjsrc(1,1) with start = 2001 and end = 3000;
+              !           corresponding iadj_vec(it) goes from
+              !           iadj_vec(1) = 1000, iadj_vec(2) = 999 to iadj_vec(1000) = 1,
+              !           that is, originally the idea was
+              !           adj_sourcearrays(.. iadj_vec(1) ) corresponds to adjoint source trace at time index 3000
+              !           adj_sourcearrays(.. iadj_vec(2) ) corresponds to adjoint source trace at time index 2999
+              !           ..
+              !           adj_sourcearrays(.. iadj_vec(1000) ) corresponds to adjoint source trace at time index 2001
+              !           then a new block will be read, etc, and it is going down till to adjoint source trace at time index 1
+              !
+              ! now comes the tricky part:
+              !           adjoint source traces are based on the seismograms from the forward run;
+              !           such seismograms have a time step index 1 which corresponds to time -t0
+              !           then time step index 2 which corresponds to -t0 + DT, and
+              !           the last time step in the file at time step NSTEP corresponds to time -t0 + (NSTEP-1)*DT
+              !           (see how we add the sources to the simulation in compute_add_sources() and
+              !             how we write/save the seismograms and wavefields at the end of the time loop).
+              !
+              !           then you use that seismogram and take e.g. the velocity of it for a travetime adjoint source
+              !
+              !           now we read it in again, and remember the last time step in
+              !           the file at NSTEP corresponds to -t0 + (NSTEP-1)*DT
+              !
+              !           the same time step is saved for the forward wavefields to reconstruct them;
+              !           however, the Newmark time scheme acts at the very beginning of this time loop
+              !           such that we have the backward/reconstructed wavefield updated by
+              !           a single time step into the direction -DT and b_displ(it=1) would  corresponds to -t0 + (NSTEP-1)*DT - DT
+              !           after the Newmark (predictor) time step update.
+              !           however, we will read the backward/reconstructed wavefield at the end of the first time loop,
+              !           such that b_displ(it=1) corresponds to -t0 + (NSTEP-1)*DT (which is the one saved in the files).
+              !
+              !           for the kernel calculations, we want:
+              !             adjoint wavefield at time t, starting from 0 to T
+              !             and forward wavefield at time T-t, starting from T down to 0
+              !           let's say time 0 corresponds to -t0 = -t0 + (it - 1)*DT at it=1
+              !             and time T corresponds to -t0 + (NSTEP-1)*DT  at it = NSTEP
+              !
+              !           as seen before, the time for the forward wavefield b_displ(it=1) would then
+              !           correspond to time -t0 + (NSTEP-1)*DT - DT, which is T - DT.
+              !           the corresponding time for the adjoint wavefield thus would be 0 + DT
+              !           and the adjoint source index would be iadj_vec(it+1)
+              !           however, iadj_vec(it+1) which would go from 999 down to 0. 0 is out of bounds.
+              !           we thus would have to read in the adjoint source trace beginning from 2999 down to 0.
+              !           index 0 is not defined in the adjoint source trace, and would be set to zero.
+              !
+              !           however, since this complicates things, we read the backward/reconstructed
+              !           wavefield at the end of the first time loop, such that b_displ(it=1) corresponds to -t0 + (NSTEP-1)*DT.
+              !           assuming that until that end the backward/reconstructed wavefield and adjoint fields
+              !           have a zero contribution to adjoint kernels.
+              accel_crust_mantle(:,iglob) = accel_crust_mantle(:,iglob) &
+                            + adj_sourcearrays(:,i,j,k,irec_local,iadj_vec(it))
 
+            enddo
           enddo
         enddo
-      enddo
-    endif
+      endif
 
-  enddo
+    enddo
 
+  else
+    ! on GPU
+    call load_GPU_elastic()
+    
+    call add_sources_el_sim_type_2_or_3(Mesh_pointer,nrec,adj_sourcearrays, &
+                                       islice_selected_rec,ispec_selected_rec, &
+                                       iadj_vec(it))    
 
+    call load_CPU_elastic()                                       
+  endif
+
   end subroutine compute_add_sources_adjoint
 
 !
@@ -317,117 +294,116 @@
 !
 
 
-  subroutine compute_add_sources_backward(myrank,NSOURCES,NSTEP, &
-                                b_accel_crust_mantle,sourcearrays, &
-                                DT,t0,tshift_cmt,hdur_gaussian,ibool_crust_mantle, &
-                                islice_selected_source,ispec_selected_source,it, &
-                                hdur,xi_source,eta_source,gamma_source,nu_source)
+  subroutine compute_add_sources_backward()
 
+  use specfem_par
+  use specfem_par_crustmantle,only: b_accel_crust_mantle,ibool_crust_mantle
   implicit none
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
-
-  integer myrank,NSOURCES,NSTEP
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE_ADJOINT) :: &
-    b_accel_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES) :: sourcearrays
-
-  double precision, dimension(NSOURCES) :: tshift_cmt,hdur_gaussian
-
-  double precision :: DT,t0
-
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
-  integer, dimension(NSOURCES) :: islice_selected_source,ispec_selected_source
-  integer :: it
-  ! needed for point force sources
-  double precision, dimension(NSOURCES) :: xi_source,eta_source,gamma_source
-  double precision, dimension(NDIM,NDIM,NSOURCES) :: nu_source
-  double precision, dimension(NSOURCES) :: hdur
-
   ! local parameters
   double precision :: stf
   real(kind=CUSTOM_REAL) :: stf_used
   integer :: isource,i,j,k,iglob,ispec
-  double precision, external :: comp_source_time_function
   double precision :: f0
+  double precision, dimension(NSOURCES) :: stf_pre_compute
+
+  double precision, external :: comp_source_time_function
   double precision, external :: comp_source_time_function_rickr
 
-  do isource = 1,NSOURCES
+  if( .not. GPU_MODE ) then
+    ! on CPU
+    do isource = 1,NSOURCES
 
-    !   add the source (only if this proc carries the source)
-    if(myrank == islice_selected_source(isource)) then
+      !   add the source (only if this proc carries the source)
+      if(myrank == islice_selected_source(isource)) then
 
-! note on backward/reconstructed wavefields:
-!       time for b_displ( it ) corresponds to (NSTEP - (it-1) - 1 )*DT - t0  ...
-!       as we start with saved wavefields b_displ( 1 ) = displ( NSTEP ) which correspond
-!       to a time (NSTEP - 1)*DT - t0
-!       (see sources for simulation_type 1 and seismograms)
-!
-!       now, at the beginning of the time loop, the numerical Newmark time scheme updates
-!       the wavefields, that is b_displ( it=1) would correspond to time (NSTEP -1 - 1)*DT - t0.
-!       however, we read in the backward/reconstructed wavefields at the end of the Newmark time scheme
-!       in the first (it=1) time loop.
-!       this leads to the timing (NSTEP-(it-1)-1)*DT-t0-tshift_cmt for the source time function here
+        ! note on backward/reconstructed wavefields:
+        !       time for b_displ( it ) corresponds to (NSTEP - (it-1) - 1 )*DT - t0  ...
+        !       as we start with saved wavefields b_displ( 1 ) = displ( NSTEP ) which correspond
+        !       to a time (NSTEP - 1)*DT - t0
+        !       (see sources for simulation_type 1 and seismograms)
+        !
+        !       now, at the beginning of the time loop, the numerical Newmark time scheme updates
+        !       the wavefields, that is b_displ( it=1) would correspond to time (NSTEP -1 - 1)*DT - t0.
+        !       however, we read in the backward/reconstructed wavefields at the end of the Newmark time scheme
+        !       in the first (it=1) time loop.
+        !       this leads to the timing (NSTEP-(it-1)-1)*DT-t0-tshift_cmt for the source time function here
+        if(USE_FORCE_POINT_SOURCE) then
 
-      if(USE_FORCE_POINT_SOURCE) then
+           ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
+           iglob = ibool_crust_mantle(nint(xi_source(isource)), &
+                         nint(eta_source(isource)), &
+                         nint(gamma_source(isource)), &
+                         ispec_selected_source(isource))
 
-         ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
-         iglob = ibool_crust_mantle(nint(xi_source(isource)), &
-                       nint(eta_source(isource)), &
-                       nint(gamma_source(isource)), &
-                       ispec_selected_source(isource))
+           f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
 
-         f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
+           !if (it == 1 .and. myrank == 0) then
+           !   write(IMAIN,*) 'using a source of dominant frequency ',f0
+           !   write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
+           !   write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
+           !endif
 
-         !if (it == 1 .and. myrank == 0) then
-         !   write(IMAIN,*) 'using a source of dominant frequency ',f0
-         !   write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
-         !   write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
-         !endif
+           ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
+           stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),f0)
 
-         ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
-         stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),f0)
+           ! e.g. we use nu_source(3,:) here if we want a source normal to the surface.
+           ! note: time step is now at NSTEP-it
+           b_accel_crust_mantle(:,iglob) = b_accel_crust_mantle(:,iglob)  &
+                              + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used
 
-         ! e.g. we use nu_source(3,:) here if we want a source normal to the surface.
-         ! note: time step is now at NSTEP-it
-         b_accel_crust_mantle(:,iglob) = b_accel_crust_mantle(:,iglob)  &
-                            + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used
+        else
 
-      else
+          ! see note above: time step corresponds now to NSTEP-it
+          stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
 
-        ! see note above: time step corresponds now to NSTEP-it
-        stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+          !     distinguish between single and double precision for reals
+          if(CUSTOM_REAL == SIZE_REAL) then
+            stf_used = sngl(stf)
+          else
+            stf_used = stf
+          endif
 
-        !     distinguish between single and double precision for reals
-        if(CUSTOM_REAL == SIZE_REAL) then
-          stf_used = sngl(stf)
-        else
-          stf_used = stf
-        endif
+          !     add source array
+          ispec = ispec_selected_source(isource)
+          do k=1,NGLLZ
+            do j=1,NGLLY
+              do i=1,NGLLX
+                iglob = ibool_crust_mantle(i,j,k,ispec)
 
-        !     add source array
-        ispec = ispec_selected_source(isource)
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              iglob = ibool_crust_mantle(i,j,k,ispec)
+                b_accel_crust_mantle(:,iglob) = b_accel_crust_mantle(:,iglob) &
+                  + sourcearrays(:,i,j,k,isource)*stf_used
 
-              b_accel_crust_mantle(:,iglob) = b_accel_crust_mantle(:,iglob) &
-                + sourcearrays(:,i,j,k,isource)*stf_used
-
+              enddo
             enddo
           enddo
-        enddo
 
-      endif ! USE_FORCE_POINT_SOURCE
+        endif ! USE_FORCE_POINT_SOURCE
 
-    endif
+      endif
 
-  enddo
+    enddo
 
-  end subroutine compute_add_sources_backward
+  else
+    ! on GPU
+    call load_GPU_elastic()
+    
+    ! prepares buffer with source time function values, to be copied onto GPU
+    if(USE_FORCE_POINT_SOURCE) then
+      do isource = 1,NSOURCES
+        stf_pre_compute(isource) = &
+          FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),f0)
+      enddo
+    else
+      do isource = 1,NSOURCES
+        stf_pre_compute(isource) = &
+          comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))      
+      enddo
+    endif
+    ! adds sources: only implements SIMTYPE=3 (and NOISE_TOM=0)
+    call compute_add_sources_el_s3_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
 
+    call load_CPU_elastic()    
+  endif
 
+  end subroutine compute_add_sources_backward

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90	2012-02-21 00:33:56 UTC (rev 19657)
@@ -356,7 +356,7 @@
   use specfem_par_outercore
   implicit none
   
-  ! daniel: TODO - temporary transfers to the GPU
+  ! daniel: TODO - temporary transfers to the CPU
   call transfer_fields_oc_from_device(NGLOB_OUTER_CORE,displ_outer_core, &
                                 veloc_outer_core,accel_outer_core,Mesh_pointer)
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-02-21 00:33:56 UTC (rev 19657)
@@ -207,35 +207,18 @@
       if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) call compute_stacey_crust_mantle()
 
       ! add the sources
-      if (SIMULATION_TYPE == 1) &
-        call compute_add_sources(myrank,NSOURCES, &
-                                  accel_crust_mantle,sourcearrays, &
-                                  DT,t0,tshift_cmt,hdur_gaussian,ibool_crust_mantle, &
-                                  islice_selected_source,ispec_selected_source,it, &
-                                  hdur,xi_source,eta_source,gamma_source,nu_source)
+      if (SIMULATION_TYPE == 1 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) &
+        call compute_add_sources()
 
       ! add adjoint sources
       if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
-        if( nadj_rec_local > 0 ) &
-          call compute_add_sources_adjoint(myrank,nrec, &
-                                  nadj_rec_local,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC, &
-                                  accel_crust_mantle,adj_sourcearrays, &
-                                  nu,xi_receiver,eta_receiver,gamma_receiver, &
-                                  xigll,yigll,zigll,ibool_crust_mantle, &
-                                  islice_selected_rec,ispec_selected_rec, &
-                                  NSTEP_SUB_ADJ,iadjsrc_len,iadjsrc,iadj_vec, &
-                                  it,it_begin,station_name,network_name,DT)
+        if( nadj_rec_local > 0 ) call compute_add_sources_adjoint()
       endif
 
       ! add sources for backward/reconstructed wavefield
-      if (SIMULATION_TYPE == 3) &
-        call compute_add_sources_backward(myrank,NSOURCES,NSTEP, &
-                                  b_accel_crust_mantle,sourcearrays, &
-                                  DT,t0,tshift_cmt,hdur_gaussian,ibool_crust_mantle, &
-                                  islice_selected_source,ispec_selected_source,it, &
-                                  hdur,xi_source,eta_source,gamma_source,nu_source)
+      if (SIMULATION_TYPE == 3 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) &
+        call compute_add_sources_backward()
 
-
       ! NOISE_TOMOGRAPHY
       if ( NOISE_TOMOGRAPHY == 1 ) then
          ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90	2012-02-21 00:33:56 UTC (rev 19657)
@@ -32,6 +32,8 @@
   use specfem_par,only: &
     ichunk,SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
     wgllwgll_xz,wgllwgll_yz,wgllwgll_xy
+  
+  use specfem_par,only: GPU_MODE,Mesh_pointer
 
   use specfem_par_crustmantle, only: &
     veloc_crust_mantle,accel_crust_mantle,b_accel_crust_mantle, &
@@ -50,7 +52,6 @@
     nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
     reclen_xmin_crust_mantle,reclen_xmax_crust_mantle, &
     reclen_ymin_crust_mantle,reclen_ymax_crust_mantle, &
-    nabs_xmin_cm,nabs_xmax_cm,nabs_ymin_cm,nabs_ymax_cm, &
     absorb_xmin_crust_mantle,absorb_xmax_crust_mantle, &
     absorb_ymin_crust_mantle,absorb_ymax_crust_mantle
 
@@ -72,7 +73,8 @@
 
 
   ! crust & mantle
-
+  if( GPU_MODE ) call load_GPU_elastic()   
+  
   !   xmin
   ! if two chunks exclude this face for one of them
   if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
@@ -84,54 +86,64 @@
       call read_abs(0,absorb_xmin_crust_mantle,reclen_xmin_crust_mantle,NSTEP-it+1)
     endif
 
-    do ispec2D=1,nspec2D_xmin_crust_mantle
+    if( .NOT. GPU_MODE) then
+      ! on CPU
+      do ispec2D=1,nspec2D_xmin_crust_mantle
 
-      ispec=ibelm_xmin_crust_mantle(ispec2D)
+        ispec=ibelm_xmin_crust_mantle(ispec2D)
 
-    ! exclude elements that are not on absorbing edges
-      if(nkmin_xi_crust_mantle(1,ispec2D) == 0 .or. njmin_crust_mantle(1,ispec2D) == 0) cycle
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_xi_crust_mantle(1,ispec2D) == 0 .or. njmin_crust_mantle(1,ispec2D) == 0) cycle
 
-      i=1
-      do k=nkmin_xi_crust_mantle(1,ispec2D),NGLLZ
-        do j=njmin_crust_mantle(1,ispec2D),njmax_crust_mantle(1,ispec2D)
-          iglob=ibool_crust_mantle(i,j,k,ispec)
+        i=1
+        do k=nkmin_xi_crust_mantle(1,ispec2D),NGLLZ
+          do j=njmin_crust_mantle(1,ispec2D),njmax_crust_mantle(1,ispec2D)
+            iglob=ibool_crust_mantle(i,j,k,ispec)
 
-          vx=veloc_crust_mantle(1,iglob)
-          vy=veloc_crust_mantle(2,iglob)
-          vz=veloc_crust_mantle(3,iglob)
+            vx=veloc_crust_mantle(1,iglob)
+            vy=veloc_crust_mantle(2,iglob)
+            vz=veloc_crust_mantle(3,iglob)
 
-          nx=normal_xmin_crust_mantle(1,j,k,ispec2D)
-          ny=normal_xmin_crust_mantle(2,j,k,ispec2D)
-          nz=normal_xmin_crust_mantle(3,j,k,ispec2D)
+            nx=normal_xmin_crust_mantle(1,j,k,ispec2D)
+            ny=normal_xmin_crust_mantle(2,j,k,ispec2D)
+            nz=normal_xmin_crust_mantle(3,j,k,ispec2D)
 
-          vn=vx*nx+vy*ny+vz*nz
+            vn=vx*nx+vy*ny+vz*nz
 
-          tx=rho_vp_crust_mantle(i,j,k,ispec)*vn*nx+rho_vs_crust_mantle(i,j,k,ispec)*(vx-vn*nx)
-          ty=rho_vp_crust_mantle(i,j,k,ispec)*vn*ny+rho_vs_crust_mantle(i,j,k,ispec)*(vy-vn*ny)
-          tz=rho_vp_crust_mantle(i,j,k,ispec)*vn*nz+rho_vs_crust_mantle(i,j,k,ispec)*(vz-vn*nz)
+            tx=rho_vp_crust_mantle(i,j,k,ispec)*vn*nx+rho_vs_crust_mantle(i,j,k,ispec)*(vx-vn*nx)
+            ty=rho_vp_crust_mantle(i,j,k,ispec)*vn*ny+rho_vs_crust_mantle(i,j,k,ispec)*(vy-vn*ny)
+            tz=rho_vp_crust_mantle(i,j,k,ispec)*vn*nz+rho_vs_crust_mantle(i,j,k,ispec)*(vz-vn*nz)
 
-          weight=jacobian2D_xmin_crust_mantle(j,k,ispec2D)*wgllwgll_yz(j,k)
+            weight=jacobian2D_xmin_crust_mantle(j,k,ispec2D)*wgllwgll_yz(j,k)
 
-          accel_crust_mantle(1,iglob)=accel_crust_mantle(1,iglob) - tx*weight
-          accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
-          accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
+            accel_crust_mantle(1,iglob)=accel_crust_mantle(1,iglob) - tx*weight
+            accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
+            accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
 
-          if (SIMULATION_TYPE == 3) then
-            b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_xmin_crust_mantle(:,j,k,ispec2D)
-          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-            absorb_xmin_crust_mantle(1,j,k,ispec2D) = tx*weight
-            absorb_xmin_crust_mantle(2,j,k,ispec2D) = ty*weight
-            absorb_xmin_crust_mantle(3,j,k,ispec2D) = tz*weight
-          endif
+            if (SIMULATION_TYPE == 3) then
+              b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_xmin_crust_mantle(:,j,k,ispec2D)
+            else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+              absorb_xmin_crust_mantle(1,j,k,ispec2D) = tx*weight
+              absorb_xmin_crust_mantle(2,j,k,ispec2D) = ty*weight
+              absorb_xmin_crust_mantle(3,j,k,ispec2D) = tz*weight
+            endif
+          enddo
         enddo
       enddo
-    enddo
-
+      
+    else
+      ! on GPU
+      if( nspec2D_xmin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
+                                                                absorb_xmin_crust_mantle, &
+                                                                0) ! <= xmin              
+    endif
+    
     ! writes absorbing boundary values
     if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmin_crust_mantle > 0 ) then
       call write_abs(0,absorb_xmin_crust_mantle, reclen_xmin_crust_mantle,it)
     endif
-  endif
+    
+  endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC
 
   !   xmax
   ! if two chunks exclude this face for one of them
@@ -142,55 +154,66 @@
       call read_abs(1,absorb_xmax_crust_mantle,reclen_xmax_crust_mantle,NSTEP-it+1)
     endif
 
-    do ispec2D=1,nspec2D_xmax_crust_mantle
+    if(.NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D=1,nspec2D_xmax_crust_mantle
 
-      ispec=ibelm_xmax_crust_mantle(ispec2D)
+        ispec=ibelm_xmax_crust_mantle(ispec2D)
 
-    ! exclude elements that are not on absorbing edges
-      if(nkmin_xi_crust_mantle(2,ispec2D) == 0 .or. njmin_crust_mantle(2,ispec2D) == 0) cycle
+      ! exclude elements that are not on absorbing edges
+        if(nkmin_xi_crust_mantle(2,ispec2D) == 0 .or. njmin_crust_mantle(2,ispec2D) == 0) cycle
 
-      i=NGLLX
-      do k=nkmin_xi_crust_mantle(2,ispec2D),NGLLZ
-        do j=njmin_crust_mantle(2,ispec2D),njmax_crust_mantle(2,ispec2D)
-          iglob=ibool_crust_mantle(i,j,k,ispec)
+        i=NGLLX
+        do k=nkmin_xi_crust_mantle(2,ispec2D),NGLLZ
+          do j=njmin_crust_mantle(2,ispec2D),njmax_crust_mantle(2,ispec2D)
+            iglob=ibool_crust_mantle(i,j,k,ispec)
 
-          vx=veloc_crust_mantle(1,iglob)
-          vy=veloc_crust_mantle(2,iglob)
-          vz=veloc_crust_mantle(3,iglob)
+            vx=veloc_crust_mantle(1,iglob)
+            vy=veloc_crust_mantle(2,iglob)
+            vz=veloc_crust_mantle(3,iglob)
 
-          nx=normal_xmax_crust_mantle(1,j,k,ispec2D)
-          ny=normal_xmax_crust_mantle(2,j,k,ispec2D)
-          nz=normal_xmax_crust_mantle(3,j,k,ispec2D)
+            nx=normal_xmax_crust_mantle(1,j,k,ispec2D)
+            ny=normal_xmax_crust_mantle(2,j,k,ispec2D)
+            nz=normal_xmax_crust_mantle(3,j,k,ispec2D)
 
-          vn=vx*nx+vy*ny+vz*nz
+            vn=vx*nx+vy*ny+vz*nz
 
-          tx=rho_vp_crust_mantle(i,j,k,ispec)*vn*nx+rho_vs_crust_mantle(i,j,k,ispec)*(vx-vn*nx)
-          ty=rho_vp_crust_mantle(i,j,k,ispec)*vn*ny+rho_vs_crust_mantle(i,j,k,ispec)*(vy-vn*ny)
-          tz=rho_vp_crust_mantle(i,j,k,ispec)*vn*nz+rho_vs_crust_mantle(i,j,k,ispec)*(vz-vn*nz)
+            tx=rho_vp_crust_mantle(i,j,k,ispec)*vn*nx+rho_vs_crust_mantle(i,j,k,ispec)*(vx-vn*nx)
+            ty=rho_vp_crust_mantle(i,j,k,ispec)*vn*ny+rho_vs_crust_mantle(i,j,k,ispec)*(vy-vn*ny)
+            tz=rho_vp_crust_mantle(i,j,k,ispec)*vn*nz+rho_vs_crust_mantle(i,j,k,ispec)*(vz-vn*nz)
 
-          weight=jacobian2D_xmax_crust_mantle(j,k,ispec2D)*wgllwgll_yz(j,k)
+            weight=jacobian2D_xmax_crust_mantle(j,k,ispec2D)*wgllwgll_yz(j,k)
 
-          accel_crust_mantle(1,iglob)=accel_crust_mantle(1,iglob) - tx*weight
-          accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
-          accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
+            accel_crust_mantle(1,iglob)=accel_crust_mantle(1,iglob) - tx*weight
+            accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
+            accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
 
-          if (SIMULATION_TYPE == 3) then
-            b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_xmax_crust_mantle(:,j,k,ispec2D)
-          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-            absorb_xmax_crust_mantle(1,j,k,ispec2D) = tx*weight
-            absorb_xmax_crust_mantle(2,j,k,ispec2D) = ty*weight
-            absorb_xmax_crust_mantle(3,j,k,ispec2D) = tz*weight
-          endif
+            if (SIMULATION_TYPE == 3) then
+              b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_xmax_crust_mantle(:,j,k,ispec2D)
+            else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+              absorb_xmax_crust_mantle(1,j,k,ispec2D) = tx*weight
+              absorb_xmax_crust_mantle(2,j,k,ispec2D) = ty*weight
+              absorb_xmax_crust_mantle(3,j,k,ispec2D) = tz*weight
+            endif
 
+          enddo
         enddo
       enddo
-    enddo
 
+    else
+      ! on GPU
+      if( nspec2D_xmax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
+                                                                absorb_xmax_crust_mantle, &
+                                                                1) ! <= xmin              
+    endif
+
+  
     ! writes absorbing boundary values
     if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmax_crust_mantle > 0 ) then
       call write_abs(1,absorb_xmax_crust_mantle,reclen_xmax_crust_mantle,it)
     endif
-  endif
+    
+  endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB
 
   !   ymin
 
@@ -199,50 +222,60 @@
     call read_abs(2,absorb_ymin_crust_mantle, reclen_ymin_crust_mantle,NSTEP-it+1)
   endif
 
-  do ispec2D=1,nspec2D_ymin_crust_mantle
+  if( .NOT. GPU_MODE ) then
+    ! on CPU
+    do ispec2D=1,nspec2D_ymin_crust_mantle
 
-    ispec=ibelm_ymin_crust_mantle(ispec2D)
+      ispec=ibelm_ymin_crust_mantle(ispec2D)
 
-  ! exclude elements that are not on absorbing edges
-    if(nkmin_eta_crust_mantle(1,ispec2D) == 0 .or. nimin_crust_mantle(1,ispec2D) == 0) cycle
+      ! exclude elements that are not on absorbing edges
+      if(nkmin_eta_crust_mantle(1,ispec2D) == 0 .or. nimin_crust_mantle(1,ispec2D) == 0) cycle
 
-    j=1
-    do k=nkmin_eta_crust_mantle(1,ispec2D),NGLLZ
-      do i=nimin_crust_mantle(1,ispec2D),nimax_crust_mantle(1,ispec2D)
-        iglob=ibool_crust_mantle(i,j,k,ispec)
+      j=1
+      do k=nkmin_eta_crust_mantle(1,ispec2D),NGLLZ
+        do i=nimin_crust_mantle(1,ispec2D),nimax_crust_mantle(1,ispec2D)
+          iglob=ibool_crust_mantle(i,j,k,ispec)
 
-        vx=veloc_crust_mantle(1,iglob)
-        vy=veloc_crust_mantle(2,iglob)
-        vz=veloc_crust_mantle(3,iglob)
+          vx=veloc_crust_mantle(1,iglob)
+          vy=veloc_crust_mantle(2,iglob)
+          vz=veloc_crust_mantle(3,iglob)
 
-        nx=normal_ymin_crust_mantle(1,i,k,ispec2D)
-        ny=normal_ymin_crust_mantle(2,i,k,ispec2D)
-        nz=normal_ymin_crust_mantle(3,i,k,ispec2D)
+          nx=normal_ymin_crust_mantle(1,i,k,ispec2D)
+          ny=normal_ymin_crust_mantle(2,i,k,ispec2D)
+          nz=normal_ymin_crust_mantle(3,i,k,ispec2D)
 
-        vn=vx*nx+vy*ny+vz*nz
+          vn=vx*nx+vy*ny+vz*nz
 
-        tx=rho_vp_crust_mantle(i,j,k,ispec)*vn*nx+rho_vs_crust_mantle(i,j,k,ispec)*(vx-vn*nx)
-        ty=rho_vp_crust_mantle(i,j,k,ispec)*vn*ny+rho_vs_crust_mantle(i,j,k,ispec)*(vy-vn*ny)
-        tz=rho_vp_crust_mantle(i,j,k,ispec)*vn*nz+rho_vs_crust_mantle(i,j,k,ispec)*(vz-vn*nz)
+          tx=rho_vp_crust_mantle(i,j,k,ispec)*vn*nx+rho_vs_crust_mantle(i,j,k,ispec)*(vx-vn*nx)
+          ty=rho_vp_crust_mantle(i,j,k,ispec)*vn*ny+rho_vs_crust_mantle(i,j,k,ispec)*(vy-vn*ny)
+          tz=rho_vp_crust_mantle(i,j,k,ispec)*vn*nz+rho_vs_crust_mantle(i,j,k,ispec)*(vz-vn*nz)
 
-        weight=jacobian2D_ymin_crust_mantle(i,k,ispec2D)*wgllwgll_xz(i,k)
+          weight=jacobian2D_ymin_crust_mantle(i,k,ispec2D)*wgllwgll_xz(i,k)
 
-        accel_crust_mantle(1,iglob)=accel_crust_mantle(1,iglob) - tx*weight
-        accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
-        accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
+          accel_crust_mantle(1,iglob)=accel_crust_mantle(1,iglob) - tx*weight
+          accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
+          accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
 
-        if (SIMULATION_TYPE == 3) then
-          b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_ymin_crust_mantle(:,i,k,ispec2D)
-        else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-          absorb_ymin_crust_mantle(1,i,k,ispec2D) = tx*weight
-          absorb_ymin_crust_mantle(2,i,k,ispec2D) = ty*weight
-          absorb_ymin_crust_mantle(3,i,k,ispec2D) = tz*weight
-        endif
+          if (SIMULATION_TYPE == 3) then
+            b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_ymin_crust_mantle(:,i,k,ispec2D)
+          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            absorb_ymin_crust_mantle(1,i,k,ispec2D) = tx*weight
+            absorb_ymin_crust_mantle(2,i,k,ispec2D) = ty*weight
+            absorb_ymin_crust_mantle(3,i,k,ispec2D) = tz*weight
+          endif
 
+        enddo
       enddo
     enddo
-  enddo
 
+  else
+    ! on GPU
+    if( nspec2D_ymin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
+                                                              absorb_ymin_crust_mantle, &
+                                                              2) ! <= ymin              
+  endif
+
+
   ! writes absorbing boundary values
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymin_crust_mantle > 0 ) then
     call write_abs(2,absorb_ymin_crust_mantle,reclen_ymin_crust_mantle,it)
@@ -257,54 +290,65 @@
     call read_abs(3,absorb_ymax_crust_mantle,reclen_ymax_crust_mantle,NSTEP-it+1)
   endif
 
-  do ispec2D=1,nspec2D_ymax_crust_mantle
+  if( .NOT. GPU_MODE ) then
+    ! on CPU
+    do ispec2D=1,nspec2D_ymax_crust_mantle
 
-    ispec=ibelm_ymax_crust_mantle(ispec2D)
+      ispec=ibelm_ymax_crust_mantle(ispec2D)
 
-  ! exclude elements that are not on absorbing edges
-    if(nkmin_eta_crust_mantle(2,ispec2D) == 0 .or. nimin_crust_mantle(2,ispec2D) == 0) cycle
+    ! exclude elements that are not on absorbing edges
+      if(nkmin_eta_crust_mantle(2,ispec2D) == 0 .or. nimin_crust_mantle(2,ispec2D) == 0) cycle
 
-    j=NGLLY
-    do k=nkmin_eta_crust_mantle(2,ispec2D),NGLLZ
-      do i=nimin_crust_mantle(2,ispec2D),nimax_crust_mantle(2,ispec2D)
-        iglob=ibool_crust_mantle(i,j,k,ispec)
+      j=NGLLY
+      do k=nkmin_eta_crust_mantle(2,ispec2D),NGLLZ
+        do i=nimin_crust_mantle(2,ispec2D),nimax_crust_mantle(2,ispec2D)
+          iglob=ibool_crust_mantle(i,j,k,ispec)
 
-        vx=veloc_crust_mantle(1,iglob)
-        vy=veloc_crust_mantle(2,iglob)
-        vz=veloc_crust_mantle(3,iglob)
+          vx=veloc_crust_mantle(1,iglob)
+          vy=veloc_crust_mantle(2,iglob)
+          vz=veloc_crust_mantle(3,iglob)
 
-        nx=normal_ymax_crust_mantle(1,i,k,ispec2D)
-        ny=normal_ymax_crust_mantle(2,i,k,ispec2D)
-        nz=normal_ymax_crust_mantle(3,i,k,ispec2D)
+          nx=normal_ymax_crust_mantle(1,i,k,ispec2D)
+          ny=normal_ymax_crust_mantle(2,i,k,ispec2D)
+          nz=normal_ymax_crust_mantle(3,i,k,ispec2D)
 
-        vn=vx*nx+vy*ny+vz*nz
+          vn=vx*nx+vy*ny+vz*nz
 
-        tx=rho_vp_crust_mantle(i,j,k,ispec)*vn*nx+rho_vs_crust_mantle(i,j,k,ispec)*(vx-vn*nx)
-        ty=rho_vp_crust_mantle(i,j,k,ispec)*vn*ny+rho_vs_crust_mantle(i,j,k,ispec)*(vy-vn*ny)
-        tz=rho_vp_crust_mantle(i,j,k,ispec)*vn*nz+rho_vs_crust_mantle(i,j,k,ispec)*(vz-vn*nz)
+          tx=rho_vp_crust_mantle(i,j,k,ispec)*vn*nx+rho_vs_crust_mantle(i,j,k,ispec)*(vx-vn*nx)
+          ty=rho_vp_crust_mantle(i,j,k,ispec)*vn*ny+rho_vs_crust_mantle(i,j,k,ispec)*(vy-vn*ny)
+          tz=rho_vp_crust_mantle(i,j,k,ispec)*vn*nz+rho_vs_crust_mantle(i,j,k,ispec)*(vz-vn*nz)
 
-        weight=jacobian2D_ymax_crust_mantle(i,k,ispec2D)*wgllwgll_xz(i,k)
+          weight=jacobian2D_ymax_crust_mantle(i,k,ispec2D)*wgllwgll_xz(i,k)
 
-        accel_crust_mantle(1,iglob)=accel_crust_mantle(1,iglob) - tx*weight
-        accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
-        accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
+          accel_crust_mantle(1,iglob)=accel_crust_mantle(1,iglob) - tx*weight
+          accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
+          accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
 
-        if (SIMULATION_TYPE == 3) then
-          b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_ymax_crust_mantle(:,i,k,ispec2D)
-        else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-          absorb_ymax_crust_mantle(1,i,k,ispec2D) = tx*weight
-          absorb_ymax_crust_mantle(2,i,k,ispec2D) = ty*weight
-          absorb_ymax_crust_mantle(3,i,k,ispec2D) = tz*weight
-        endif
+          if (SIMULATION_TYPE == 3) then
+            b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_ymax_crust_mantle(:,i,k,ispec2D)
+          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            absorb_ymax_crust_mantle(1,i,k,ispec2D) = tx*weight
+            absorb_ymax_crust_mantle(2,i,k,ispec2D) = ty*weight
+            absorb_ymax_crust_mantle(3,i,k,ispec2D) = tz*weight
+          endif
 
+        enddo
       enddo
     enddo
-  enddo
 
+  else
+    ! on GPU
+    if( nspec2D_ymax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
+                                                              absorb_ymax_crust_mantle, &
+                                                              3) ! <= ymax              
+  endif
+
   ! writes absorbing boundary values
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymax_crust_mantle > 0 ) then
     call write_abs(3,absorb_ymax_crust_mantle,reclen_ymax_crust_mantle,it)
   endif
 
+  if( GPU_MODE ) call load_CPU_elastic()   
+
   end subroutine compute_stacey_crust_mantle
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2012-02-21 00:33:56 UTC (rev 19657)
@@ -32,9 +32,10 @@
 
   use specfem_par,only: &
     ichunk,SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
-    wgllwgll_xz,wgllwgll_yz,wgllwgll_xy, &
-    NSPEC2D_BOTTOM
+    wgllwgll_xz,wgllwgll_yz,wgllwgll_xy    
 
+  use specfem_par,only: GPU_MODE,Mesh_pointer
+
   use specfem_par_outercore,only: &
     veloc_outer_core,accel_outer_core,b_accel_outer_core, &
     ibool_outer_core, &
@@ -42,7 +43,7 @@
     jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core, &
     jacobian2D_bottom_outer_core, &
     nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-    nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
+    nspec2D_ymin_outer_core,nspec2D_ymax_outer_core,nspec2D_zmin_outer_core, &
     vp_outer_core, &
     nimin_outer_core,nimax_outer_core,nkmin_eta_outer_core, &
     njmin_outer_core,njmax_outer_core,nkmin_xi_outer_core, &
@@ -69,6 +70,8 @@
   !           file access (by process rank modulo 8) showed that the following,
   !           simple approach is still fastest. (assuming that files are accessed on a local scratch disk)
 
+  if( GPU_MODE ) call load_GPU_acoustic()   
+
   !   xmin
   ! if two chunks exclude this face for one of them
   if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
@@ -80,33 +83,42 @@
       call read_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,NSTEP-it+1)
     endif
 
-    do ispec2D=1,nspec2D_xmin_outer_core
+    if( .NOT. GPU_MODE) then
+      ! on CPU
+      do ispec2D=1,nspec2D_xmin_outer_core
 
-      ispec=ibelm_xmin_outer_core(ispec2D)
+        ispec=ibelm_xmin_outer_core(ispec2D)
 
-      ! exclude elements that are not on absorbing edges
-      if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
 
-      i=1
-      do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
-        do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
-          iglob=ibool_outer_core(i,j,k,ispec)
+        i=1
+        do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
+          do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
+            iglob=ibool_outer_core(i,j,k,ispec)
 
-          sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+            sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-          weight = jacobian2D_xmin_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
+            weight = jacobian2D_xmin_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
 
-          accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+            accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-          if (SIMULATION_TYPE == 3) then
-            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmin_outer_core(j,k,ispec2D)
-          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-            absorb_xmin_outer_core(j,k,ispec2D) = weight*sn
-          endif
+            if (SIMULATION_TYPE == 3) then
+              b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmin_outer_core(j,k,ispec2D)
+            else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+              absorb_xmin_outer_core(j,k,ispec2D) = weight*sn
+            endif
+          enddo
         enddo
       enddo
-    enddo
 
+    else
+      ! on GPU
+      if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
+                                                                absorb_xmin_outer_core, &
+                                                                4) ! <= xmin              
+    endif
+
     ! writes absorbing boundary values
     if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmin_outer_core > 0 ) then
       call write_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,it)
@@ -122,34 +134,45 @@
       call read_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,NSTEP-it+1)
     endif
 
-    do ispec2D=1,nspec2D_xmax_outer_core
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D=1,nspec2D_xmax_outer_core
 
-      ispec=ibelm_xmax_outer_core(ispec2D)
+        ispec=ibelm_xmax_outer_core(ispec2D)
 
-      ! exclude elements that are not on absorbing edges
-      if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
 
-      i=NGLLX
-      do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
-        do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
-          iglob=ibool_outer_core(i,j,k,ispec)
+        i=NGLLX
+        do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
+          do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
+            iglob=ibool_outer_core(i,j,k,ispec)
 
-          sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+            sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-          weight = jacobian2D_xmax_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
+            weight = jacobian2D_xmax_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
 
-          accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+            accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-          if (SIMULATION_TYPE == 3) then
-            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmax_outer_core(j,k,ispec2D)
-          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-            absorb_xmax_outer_core(j,k,ispec2D) = weight*sn
-          endif
+            if (SIMULATION_TYPE == 3) then
+              b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmax_outer_core(j,k,ispec2D)
+            else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+              absorb_xmax_outer_core(j,k,ispec2D) = weight*sn
+            endif
 
+          enddo
         enddo
       enddo
-    enddo
 
+    else
+      ! on GPU
+      if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
+                                                                absorb_xmax_outer_core, &
+                                                                5) ! <= xmax
+    endif
+
+
+
     if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmax_outer_core > 0 ) then
       call write_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,it)
     endif
@@ -161,34 +184,43 @@
     call read_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,NSTEP-it+1)
   endif
 
-  do ispec2D=1,nspec2D_ymin_outer_core
+  if( .NOT. GPU_MODE ) then
+    ! on CPU
+    do ispec2D=1,nspec2D_ymin_outer_core
 
-    ispec=ibelm_ymin_outer_core(ispec2D)
+      ispec=ibelm_ymin_outer_core(ispec2D)
 
-    ! exclude elements that are not on absorbing edges
-    if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
+      ! exclude elements that are not on absorbing edges
+      if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
 
-    j=1
-    do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
-      do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
-        iglob=ibool_outer_core(i,j,k,ispec)
+      j=1
+      do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
+        do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
+          iglob=ibool_outer_core(i,j,k,ispec)
 
-        sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+          sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-        weight=jacobian2D_ymin_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
+          weight=jacobian2D_ymin_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
 
-        accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+          accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-        if (SIMULATION_TYPE == 3) then
-          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymin_outer_core(i,k,ispec2D)
-        else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-          absorb_ymin_outer_core(i,k,ispec2D) = weight*sn
-        endif
+          if (SIMULATION_TYPE == 3) then
+            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymin_outer_core(i,k,ispec2D)
+          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            absorb_ymin_outer_core(i,k,ispec2D) = weight*sn
+          endif
 
+        enddo
       enddo
     enddo
-  enddo
 
+  else
+    ! on GPU
+    if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
+                                                              absorb_ymin_outer_core, &
+                                                              6) ! <= ymin
+  endif
+
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymin_outer_core > 0 ) then
     call write_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,it)
   endif
@@ -198,70 +230,92 @@
     call read_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,NSTEP-it+1)
   endif
 
-  do ispec2D=1,nspec2D_ymax_outer_core
+  if( .NOT. GPU_MODE ) then
+    ! on CPU
+    do ispec2D=1,nspec2D_ymax_outer_core
 
-    ispec=ibelm_ymax_outer_core(ispec2D)
+      ispec=ibelm_ymax_outer_core(ispec2D)
 
-    ! exclude elements that are not on absorbing edges
-    if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
+      ! exclude elements that are not on absorbing edges
+      if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
 
-    j=NGLLY
-    do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
-      do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
-        iglob=ibool_outer_core(i,j,k,ispec)
+      j=NGLLY
+      do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
+        do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
+          iglob=ibool_outer_core(i,j,k,ispec)
 
-        sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+          sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-        weight=jacobian2D_ymax_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
+          weight=jacobian2D_ymax_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
 
-        accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+          accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-        if (SIMULATION_TYPE == 3) then
-          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymax_outer_core(i,k,ispec2D)
-        else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-          absorb_ymax_outer_core(i,k,ispec2D) = weight*sn
-        endif
+          if (SIMULATION_TYPE == 3) then
+            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymax_outer_core(i,k,ispec2D)
+          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            absorb_ymax_outer_core(i,k,ispec2D) = weight*sn
+          endif
 
+        enddo
       enddo
     enddo
-  enddo
 
+  else
+    ! on GPU
+    if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
+                                                              absorb_ymax_outer_core, &
+                                                              7) ! <= ymax
+  endif
+
+
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymax_outer_core > 0 ) then
     call write_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,it)
   endif
 
   ! for surface elements exactly on the ICB
-  if (SIMULATION_TYPE == 3 .and. NSPEC2D_BOTTOM(IREGION_OUTER_CORE)> 0)  then
+  if (SIMULATION_TYPE == 3 .and. nspec2D_zmin_outer_core > 0)  then
     call read_abs(8,absorb_zmin_outer_core,reclen_zmin,NSTEP-it+1)
   endif
 
-  do ispec2D = 1,NSPEC2D_BOTTOM(IREGION_OUTER_CORE)
+  if( .NOT. GPU_MODE ) then
+    ! on CPU
+    do ispec2D = 1,nspec2D_zmin_outer_core
 
-    ispec = ibelm_bottom_outer_core(ispec2D)
+      ispec = ibelm_bottom_outer_core(ispec2D)
 
-    k = 1
-    do j = 1,NGLLY
-      do i = 1,NGLLX
-        iglob = ibool_outer_core(i,j,k,ispec)
+      k = 1
+      do j = 1,NGLLY
+        do i = 1,NGLLX
+          iglob = ibool_outer_core(i,j,k,ispec)
 
-        sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+          sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-        weight = jacobian2D_bottom_outer_core(i,j,ispec2D)*wgllwgll_xy(i,j)
+          weight = jacobian2D_bottom_outer_core(i,j,ispec2D)*wgllwgll_xy(i,j)
 
-        accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+          accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-        if (SIMULATION_TYPE == 3) then
-          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_zmin_outer_core(i,j,ispec2D)
-        else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-          absorb_zmin_outer_core(i,j,ispec2D) = weight*sn
-        endif
+          if (SIMULATION_TYPE == 3) then
+            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_zmin_outer_core(i,j,ispec2D)
+          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            absorb_zmin_outer_core(i,j,ispec2D) = weight*sn
+          endif
 
+        enddo
       enddo
     enddo
-  enddo
 
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. NSPEC2D_BOTTOM(IREGION_OUTER_CORE) > 0 ) then
+  else
+    ! on GPU
+    if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
+                                                              absorb_zmin_outer_core, &
+                                                              8) ! <= zmin
+  endif
+
+
+  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_zmin_outer_core > 0 ) then
     call write_abs(8,absorb_zmin_outer_core,reclen_zmin,it)
   endif
 
+  if( GPU_MODE) call load_CPU_acoustic()
+
   end subroutine compute_stacey_outer_core

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90	2012-02-21 00:33:56 UTC (rev 19657)
@@ -81,7 +81,7 @@
       call close_file_abs(7)
     endif
 
-    if (NSPEC2D_BOTTOM(IREGION_OUTER_CORE) > 0 .and. (SIMULATION_TYPE == 3 &
+    if (nspec2D_zmin_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
       .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
       call close_file_abs(8)
     endif

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-02-21 00:33:56 UTC (rev 19657)
@@ -1059,14 +1059,14 @@
   call MPI_REDUCE(ncuda_devices,ncuda_devices_max,1,MPI_INTEGER,MPI_MAX,0,MPI_COMM_WORLD,ier)
 
   ! prepares general fields on GPU
-  call prepare_constants_device(Mesh_pointer,NGLLX, &
+  call prepare_constants_device(Mesh_pointer,myrank,NGLLX, &
                                   hprime_xx, hprime_yy, hprime_zz, &
                                   hprimewgll_xx, hprimewgll_yy, hprimewgll_zz, &
                                   wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
                                   NSOURCES, nsources_local, &
-                                  sourcearrays, islice_selected_source, ispec_selected_source, &
-                                  number_receiver_global, ispec_selected_rec, &
-                                  nrec, nrec_local, &
+                                  sourcearrays,islice_selected_source,ispec_selected_source, &
+                                  number_receiver_global,islice_selected_rec,ispec_selected_rec, &
+                                  nrec, nrec_local, nadj_rec_local, &
                                   NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
                                   NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
                                   NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
@@ -1174,7 +1174,42 @@
   endif
   call sync_all()
 
+  ! prepares absorbing arrays
+  if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) then
+    if(myrank == 0 ) write(IMAIN,*) "  loading absorbing boundaries"
 
+    call prepare_fields_absorb_device(Mesh_pointer, &
+                                    nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
+                                    nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
+                                    NSPEC2DMAX_XMIN_XMAX_CM,NSPEC2DMAX_YMIN_YMAX_CM, &
+                                    nimin_crust_mantle,nimax_crust_mantle, &
+                                    njmin_crust_mantle,njmax_crust_mantle, &
+                                    nkmin_xi_crust_mantle,nkmin_eta_crust_mantle, &
+                                    ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle, &
+                                    ibelm_ymin_crust_mantle,ibelm_ymax_crust_mantle, &
+                                    normal_xmin_crust_mantle,normal_xmax_crust_mantle, &
+                                    normal_ymin_crust_mantle,normal_ymax_crust_mantle, &
+                                    jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle, &
+                                    jacobian2D_ymin_crust_mantle,jacobian2D_ymax_crust_mantle, &
+                                    rho_vp_crust_mantle,rho_vs_crust_mantle,  &
+                                    nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
+                                    nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
+                                    nspec2D_zmin_outer_core, &
+                                    NSPEC2DMAX_XMIN_XMAX_OC,NSPEC2DMAX_YMIN_YMAX_OC, &
+                                    nimin_outer_core,nimax_outer_core, &
+                                    njmin_outer_core,njmax_outer_core, &
+                                    nkmin_xi_outer_core,nkmin_eta_outer_core, &
+                                    ibelm_xmin_outer_core,ibelm_xmax_outer_core, &
+                                    ibelm_ymin_outer_core,ibelm_ymax_outer_core, &
+                                    ibelm_bottom_outer_core, &
+                                    jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core, &
+                                    jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core, &
+                                    jacobian2D_bottom_outer_core, &
+                                    vp_outer_core)
+    
+  endif
+  call sync_all()
+  
   ! prepares MPI interfaces
   if(myrank == 0 ) write(IMAIN,*) "  loading mpi interfaces"
   

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2012-02-21 00:33:56 UTC (rev 19657)
@@ -374,6 +374,8 @@
   read(27) njunk1
   read(27) njunk2
 
+  nspec2D_zmin_outer_core = NSPEC2D_BOTTOM(IREGION_OUTER_CORE)
+
   read(27) ibelm_xmin_outer_core
   read(27) ibelm_xmax_outer_core
   read(27) ibelm_ymin_outer_core
@@ -1867,6 +1869,8 @@
   ! local parameters
   integer(kind=8) :: filesize
   integer :: ier
+  integer :: nabs_xmin_cm,nabs_xmax_cm,nabs_ymin_cm,nabs_ymax_cm
+  integer :: nabs_xmin_oc,nabs_xmax_oc,nabs_ymin_oc,nabs_ymax_oc,nabs_zmin_oc
 
   ! sets up absorbing boundary buffer arrays
 
@@ -1952,9 +1956,9 @@
   allocate(absorb_ymax_outer_core(NGLLX,NGLLZ,nabs_ymax_oc),stat=ier)
   if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb ymax')
 
-  if (NSPEC2D_BOTTOM(IREGION_OUTER_CORE) > 0 .and. &
-     (SIMULATION_TYPE == 3 .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-    nabs_zmin_oc = NSPEC2D_BOTTOM(IREGION_OUTER_CORE)
+  if (nspec2D_zmin_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
+    .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+    nabs_zmin_oc = nspec2D_zmin_outer_core
   else
     nabs_zmin_oc = 1
   endif
@@ -2215,11 +2219,11 @@
     endif
   endif
 
-  if (NSPEC2D_BOTTOM(IREGION_OUTER_CORE) > 0 .and. &
-     (SIMULATION_TYPE == 3 .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD)))then
+  if (nspec2D_zmin_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
+    .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD)))then
 
     ! size of single record
-    reclen_zmin = CUSTOM_REAL * (NGLLX * NGLLY * NSPEC2D_BOTTOM(IREGION_OUTER_CORE))
+    reclen_zmin = CUSTOM_REAL * (NGLLX * NGLLY * nspec2D_zmin_outer_core)
 
     ! total file size
     filesize = reclen_zmin

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90	2012-02-21 00:33:56 UTC (rev 19657)
@@ -305,6 +305,9 @@
     enddo
   endif
 
+  ! counter for adjoint receiver stations in local slice, used to allocate adjoint source arrays
+  nadj_rec_local = 0
+  
   ! counts receivers for adjoint simulations
   if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
     ! by Ebru
@@ -313,8 +316,6 @@
     comp(2) = bic(1:2)//'E'
     comp(3) = bic(1:2)//'Z'
 
-    ! counter for adjoint receiver stations in local slice, used to allocate adjoint source arrays
-    nadj_rec_local = 0
     ! temporary counter to check if any files are found at all
     nadj_files_found = 0
     do irec = 1,nrec
@@ -444,6 +445,7 @@
   
   ! allocates source arrays
   if (SIMULATION_TYPE == 1  .or. SIMULATION_TYPE == 3) then
+    ! source interpolated on all GLL points in source element
     allocate(sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES),stat=ier)
     if( ier /= 0 ) call exit_MPI(myrank,'error allocating sourcearrays')
 
@@ -460,6 +462,7 @@
 
   ! adjoint source arrays
   if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
+    ! adjoint source buffer length
     NSTEP_SUB_ADJ = ceiling( dble(NSTEP)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )
     allocate(iadj_vec(NSTEP),stat=ier)
     if( ier /= 0 ) call exit_MPI(myrank,'error allocating iadj_vec')
@@ -492,40 +495,44 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine setup_sources_receivers_srcarr(NSOURCES,myrank, &
-                      ispec_selected_source,islice_selected_source, &
-                      xi_source,eta_source,gamma_source, &
-                      Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
-                      xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-                      etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-                      gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-                      xigll,yigll,zigll,sourcearrays)
+  subroutine setup_sources_receivers_srcarr()
+!                      NSOURCES,myrank, &
+!                      ispec_selected_source,islice_selected_source, &
+!                      xi_source,eta_source,gamma_source, &
+!                      Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
+!                      xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+!                      etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+!                      gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+!                      xigll,yigll,zigll,sourcearrays)
 
+  use specfem_par
+  use specfem_par_crustmantle
+  
   implicit none
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
+!  include "constants.h"
+!  include "OUTPUT_FILES/values_from_mesher.h"
+!
+!  integer NSOURCES,myrank
+!
+!  integer, dimension(NSOURCES) :: islice_selected_source,ispec_selected_source
+!  double precision, dimension(NSOURCES) :: xi_source,eta_source,gamma_source
+!  double precision, dimension(NSOURCES) :: Mxx,Myy,Mzz,Mxy,Mxz,Myz
+!
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
+!        xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle,&
+!        etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+!        gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle
+!
+!  double precision, dimension(NGLLX) :: xigll
+!  double precision, dimension(NGLLY) :: yigll
+!  double precision, dimension(NGLLZ) :: zigll
+!
+!  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES) :: sourcearrays
 
-  integer NSOURCES,myrank
 
-  integer, dimension(NSOURCES) :: islice_selected_source,ispec_selected_source
-  double precision, dimension(NSOURCES) :: xi_source,eta_source,gamma_source
-  double precision, dimension(NSOURCES) :: Mxx,Myy,Mzz,Mxy,Mxz,Myz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
-        xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle,&
-        etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-        gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle
-
-  double precision, dimension(NGLLX) :: xigll
-  double precision, dimension(NGLLY) :: yigll
-  double precision, dimension(NGLLZ) :: zigll
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES) :: sourcearrays
-
-
   ! local parameters
-  integer :: isource
+  integer :: isource,iglob,ispec,i,j,k
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sourcearray
 
   do isource = 1,NSOURCES
@@ -544,8 +551,30 @@
              gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
              xigll,yigll,zigll,NSPEC_CRUST_MANTLE)
 
+      ! point forces, initializes sourcearray, used for simplified CUDA routines
+      if(USE_FORCE_POINT_SOURCE) then
+        ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
+        iglob = ibool_crust_mantle(nint(xi_source(isource)), &
+                                   nint(eta_source(isource)), &
+                                   nint(gamma_source(isource)), &
+                                   ispec_selected_source(isource))
+        ! sets sourcearrays
+        sourcearray(:,:,:,:) = 0.0
+        ispec = ispec_selected_source(isource)
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              if( ibool_crust_mantle(i,j,k,ispec) == iglob ) then
+                ! elastic source components
+                sourcearray(:,i,j,k) = nu_source(COMPONENT_FORCE_SOURCE,:,isource)
+              endif
+            enddo
+          enddo
+        enddo
+      endif
+
+      ! stores source excitations
       sourcearrays(:,:,:,:,isource) = sourcearray(:,:,:,:)
-
     endif
   enddo
 
@@ -621,7 +650,7 @@
     ! e.g.: first block 1 has iadjsrc_len = 1000 with start at 2001 and end at 3000
     !         so iadj_vec(1) = 1000 - 0, iadj_vec(2) = 1000 - 1, ..., to iadj_vec(1000) = 1000 - 999 = 1
     !         then for block 2, iadjsrc_len = 1000 with start at 1001 and end at 2000
-    !         so iadj_vec(1001) = 1000 - 0, iad_vec(1002) = 1000 - 1, .. and so on again down to 1
+    !         so iadj_vec(1001) = 1000 - 0, iadj_vec(1002) = 1000 - 1, .. and so on again down to 1
     !         then block 3 and your guess is right now... iadj_vec(2001) to iadj_vec(3000) is 1000 down to 1. :)
     iadj_vec(it) = iadjsrc_len(it_sub_adj) - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC)
   enddo

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	2012-02-20 04:22:33 UTC (rev 19656)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	2012-02-21 00:33:56 UTC (rev 19657)
@@ -133,7 +133,6 @@
   double precision, dimension(:), allocatable :: xi_source,eta_source,gamma_source
   double precision, dimension(:), allocatable :: tshift_cmt,hdur,hdur_gaussian
   double precision, dimension(:), allocatable :: theta_source,phi_source
-  double precision, external :: comp_source_time_function
   double precision :: t0
 
   !-----------------------------------------------------------------
@@ -504,8 +503,6 @@
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: absorb_xmin_crust_mantle, &
     absorb_xmax_crust_mantle, absorb_ymin_crust_mantle, absorb_ymax_crust_mantle
 
-  integer :: nabs_xmin_cm,nabs_xmax_cm,nabs_ymin_cm,nabs_ymax_cm
-
   integer :: reclen_xmin_crust_mantle, reclen_xmax_crust_mantle, &
             reclen_ymin_crust_mantle,reclen_ymax_crust_mantle
 
@@ -722,14 +719,14 @@
   ! Stacey
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_STACEY) :: vp_outer_core
   integer :: nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-            nspec2D_ymin_outer_core,nspec2D_ymax_outer_core
+            nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
+            nspec2D_zmin_outer_core
   integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_OC) :: nimin_outer_core,nimax_outer_core,nkmin_eta_outer_core
   integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_OC) :: njmin_outer_core,njmax_outer_core,nkmin_xi_outer_core
 
   real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: absorb_xmin_outer_core, &
      absorb_xmax_outer_core, absorb_ymin_outer_core, absorb_ymax_outer_core, &
      absorb_zmin_outer_core
-  integer :: nabs_xmin_oc,nabs_xmax_oc,nabs_ymin_oc,nabs_ymax_oc,nabs_zmin_oc
 
   integer :: reclen_xmin_outer_core, reclen_xmax_outer_core, &
             reclen_ymin_outer_core, reclen_ymax_outer_core