[cig-commits] r19152 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src: cuda generate_databases meshfem3D specfem3D

Sat Nov 5 19:02:37 PDT 2011

Author: danielpeter
Date: 2011-11-05 19:02:36 -0700 (Sat, 05 Nov 2011)
New Revision: 19152

Added:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
Removed:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
Modified:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_aniso.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
Log:
updates declarations; re-adds anisotropy

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================

--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -47,14 +47,14 @@
 
 extern "C"
 void FC_FUNC_(check_max_norm_displ_gpu,
-              CHECK_MAX_NORM_DISPL_GPU)(int* size, float* displ,long* Mesh_pointer_f,int* announceID) {
+              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {
 
 TRACE("check_max_norm_displ_gpu");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  cudaMemcpy(displ, mp->d_displ,*size*sizeof(float),cudaMemcpyDeviceToHost);
-  float maxnorm=0;
+  cudaMemcpy(displ, mp->d_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+  realw maxnorm=0;
 
   for(int i=0;i<*size;i++) {
     maxnorm = MAX(maxnorm,fabsf(displ[i]));
@@ -66,13 +66,13 @@
 
 extern "C"
 void FC_FUNC_(check_max_norm_vector,
-              CHECK_MAX_NORM_VECTOR)(int* size, float* vector1, int* announceID) {
+              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {
 
 TRACE("check_max_norm_vector");
 
   int procid;
   MPI_Comm_rank(MPI_COMM_WORLD,&procid);
-  float maxnorm=0;
+  realw maxnorm=0;
   int maxloc;
   for(int i=0;i<*size;i++) {
     if(maxnorm<fabsf(vector1[i])) {
@@ -87,11 +87,11 @@
 
 extern "C"
 void FC_FUNC_(check_max_norm_displ,
-              CHECK_MAX_NORM_DISPL)(int* size, float* displ, int* announceID) {
+              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {
 
 TRACE("check_max_norm_displ");
 
-  float maxnorm=0;
+  realw maxnorm=0;
 
   for(int i=0;i<*size;i++) {
     maxnorm = MAX(maxnorm,fabsf(displ[i]));
@@ -103,19 +103,19 @@
 
 extern "C"
 void FC_FUNC_(check_max_norm_b_displ_gpu,
-              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, float* b_displ,long* Mesh_pointer_f,int* announceID) {
+              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {
 
 TRACE("check_max_norm_b_displ_gpu");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  float* b_accel = (float*)malloc(*size*sizeof(float));
+  realw* b_accel = (realw*)malloc(*size*sizeof(realw));
 
-  cudaMemcpy(b_displ, mp->d_b_displ,*size*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_displ, mp->d_b_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
 
-  float maxnorm=0;
-  float maxnorm_accel=0;
+  realw maxnorm=0;
+  realw maxnorm_accel=0;
 
   for(int i=0;i<*size;i++) {
     maxnorm = MAX(maxnorm,fabsf(b_displ[i]));
@@ -130,15 +130,15 @@
 
 extern "C"
 void FC_FUNC_(check_max_norm_b_accel_gpu,
-              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, float* b_accel,long* Mesh_pointer_f,int* announceID) {
+              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {
 
 TRACE("check_max_norm_b_accel_gpu");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
 
-  float maxnorm=0;
+  realw maxnorm=0;
 
   for(int i=0;i<*size;i++) {
     maxnorm = MAX(maxnorm,fabsf(b_accel[i]));
@@ -150,15 +150,15 @@
 
 extern "C"
 void FC_FUNC_(check_max_norm_b_veloc_gpu,
-              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, float* b_veloc,long* Mesh_pointer_f,int* announceID) {
+              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {
 
 TRACE("check_max_norm_b_veloc_gpu");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  cudaMemcpy(b_veloc, mp->d_b_veloc,*size*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_veloc, mp->d_b_veloc,*size*sizeof(realw),cudaMemcpyDeviceToHost);
 
-  float maxnorm=0;
+  realw maxnorm=0;
 
   for(int i=0;i<*size;i++) {
     maxnorm = MAX(maxnorm,fabsf(b_veloc[i]));
@@ -170,11 +170,11 @@
 
 extern "C"
 void FC_FUNC_(check_max_norm_b_displ,
-              CHECK_MAX_NORM_B_DISPL)(int* size, float* b_displ,int* announceID) {
+              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {
 
 TRACE("check_max_norm_b_displ");
 
-  float maxnorm=0;
+  realw maxnorm=0;
 
   for(int i=0;i<*size;i++) {
     maxnorm = MAX(maxnorm,fabsf(b_displ[i]));
@@ -186,11 +186,11 @@
 
 extern "C"
 void FC_FUNC_(check_max_norm_b_accel,
-              CHECK_MAX_NORM_B_ACCEL)(int* size, float* b_accel,int* announceID) {
+              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {
 
 TRACE("check_max_norm_b_accel");
 
-  float maxnorm=0;
+  realw maxnorm=0;
 
   for(int i=0;i<*size;i++) {
     maxnorm = MAX(maxnorm,fabsf(b_accel[i]));
@@ -202,7 +202,7 @@
 
 extern "C"
 void FC_FUNC_(check_error_vectors,
-              CHECK_ERROR_VECTORS)(int* sizef, float* vector1,float* vector2) {
+              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {
 
 TRACE("check_error_vectors");
 
@@ -256,9 +256,9 @@
   MPI_Comm_rank(MPI_COMM_WORLD,&procid);
   int size = *sizef;
   int it = *itf;
-  float* accel_cpy = (float*)malloc(size*sizeof(float));
-  cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(float),cudaMemcpyDeviceToHost);
-  float maxval=0;
+  realw* accel_cpy = (realw*)malloc(size*sizeof(realw));
+  cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  realw maxval=0;
   for(int i=0;i<size;++i) {
     maxval = MAX(maxval,accel_cpy[i]);
   }
@@ -272,10 +272,10 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void get_maximum_kernel(float* array, int size, float* d_max){
+__global__ void get_maximum_kernel(realw* array, int size, realw* d_max){
 
   /* simplest version: uses only 1 thread
-   float max;
+   realw max;
    max = 0;
    // finds maximum value in array
    if( size > 0 ){
@@ -288,7 +288,7 @@
    */
 
   // reduction example:
-  __shared__ float sdata[256] ;
+  __shared__ realw sdata[256] ;
 
   // load shared mem
   unsigned int tid = threadIdx.x;
@@ -320,7 +320,7 @@
 
 extern "C"
 void FC_FUNC_(get_norm_acoustic_from_device,
-              GET_NORM_ACOUSTIC_FROM_DEVICE)(float* norm,
+              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                   long* Mesh_pointer_f,
                                                   int* SIMULATION_TYPE) {
 
@@ -328,19 +328,17 @@
   //double start_time = get_time();
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  float max;
-  float *d_max;
+  realw max;
+  realw *d_max;
 
-
-
   max = 0;
 
   /* way 1 : timing Elapsed time: 8.464813e-03
-   float* h_array;
-   h_array = (float*)calloc(mp->NGLOB_AB,sizeof(float));
+   realw* h_array;
+   h_array = (realw*)calloc(mp->NGLOB_AB,sizeof(realw));
 
    print_CUDA_error_if_any(cudaMemcpy(h_array,mp->d_potential_dot_dot_acoustic,
-   sizeof(float)*(mp->NGLOB_AB),cudaMemcpyDeviceToHost),131);
+   sizeof(realw)*(mp->NGLOB_AB),cudaMemcpyDeviceToHost),131);
 
    // finds maximum value in array
    max = h_array[0];
@@ -352,7 +350,7 @@
 
   /* way 2: timing Elapsed time: 8.818102e-02
    // launch simple kernel
-   cudaMalloc((void**)&d_max,sizeof(float));
+   cudaMalloc((void**)&d_max,sizeof(realw));
 
    dim3 grid(1,1);
    dim3 threads(1,1,1);
@@ -360,21 +358,21 @@
    get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
    mp->NGLOB_AB,
    d_max);
-   print_CUDA_error_if_any(cudaMemcpy(&max,d_max, sizeof(float), cudaMemcpyDeviceToHost),222);
+   print_CUDA_error_if_any(cudaMemcpy(&max,d_max, sizeof(realw), cudaMemcpyDeviceToHost),222);
 
    cudaFree(d_max);
    */
 
   // way 2 b: timing Elapsed time: 1.236916e-03
   // launch simple reduction kernel
-  float* h_max;
+  realw* h_max;
   int blocksize = 256;
 
   int num_blocks_x = ceil(mp->NGLOB_AB/blocksize);
   //printf("num_blocks_x %i \n",num_blocks_x);
 
-  h_max = (float*) calloc(num_blocks_x,sizeof(float));
-  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(float));
+  h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
 
   dim3 grid(num_blocks_x,1);
   dim3 threads(blocksize,1,1);
@@ -391,7 +389,7 @@
                                          d_max);
   }
 
-  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(float),cudaMemcpyDeviceToHost),222);
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
 
   // determines max for all blocks
   max = h_max[0];
@@ -417,14 +415,15 @@
    //      precision vector x
    int incr = 1;
    int imax = 0;
-   imax = cublasIsamax(mp->NGLOB_AB,(float*)mp->d_potential_dot_dot_acoustic, incr);
+   imax = cublasIsamax(mp->NGLOB_AB,(realw*)mp->d_potential_dot_dot_acoustic, incr);
    status= cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! CUBLAS error in cublasIsamax\n");
    exit(1);
    }
 
-   print_CUDA_error_if_any(cudaMemcpy(&max,&(mp->d_potential_dot_dot_acoustic[imax]), sizeof(float), cudaMemcpyDeviceToHost),222);
+   print_CUDA_error_if_any(cudaMemcpy(&max,&(mp->d_potential_dot_dot_acoustic[imax]),
+                      sizeof(realw), cudaMemcpyDeviceToHost),222);
 
    printf("maximum %i %i %f \n",mp->NGLOB_AB,imax,max);
 
@@ -453,10 +452,10 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void get_maximum_vector_kernel(float* array, int size, float* d_max){
+__global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
 
   // reduction example:
-  __shared__ float sdata[256] ;
+  __shared__ realw sdata[256] ;
 
   // load shared mem
   unsigned int tid = threadIdx.x;
@@ -490,7 +489,7 @@
 
 extern "C"
 void FC_FUNC_(get_norm_elastic_from_device,
-              GET_NORM_ELASTIC_FROM_DEVICE)(float* norm,
+              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
                                                  long* Mesh_pointer_f,
                                                  int* SIMULATION_TYPE) {
 
@@ -498,20 +497,20 @@
   //double start_time = get_time();
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  float max;
-  float *d_max;
+  realw max;
+  realw *d_max;
 
   max = 0;
 
   // launch simple reduction kernel
-  float* h_max;
+  realw* h_max;
   int blocksize = 256;
 
   int num_blocks_x = ceil(mp->NGLOB_AB/blocksize);
   //printf("num_blocks_x %i \n",num_blocks_x);
 
-  h_max = (float*) calloc(num_blocks_x,sizeof(float));
-  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(float));
+  h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
 
   dim3 grid(num_blocks_x,1);
   dim3 threads(blocksize,1,1);
@@ -528,7 +527,7 @@
                                                 d_max);
   }
 
-  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(float),cudaMemcpyDeviceToHost),222);
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
 
   // determines max for all blocks
   max = h_max[0];

Added: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -0,0 +1,370 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+// #include "epik_user.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_add_sources_acoustic_kernel(realw* potential_dot_dot_acoustic, // accumulated into via atomicAdd
+                                                    int* ibool, // element-local (i,j,k,ispec) -> global point index; stored value is decremented by 1 before use
+                                                    int* ispec_is_inner, // per-element flag compared against phase_is_inner
+                                                    int phase_is_inner, // only elements whose flag equals this value are processed
+                                                    realw* sourcearrays, // read at component 0 via INDEX5(NSOURCES,3,5,5, isource,0,i,j,k)
+                                                    double* stf_pre_compute, // one source time function value per source (cast to realw)
+                                                    int myrank, // this process' rank; a source is skipped unless its slice matches
+                                                    int* islice_selected_source, // per-source slice/rank owning the source
+                                                    int* ispec_selected_source, // per-source element index; decremented by 1 before use (presumably 1-based from Fortran)
+                                                    int* ispec_is_acoustic, // per-element flag; non-zero selects the element
+                                                    realw* kappastore, // per-point value the contribution is divided by
+                                                    int NSOURCES) { // number of sources; also the bound checked against isource
+  int i = threadIdx.x; // thread coordinates select the (i,j,k) point inside the element
+  int j = threadIdx.y;
+  int k = threadIdx.z;
+
+  int isource  = blockIdx.x + gridDim.x*blockIdx.y; // one block per source, flattened over a 2D grid
+
+  int ispec;
+  int iglob;
+  realw stf;
+  realw kappal;
+
+  if( isource < NSOURCES ){ // the 2D grid may contain more blocks than sources
+
+    if(myrank == islice_selected_source[isource]) { // source belongs to this rank
+
+      ispec = ispec_selected_source[isource]-1;
+
+      if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) { // element is in the requested phase and flagged acoustic
+
+        stf = (realw) stf_pre_compute[isource];
+        iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+        kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+
+        atomicAdd(&potential_dot_dot_acoustic[iglob], // atomic update; the plain += form is kept below for reference
+                  -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal);
+
+        // non-atomic equivalent of the update above (disabled):
+        //      potential_dot_dot_acoustic[iglob] += -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal;
+      }
+    }
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C" // Fortran-callable wrapper: uploads the source time function values and launches compute_add_sources_acoustic_kernel
+void FC_FUNC_(compute_add_sources_ac_cuda,
+              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f, // integer container holding the Mesh* address
+                                                 int* phase_is_innerf, // forwarded by value to the kernel
+                                                 int* NSOURCESf, // number of sources; sets the grid size and the copy length
+                                                 int* SIMULATION_TYPEf, // not used in this routine
+                                                 double* h_stf_pre_compute, // host array, NSOURCES doubles, copied to mp->d_stf_pre_compute
+                                                 int* myrankf) { // forwarded by value to the kernel
+
+TRACE("compute_add_sources_ac_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get mesh pointer out of Fortran integer container
+
+  // nothing to do when mp->nsources_local is zero
+  if( mp->nsources_local == 0 ) return;
+
+  int phase_is_inner = *phase_is_innerf;
+  int NSOURCES = *NSOURCESf;
+  int myrank = *myrankf;
+
+  int num_blocks_x = NSOURCES; // one block per source
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) { // fold the grid into 2D until the x extent is at most 65535
+    num_blocks_x = ceil(num_blocks_x/2.0); // x*y may exceed NSOURCES; the kernel bounds-checks isource
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  // copies pre-computed source time factors onto GPU (blocking host-to-device copy, error code 18 on failure)
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5); // 5x5x5 threads per block, matching the 5,5,5 extents used by the kernel's INDEX4/INDEX5 calls
+
+  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic, // target field of this variant
+                                                        mp->d_ibool,
+                                                        mp->d_ispec_is_inner,
+                                                        phase_is_inner,
+                                                        mp->d_sourcearrays,
+                                                        mp->d_stf_pre_compute,
+                                                        myrank,
+                                                        mp->d_islice_selected_source,
+                                                        mp->d_ispec_selected_source,
+                                                        mp->d_ispec_is_acoustic,
+                                                        mp->d_kappastore,
+                                                        NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_ac_cuda"); // only compiled in when ENABLE_VERY_SLOW_ERROR_CHECKING is defined
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C" // Fortran-callable wrapper: same as compute_add_sources_ac_cuda but targets mp->d_b_potential_dot_dot_acoustic
+void FC_FUNC_(compute_add_sources_ac_s3_cuda,
+              COMPUTE_ADD_SOURCES_AC_S3_CUDA)(long* Mesh_pointer_f, // integer container holding the Mesh* address
+                                                      int* phase_is_innerf, // forwarded by value to the kernel
+                                                      int* NSOURCESf, // number of sources; sets the grid size and the copy length
+                                                      int* SIMULATION_TYPEf, // not used in this routine
+                                                      double* h_stf_pre_compute, // host array, NSOURCES doubles, copied to mp->d_stf_pre_compute
+                                                      int* myrankf) { // forwarded by value to the kernel
+
+TRACE("compute_add_sources_ac_s3_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get mesh pointer out of Fortran integer container
+
+  // nothing to do when mp->nsources_local is zero
+  if( mp->nsources_local == 0 ) return;
+
+  int phase_is_inner = *phase_is_innerf;
+  int NSOURCES = *NSOURCESf;
+  int myrank = *myrankf;
+
+  int num_blocks_x = NSOURCES; // one block per source
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) { // fold the grid into 2D until the x extent is at most 65535
+    num_blocks_x = ceil(num_blocks_x/2.0); // x*y may exceed NSOURCES; the kernel bounds-checks isource
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  // copies source time factors onto GPU (blocking host-to-device copy, error code 18 on failure)
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5); // 5x5x5 threads per block, matching the 5,5,5 extents used by the kernel's INDEX4/INDEX5 calls
+
+  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic, // "b_" field is the only difference to the non-s3 wrapper
+                                                        mp->d_ibool,
+                                                        mp->d_ispec_is_inner,
+                                                        phase_is_inner,
+                                                        mp->d_sourcearrays,
+                                                        mp->d_stf_pre_compute,
+                                                        myrank,
+                                                        mp->d_islice_selected_source,
+                                                        mp->d_ispec_selected_source,
+                                                        mp->d_ispec_is_acoustic,
+                                                        mp->d_kappastore,
+                                                        NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_ac_s3_cuda"); // only compiled in when ENABLE_VERY_SLOW_ERROR_CHECKING is defined
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic adjoint sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_acoustic, // accumulated into via atomicAdd
+                                                      int nrec, // not used in the kernel body
+                                                      realw* adj_sourcearrays, // per-time-step slice, read via INDEX5(5,5,5,3, i,j,k,0,irec_local)
+                                                      int* ibool, // element-local (i,j,k,ispec) -> global point index; stored value is decremented by 1 before use
+                                                      int* ispec_is_inner, // per-element flag compared against phase_is_inner
+                                                      int* ispec_is_acoustic, // per-element flag; non-zero selects the element
+                                                      int* ispec_selected_rec, // per-receiver element index; decremented by 1 before use (presumably 1-based from Fortran)
+                                                      int phase_is_inner, // only elements whose flag equals this value are processed
+                                                      int* pre_computed_irec, // maps local adjoint-source index -> receiver index
+                                                      int nadj_rec_local, // number of local adjoint sources; bound checked against irec_local
+                                                      realw* kappastore) { // only referenced in commented-out code below
+
+  int irec_local = blockIdx.x + gridDim.x*blockIdx.y; // one block per local adjoint source, flattened over a 2D grid
+
+  // because of grid shape, irec_local can be too big
+  if(irec_local < nadj_rec_local) {
+
+    int irec = pre_computed_irec[irec_local];
+
+    int ispec = ispec_selected_rec[irec]-1;
+    if( ispec_is_acoustic[ispec] ){
+
+      // checks if element is in phase_is_inner run
+      if(ispec_is_inner[ispec] == phase_is_inner) {
+        int i = threadIdx.x; // thread coordinates select the (i,j,k) point inside the element
+        int j = threadIdx.y;
+        int k = threadIdx.z;
+        
+        int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+        //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+
+        //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
+        //                                            pre_computed_irec_local_index[irec],
+        //                                            pre_computed_index,
+        //                                            0,
+        //                                            i,j,k)]/kappal;
+
+        // beware: for an acoustic medium, a pressure source would need to be negated
+        // and divided by kappa of the fluid;
+        // this would have to be done when constructing the adjoint source, it is not done here.
+        //
+        // note: we take the first component of the adj_sourcearrays;
+        //          the idea is to have e.g. a pressure source, where all 3 components would be the same
+        realw stf = adj_sourcearrays[INDEX5(5,5,5,3,i,j,k,0,irec_local)]; // component 0 only; no division by kappal here (see note above)
+                                            
+        atomicAdd(&potential_dot_dot_acoustic[iglob],stf); // atomic update of the global point
+        
+                  // earlier INDEX6-based form of the added value (disabled):
+                  //+adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
+                  //                         pre_computed_irec_local_index[irec],pre_computed_index-1,
+                  //                         0,i,j,k)] // / kappal );
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C" // Fortran-callable wrapper: extracts one time step of the adjoint source arrays, uploads it and launches the adjoint-source kernel
+void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
+              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer, // integer container holding the Mesh* address
+                                               realw* h_adj_sourcearrays, // full host array, indexed with INDEX6(nadj_rec_local,NTSTEP,3,5,5, ...)
+                                               int* phase_is_inner, // selects which elements are processed (host filter and kernel)
+                                               int* h_ispec_is_inner, // host per-element flag compared against *phase_is_inner
+                                               int* h_ispec_is_acoustic, // host per-element flag; non-zero selects the element
+                                               int* h_ispec_selected_rec, // host per-receiver element index; decremented by 1 before use
+                                               int* myrank, // this process' rank; receivers on other slices are skipped
+                                               int* nrec, // total number of receivers looped over
+                                               int* time_index, // time step to extract; decremented by 1 before use
+                                               int* h_islice_selected_rec, // host per-receiver slice/rank
+                                               int* nadj_rec_local, // must equal mp->nadj_rec_local
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) { // time extent passed to INDEX6 for h_adj_sourcearrays
+
+TRACE("add_sources_ac_sim_2_or_3_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get mesh pointer out of Fortran integer container
+
+  // checks (host-side consistency check: reported via exit_on_error, like the irec_local check further below)
+  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
+
+  // make sure grid dimension is at most 65535 in x dimension
+  int num_blocks_x = mp->nadj_rec_local; // one block per local adjoint source
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0); // x*y may exceed nadj_rec_local; the kernel bounds-checks irec_local
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(5,5,5); // 5x5x5 threads per block, matching the 5,5,5 extents used in the INDEX macros
+
+  // build slice of adj_sourcearrays because full array is *very* large.
+  // note: this extracts array values for local adjoint sources at given time step "time_index"
+  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
+  int ispec,i,j,k;
+  int irec_local = 0; // counts receivers on this rank, whether or not they are copied below
+  for(int irec = 0; irec < *nrec; irec++) {
+    if(*myrank == h_islice_selected_rec[irec]) {
+      irec_local++;
+
+      // takes only acoustic sources
+      ispec = h_ispec_selected_rec[irec]-1;
+      if( h_ispec_is_acoustic[ispec] ){
+
+        if( h_ispec_is_inner[ispec] == *phase_is_inner) {
+          for(k=0;k<5;k++) {
+            for(j=0;j<5;j++) {
+              for(i=0;i<5;i++) {
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,irec_local-1)] // component 0
+                  = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
+                                            *NTSTEP_BETWEEN_READ_ADJSRC,
+                                            3,5,5,
+                                            irec_local-1,(*time_index)-1,
+                                            0,i,j,k)];
+
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,irec_local-1)] // component 1
+                  = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
+                                            *NTSTEP_BETWEEN_READ_ADJSRC,
+                                            3,5,5,
+                                            irec_local-1,(*time_index)-1,
+                                            1,i,j,k)];
+
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,irec_local-1)] // component 2
+                  = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
+                                            *NTSTEP_BETWEEN_READ_ADJSRC,
+                                            3,5,5,
+                                            irec_local-1,(*time_index)-1,
+                                            2,i,j,k)];
+              }
+            }
+          }
+        } // phase_is_inner
+      } // h_ispec_is_acoustic
+    }
+  }
+  // check all local sources were added
+  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
+
+  // copies extracted array values onto GPU (blocking host-to-device copy, error code 99099 on failure)
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+                              (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
+
+  // launches cuda kernel for acoustic adjoint sources
+  add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                          *nrec,
+                                                          mp->d_adj_sourcearrays,
+                                                          mp->d_ibool,
+                                                          mp->d_ispec_is_inner,
+                                                          mp->d_ispec_is_acoustic,
+                                                          mp->d_ispec_selected_rec,
+                                                          *phase_is_inner,
+                                                          mp->d_pre_computed_irec,
+                                                          mp->nadj_rec_local,
+                                                          mp->d_kappastore);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel"); // only compiled in when ENABLE_VERY_SLOW_ERROR_CHECKING is defined
+#endif
+}

Deleted: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -1,923 +0,0 @@
-/*
- !=====================================================================
- !
- !               S p e c f e m 3 D  V e r s i o n  2 . 0
- !               ---------------------------------------
- !
- !          Main authors: Dimitri Komatitsch and Jeroen Tromp
- !    Princeton University, USA and University of Pau / CNRS / INRIA
- ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
- !                            April 2011
- !
- ! This program is free software; you can redistribute it and/or modify
- ! it under the terms of the GNU General Public License as published by
- ! the Free Software Foundation; either version 2 of the License, or
- ! (at your option) any later version.
- !
- ! This program is distributed in the hope that it will be useful,
- ! but WITHOUT ANY WARRANTY; without even the implied warranty of
- ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- ! GNU General Public License for more details.
- !
- ! You should have received a copy of the GNU General Public License along
- ! with this program; if not, write to the Free Software Foundation, Inc.,
- ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- !
- !=====================================================================
- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <cublas.h>
-#include <mpi.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-
-#include "config.h"
-#include "mesh_constants_cuda.h"
-// #include "epik_user.h"
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// elastic domain sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-// crashes if the CMTSOLUTION does not match the mesh properly
-__global__ void compute_add_sources_kernel(float* accel,
-                                           int* ibool,
-                                           int* ispec_is_inner,
-                                           int phase_is_inner,
-                                           float* sourcearrays,
-                                           double* stf_pre_compute,
-                                           int myrank,
-                                           int* islice_selected_source,
-                                           int* ispec_selected_source,
-                                           int* ispec_is_elastic,
-                                           int NSOURCES //,float* d_debug
-                                           ) {
-  int i = threadIdx.x;
-  int j = threadIdx.y;
-  int k = threadIdx.z;
-
-  int isource  = blockIdx.x + gridDim.x*blockIdx.y; // bx
-  int ispec;
-  int iglob;
-  float stf;
-
-  if(isource < NSOURCES) { // when NSOURCES > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
-
-    if(myrank == islice_selected_source[isource]) {
-
-      ispec = ispec_selected_source[isource]-1;
-
-      if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) {
-
-        stf = (float) stf_pre_compute[isource];
-
-        //if(i==0 && j==0 && k==0) printf("add sources kernel: stf = %e\n",stf);
-
-        iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-
-        atomicAdd(&accel[iglob*3],
-                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf);
-        atomicAdd(&accel[iglob*3+1],
-                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 1, i,j,k)]*stf);
-
-  // if((iglob*3+2 == 304598)) {
-  //   atomicAdd(&d_debug[0],1.0f);
-  //   d_debug[1] = accel[iglob*3+2];
-  //   d_debug[2] = sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 2, i,j,k)];
-  //   d_debug[3] = stf;
-  // }
-  // d_debug[4] = 42.0f;
-
-        atomicAdd(&accel[iglob*3+2],
-                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 2, i,j,k)]*stf);
-      }
-    }
-  }
-
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(compute_add_sources_el_cuda,
-              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
-                                           //int* NSPEC_ABf, int* NGLOB_ABf,
-                                            int* phase_is_innerf,
-                                            int* NSOURCESf,
-                                            //int* itf, float* dtf, float* t0f,
-                                            //int* SIMULATION_TYPEf,int* NSTEPf,
-                                            //int* NOISE_TOMOGRAPHYf,
-                                            //int* USE_FORCE_POINT_SOURCEf,
-                                            double* h_stf_pre_compute,
-                                            int* myrankf) {
-
-TRACE("compute_add_sources_el_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  // check if anything to do
-  if( mp->nsources_local == 0 ) return;
-
-  //int NSPEC_AB = *NSPEC_ABf;
-  //int NGLOB_AB = *NGLOB_ABf;
-  int phase_is_inner = *phase_is_innerf;
-  //int it = *itf;
-  //float dt = *dtf;
-  //float t0 = *t0f;
-  //int SIMULATION_TYPE = *SIMULATION_TYPEf;
-  //int NSTEP = *NSTEPf;
-  //int NOISE_TOMOGRAPHY = *NOISE_TOMOGRAPHYf;
-  int NSOURCES = *NSOURCESf;
-  //int USE_FORCE_POINT_SOURCE = *USE_FORCE_POINT_SOURCEf;
-  int myrank = *myrankf;
-
-
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  //double* d_stf_pre_compute;
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
-                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,5);
-
-  //float* d_debug;
-  // (float* accel, int* ibool, int* ispec_is_inner, int phase_is_inner,
-  // float* sourcearrays, double* stf_pre_compute,int myrank,
-  // int* islice_selected_source, int* ispec_selected_source,
-  // int* ispec_is_elastic, int NSOURCES)
-  //printf("add sources : nsources_local = %d\n",mp->nsources_local);
-  //printf("add sources : stf = %e\n",h_stf_pre_compute[0]);
-
-  compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel,
-                                               mp->d_ibool,
-                                               mp->d_ispec_is_inner,
-                                               phase_is_inner,
-                                               mp->d_sourcearrays,
-                                               mp->d_stf_pre_compute,
-                                               myrank,
-                                               mp->d_islice_selected_source,
-                                               mp->d_ispec_selected_source,
-                                               mp->d_ispec_is_elastic,
-                                               NSOURCES //,d_debug
-                                               );
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_kernel");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(compute_add_sources_el_s3_cuda,
-              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
-                                              int* USE_FORCE_POINT_SOURCE,
-                                              double* h_stf_pre_compute,
-                                              int* NSOURCESf,
-                                              int* phase_is_inner,int* myrank) {
-  TRACE("compute_add_sources_el_s3_cuda");
-  // EPIK_TRACER("compute_add_sources_el_s3_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-
-  int NSOURCES = *NSOURCESf;
-
-  if(*USE_FORCE_POINT_SOURCE) {
-    printf("USE FORCE POINT SOURCE not implemented for GPU_MODE");
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
-                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
-#endif
-
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,5);
-
-  //float* d_debug;
-  // float* h_debug = (float*)calloc(128,sizeof(float));
-  // cudaMalloc((void**)&d_debug,128*sizeof(float));
-  // cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-
-  compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel,mp->d_ibool,
-                                               mp->d_ispec_is_inner, *phase_is_inner,
-                                               mp->d_sourcearrays,
-                                               mp->d_stf_pre_compute,
-                                               *myrank,
-                                               mp->d_islice_selected_source,mp->d_ispec_selected_source,
-                                               mp->d_ispec_is_elastic,
-                                               NSOURCES //,d_debug
-                                               );
-
-  // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
-  // for(int i=0;i<10;i++) {
-  //   printf("debug[%d] = %e \n",i,h_debug[i]);
-  // }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// NOISE sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool,
-                                                        int* ispec_selected_rec,
-                                                        int irec_master_noise,
-                                                        realw* accel,
-                                                        realw* noise_sourcearray,
-                                                        int it) {
-  int tx = threadIdx.x;
-  int iglob = ibool[tx + 125*(ispec_selected_rec[irec_master_noise-1]-1)]-1;
-
-  // not sure if we need atomic operations but just in case...
-  // accel[3*iglob] += noise_sourcearray[3*tx + 3*125*it];
-  // accel[1+3*iglob] += noise_sourcearray[1+3*tx + 3*125*it];
-  // accel[2+3*iglob] += noise_sourcearray[2+3*tx + 3*125*it];
-
-  atomicAdd(&accel[iglob*3],noise_sourcearray[3*tx + 3*125*it]);
-  atomicAdd(&accel[iglob*3+1],noise_sourcearray[1+3*tx + 3*125*it]);
-  atomicAdd(&accel[iglob*3+2],noise_sourcearray[2+3*tx + 3*125*it]);
-
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(add_source_master_rec_noise_cu,
-              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
-                                                int* myrank_f,
-                                                int* it_f,
-                                                int* irec_master_noise_f,
-                                                int* islice_selected_rec) {
-
-TRACE("add_source_master_rec_noise_cu");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  int it = *it_f-1; // -1 for Fortran -> C indexing differences
-  int irec_master_noise = *irec_master_noise_f;
-  int myrank = *myrank_f;
-
-  dim3 grid(1,1,1);
-  dim3 threads(125,1,1);
-
-  if(myrank == islice_selected_rec[irec_master_noise-1]) {
-    add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool,
-                                                              mp->d_ispec_selected_rec,
-                                                              irec_master_noise,
-                                                              mp->d_accel,
-                                                              mp->d_noise_sourcearray,
-                                                              it);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("add_source_master_rec_noise_cuda_kernel");
-#endif
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// ADJOINT sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(float* accel,
-                                                     int nrec,
-                                                     float* adj_sourcearrays,
-                                                     int* ibool,
-                                                     int* ispec_is_inner,
-                                                     int* ispec_is_elastic,
-                                                     int* ispec_selected_rec,
-                                                     int phase_is_inner,
-                                                     int* islice_selected_rec,
-                                                     int* pre_computed_irec,
-                                                     int nadj_rec_local //,int myrank //,int* debugi,float* debugf
-                                                     ) {
-
-  int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
-
-  if(irec_local < nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
-
-    int irec = pre_computed_irec[irec_local];
-
-    int ispec = ispec_selected_rec[irec]-1;
-    if( ispec_is_elastic[ispec] ){
-
-      if(ispec_is_inner[ispec] == phase_is_inner) {
-        int i = threadIdx.x;
-        int j = threadIdx.y;
-        int k = threadIdx.z;
-        //int iglob = ibool[i+5*(j+5*(k+5*ispec))]-1;
-        int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-
-        // atomic operations are absolutely necessary for correctness!
-        atomicAdd(&(accel[0+3*iglob]),adj_sourcearrays[INDEX5(5,5,5,3,
-                                                              i,j,k,
-                                                              0,
-                                                              irec_local)]);
-
-        atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(5,5,5,3,
-                                                             i,j,k,
-                                                             1,
-                                                             irec_local)]);
-
-        atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
-                                                            i,j,k,
-                                                            2,
-                                                            irec_local)]);
-      }
-    } // ispec_is_elastic
-  }
-
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(add_sources_el_sim_type_2_or_3,
-              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
-                                               float* h_adj_sourcearrays,
-                                               int* phase_is_inner,
-                                               int* h_ispec_is_inner,
-                                               int* h_ispec_is_elastic,
-                                               int* h_ispec_selected_rec,
-                                               int* myrank,
-                                               int* nrec,
-                                               int* time_index,
-                                               int* h_islice_selected_rec,
-                                               int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
-
-TRACE("add_sources_el_sim_type_2_or_3");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-
-  // checks
-  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n");
-
-  //int rank;
-  //MPI_Comm_rank(MPI_COMM_WORLD,&rank);
-
-  // make sure grid dimension is less than 65535 in x dimension
-  int num_blocks_x = mp->nadj_rec_local;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y,1);
-  dim3 threads(5,5,5);
-
-  //float* d_adj_sourcearrays;
-  //print_CUDA_error_if_any(cudaMalloc((void**)&d_adj_sourcearrays,
-  //                                   (*nadj_rec_local)*3*125*sizeof(float)),1);
-
-  //float* h_adj_sourcearrays_slice = (float*)malloc((*nadj_rec_local)*3*125*sizeof(float));
-
-  //int* h_pre_computed_irec = new int[*nadj_rec_local];
-
-  //int* d_pre_computed_irec;
-  //cudaMalloc((void**)&d_pre_computed_irec,(*nadj_rec_local)*sizeof(int));
-
-  // build slice of adj_sourcearrays because full array is *very* large.
-  // note: this extracts array values for local adjoint sources at given time step "time_index"
-  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
-  int ispec,i,j,k;
-  int irec_local = 0;
-  for(int irec = 0; irec < *nrec; irec++) {
-    if(*myrank == h_islice_selected_rec[irec]) {
-      irec_local++;
-      //h_pre_computed_irec[irec_local-1] = irec;
-
-      // takes only elastic sources
-      ispec = h_ispec_selected_rec[irec]-1;
-      if( h_ispec_is_elastic[ispec] ){
-
-        if( h_ispec_is_inner[ispec] == *phase_is_inner) {
-          for(k=0;k<5;k++) {
-            for(j=0;j<5;j++) {
-              for(i=0;i<5;i++) {
-
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                i,j,k,0,
-                                                irec_local-1)]
-                        = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                                    *NTSTEP_BETWEEN_READ_ADJSRC,
-                                                    3,5,5,
-                                                    irec_local-1,
-                                                    *time_index-1,
-                                                    0,i,j,k)];
-
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                i,j,k,1,
-                                                irec_local-1)]
-                        = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                                    *NTSTEP_BETWEEN_READ_ADJSRC,
-                                                    3,5,5,
-                                                    irec_local-1,
-                                                    *time_index-1,
-                                                    1,i,j,k)];
-
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                i,j,k,2,
-                                                irec_local-1)]
-                        = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                                    *NTSTEP_BETWEEN_READ_ADJSRC,
-                                                    3,5,5,
-                                                    irec_local-1,
-                                                    *time_index-1,
-                                                    2,i,j,k)];
-              }
-            }
-          }
-        } // phase_is_inner
-      } // h_ispec_is_elastic
-    }
-  }
-  // check all local sources were added
-  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
-
-  // printf("irec_local vs. *nadj_rec_local -> %d vs. %d\n",irec_local,*nadj_rec_local);
-  // for(int ispec=0;ispec<(*nadj_rec_local);ispec++) {
-  //   for(int i=0;i<5;i++)
-  //     for(int j=0;j<5;j++)
-  //  for(int k=0;k<5;k++) {
-  //    h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,ispec)] =
-  //      h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
-  //              ispec,
-  //              *time_index-1,
-  //              0,
-  //              i,j,k)];
-  //    h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,ispec)] =
-  //      h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
-  //              ispec,
-  //              *time_index-1,
-  //              1,
-  //              i,j,k)];
-  //    h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,ispec)] =
-  //      h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_ADJSRC,3,5,5,
-  //              ispec,
-  //              *time_index-1,
-  //              2,
-  //              i,j,k)];
-  //  }
-
-  // }
-
-  // copies extracted array values onto GPU
-  cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
-             (mp->nadj_rec_local)*3*125*sizeof(float),cudaMemcpyHostToDevice);
-
-
-  // the irec_local variable needs to be precomputed (as
-  // h_pre_comp..), because normally it is in the loop updating accel,
-  // and due to how it's incremented, it cannot be parallelized
-
-  // int irec_local=0;
-  // for(int irec=0;irec<*nrec;irec++) {
-  //   if(*myrank == h_islice_selected_rec[irec]) {
-  //     h_pre_computed_irec_local_index[irec] = irec_local;
-  //     irec_local++;
-  //     if(irec_local==1) {
-  //  // printf("%d:first useful irec==%d\n",rank,irec);
-  //     }
-  //   }
-  //   else h_pre_computed_irec_local_index[irec] = 0;
-  // }
-  //cudaMemcpy(mp->d_pre_computed_irec,mp->h_pre_computed_irec,
-  //           (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice);
-
-  // pause_for_debugger(1);
-  //int* d_debugi, *h_debugi;
-  //float* d_debugf, *h_debugf;
-  //h_debugi = (int*)calloc(num_blocks_x,sizeof(int));
-  //cudaMalloc((void**)&d_debugi,num_blocks_x*sizeof(int));
-  //cudaMemcpy(d_debugi,h_debugi,num_blocks_x*sizeof(int),cudaMemcpyHostToDevice);
-  //h_debugf = (float*)calloc(num_blocks_x,sizeof(float));
-  //cudaMalloc((void**)&d_debugf,num_blocks_x*sizeof(float));
-  //cudaMemcpy(d_debugf,h_debugf,num_blocks_x*sizeof(float),cudaMemcpyHostToDevice);
-
-  add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel,
-                                                         *nrec,
-                                                         mp->d_adj_sourcearrays,
-                                                         mp->d_ibool,
-                                                         mp->d_ispec_is_inner,
-                                                         mp->d_ispec_is_elastic,
-                                                         mp->d_ispec_selected_rec,
-                                                         *phase_is_inner,
-                                                         mp->d_islice_selected_rec,
-                                                         mp->d_pre_computed_irec,
-                                                         mp->nadj_rec_local //,*myrank //,d_debugi,d_debugf
-                                                         );
-
-  //cudaMemcpy(h_debugi,d_debugi,num_blocks_x*sizeof(int),cudaMemcpyDeviceToHost);
-  //cudaMemcpy(h_debugf,d_debugf,num_blocks_x*sizeof(float),cudaMemcpyDeviceToHost);
-
-  // printf("%d: pre_com0:%d\n",rank,h_pre_computed_irec_local_index[0]);
-  // printf("%d: pre_com1:%d\n",rank,h_pre_computed_irec_local_index[1]);
-  // printf("%d: pre_com2:%d\n",rank,h_pre_computed_irec_local_index[2]);
-  // for(int i=156;i<(156+30);i++) {
-  //   if(rank==0) printf("%d:debug[%d] = i/f = %d / %e\n",rank,i,h_debugi[i],h_debugf[i]);
-  // }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  // MPI_Barrier(MPI_COMM_WORLD);
-  exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
-
-  // printf("Proc %d exiting with successful kernel\n",rank);
-  // exit(1);
-#endif
-  //cudaFree(d_adj_sourcearrays);
-  //cudaFree(d_pre_computed_irec);
-  //free(h_adj_sourcearrays_slice);
-  //delete h_pre_computed_irec;
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// acoustic sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void compute_add_sources_acoustic_kernel(float* potential_dot_dot_acoustic,
-                                                    int* ibool,
-                                                    int* ispec_is_inner,
-                                                    int phase_is_inner,
-                                                    float* sourcearrays,
-                                                    double* stf_pre_compute,
-                                                    int myrank,
-                                                    int* islice_selected_source,
-                                                    int* ispec_selected_source,
-                                                    int* ispec_is_acoustic,
-                                                    float* kappastore,
-                                                    int NSOURCES) {
-  int i = threadIdx.x;
-  int j = threadIdx.y;
-  int k = threadIdx.z;
-
-  int isource  = blockIdx.x + gridDim.x*blockIdx.y; // bx
-
-  int ispec;
-  int iglob;
-  float stf;
-  float kappal;
-
-  if( isource < NSOURCES ){
-
-    //if(myrank == 0 && i== 0 && j == 0 && k == 0) printf("source isource = %i \n",isource);
-
-    if(myrank == islice_selected_source[isource]) {
-
-      ispec = ispec_selected_source[isource]-1;
-
-      if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) {
-
-        stf = (float) stf_pre_compute[isource];
-        iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-        kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
-
-        //printf("source ispec = %i %i %e %e \n",ispec,iglob,stf,kappal);
-        //printf("source arr = %e %i %i %i %i %i\n", -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal,i,j,k,iglob,ispec);
-
-        atomicAdd(&potential_dot_dot_acoustic[iglob],
-                  -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal);
-
-        //      potential_dot_dot_acoustic[iglob] +=
-        //                -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal;
-
-        //printf("potential = %e %i %i %i %i %i\n", potential_dot_dot_acoustic[iglob],i,j,k,iglob,ispec);
-
-
-      }
-    }
-  }
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(compute_add_sources_ac_cuda,
-              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
-                                                 int* phase_is_innerf,
-                                                 int* NSOURCESf,
-                                                 int* SIMULATION_TYPEf,
-                                                 int* USE_FORCE_POINT_SOURCEf,
-                                                 double* h_stf_pre_compute,
-                                                 int* myrankf) {
-
-TRACE("compute_add_sources_ac_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  // check if anything to do
-  if( mp->nsources_local == 0 ) return;
-
-  int phase_is_inner = *phase_is_innerf;
-  //int SIMULATION_TYPE = *SIMULATION_TYPEf;
-  int NSOURCES = *NSOURCESf;
-  //int USE_FORCE_POINT_SOURCE = *USE_FORCE_POINT_SOURCEf;
-  int myrank = *myrankf;
-
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  // copies pre-computed source time factors onto GPU
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
-                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,5);
-
-  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-                                                        mp->d_ibool,
-                                                        mp->d_ispec_is_inner,
-                                                        phase_is_inner,
-                                                        mp->d_sourcearrays,
-                                                        mp->d_stf_pre_compute,
-                                                        myrank,
-                                                        mp->d_islice_selected_source,
-                                                        mp->d_ispec_selected_source,
-                                                        mp->d_ispec_is_acoustic,
-                                                        mp->d_kappastore,
-                                                        NSOURCES);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_ac_cuda");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(compute_add_sources_ac_s3_cuda,
-              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
-                                                      int* phase_is_innerf,
-                                                      int* NSOURCESf,
-                                                      int* SIMULATION_TYPEf,
-                                                      int* USE_FORCE_POINT_SOURCEf,
-                                                      double* h_stf_pre_compute,
-                                                      int* myrankf) {
-
-TRACE("compute_add_sources_ac_s3_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  // check if anything to do
-  if( mp->nsources_local == 0 ) return;
-
-  int phase_is_inner = *phase_is_innerf;
-  //int SIMULATION_TYPE = *SIMULATION_TYPEf;
-  int NSOURCES = *NSOURCESf;
-  //int USE_FORCE_POINT_SOURCE = *USE_FORCE_POINT_SOURCEf;
-  int myrank = *myrankf;
-
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  // copies source time factors onto GPU
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
-                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,5);
-
-  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
-                                                        mp->d_ibool,
-                                                        mp->d_ispec_is_inner,
-                                                        phase_is_inner,
-                                                        mp->d_sourcearrays,
-                                                        mp->d_stf_pre_compute,
-                                                        myrank,
-                                                        mp->d_islice_selected_source,
-                                                        mp->d_ispec_selected_source,
-                                                        mp->d_ispec_is_acoustic,
-                                                        mp->d_kappastore,
-                                                        NSOURCES);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_ac_s3_cuda");
-#endif
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// acoustic adjoint sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(float* potential_dot_dot_acoustic,
-                                                      int nrec,
-                                                      float* adj_sourcearrays,
-                                                      int* ibool,
-                                                      int* ispec_is_inner,
-                                                      int* ispec_is_acoustic,
-                                                      int* ispec_selected_rec,
-                                                      int phase_is_inner,
-                                                      int* islice_selected_rec,
-                                                      int* pre_computed_irec,
-                                                      int nadj_rec_local,
-                                                      float* kappastore) {
-
-  int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
-
-  // because of grid shape, irec_local can be too big
-  if(irec_local < nadj_rec_local) {
-
-    int irec = pre_computed_irec[irec_local];
-
-    int ispec = ispec_selected_rec[irec]-1;
-    if( ispec_is_acoustic[ispec] ){
-
-      // checks if element is in phase_is_inner run
-      if(ispec_is_inner[ispec] == phase_is_inner) {
-        int i = threadIdx.x;
-        int j = threadIdx.y;
-        int k = threadIdx.z;
-        int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-
-        //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
-
-        //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
-        //                                            pre_computed_irec_local_index[irec],
-        //                                            pre_computed_index,
-        //                                            0,
-        //                                            i,j,k)]/kappal;
-
-        // beware, for acoustic medium, a pressure source would be taking the negative
-        // and divide by Kappa of the fluid;
-        // this would have to be done when constructing the adjoint source.
-        //
-        // note: we take the first component of the adj_sourcearrays
-        //          the idea is to have e.g. a pressure source, where all 3 components would be the same
-
-        atomicAdd(&potential_dot_dot_acoustic[iglob],adj_sourcearrays[INDEX5(5,5,5,3,
-                                                                             i,j,k,
-                                                                             0,
-                                                                             irec_local)] // / kappal
-                                                                             );
-
-                  //+adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
-                  //                         pre_computed_irec_local_index[irec],pre_computed_index-1,
-                  //                         0,i,j,k)] // / kappal
-                  //                         );
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-extern "C"
-void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
-              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
-                                               float* h_adj_sourcearrays,
-                                               int* phase_is_inner,
-                                               int* h_ispec_is_inner,
-                                               int* h_ispec_is_acoustic,
-                                               int* h_ispec_selected_rec,
-                                               int* myrank,
-                                               int* nrec,
-                                               int* time_index,
-                                               int* h_islice_selected_rec,
-                                               int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
-
-TRACE("add_sources_ac_sim_2_or_3_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-
-  // checks
-  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
-
-  // make sure grid dimension is less than 65535 in x dimension
-  int num_blocks_x = mp->nadj_rec_local;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y,1);
-  dim3 threads(5,5,5);
-
-  // build slice of adj_sourcearrays because full array is *very* large.
-  // note: this extracts array values for local adjoint sources at given time step "time_index"
-  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
-  int ispec,i,j,k;
-  int irec_local = 0;
-  for(int irec = 0; irec < *nrec; irec++) {
-    if(*myrank == h_islice_selected_rec[irec]) {
-      irec_local++;
-
-      // takes only acoustic sources
-      ispec = h_ispec_selected_rec[irec]-1;
-      if( h_ispec_is_acoustic[ispec] ){
-
-        if( h_ispec_is_inner[ispec] == *phase_is_inner) {
-          for(k=0;k<5;k++) {
-            for(j=0;j<5;j++) {
-              for(i=0;i<5;i++) {
-
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                    i,j,k,0,
-                                                    irec_local-1)]
-                = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                            *NTSTEP_BETWEEN_READ_ADJSRC,
-                                            3,5,5,
-                                            irec_local-1,
-                                            *time_index-1,
-                                            0,i,j,k)];
-
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                    i,j,k,1,
-                                                    irec_local-1)]
-                = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                            *NTSTEP_BETWEEN_READ_ADJSRC,
-                                            3,5,5,
-                                            irec_local-1,
-                                            *time_index-1,
-                                            1,i,j,k)];
-
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                    i,j,k,2,
-                                                    irec_local-1)]
-                = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                            *NTSTEP_BETWEEN_READ_ADJSRC,
-                                            3,5,5,
-                                            irec_local-1,
-                                            *time_index-1,
-                                            2,i,j,k)];
-              }
-            }
-          }
-        } // phase_is_inner
-      } // h_ispec_is_acoustic
-    }
-  }
-  // check all local sources were added
-  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
-
-  // copies extracted array values onto GPU
-  cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
-             (mp->nadj_rec_local)*3*125*sizeof(float),cudaMemcpyHostToDevice);
-
-  // launches cuda kernel for acoustic adjoint sources
-  add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-                                                          *nrec,
-                                                          mp->d_adj_sourcearrays,
-                                                          mp->d_ibool,
-                                                          mp->d_ispec_is_inner,
-                                                          mp->d_ispec_is_acoustic,
-                                                          mp->d_ispec_selected_rec,
-                                                          *phase_is_inner,
-                                                          mp->d_islice_selected_rec,
-                                                          mp->d_pre_computed_irec,
-                                                          mp->nadj_rec_local,
-                                                          mp->d_kappastore);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");
-#endif
-}

Copied: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu (from rev 19151, seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu)
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -0,0 +1,422 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+// #include "epik_user.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// elastic domain sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+ * Adds the contribution of each source to the acceleration field.
+ *
+ * Launch layout (see the host wrappers in this file): one block per source, the
+ * source index being the flattened 2D block index, and a 5x5x5 thread block whose
+ * threadIdx.{x,y,z} select the point (i,j,k) inside the source element.
+ *
+ * A source contributes only if it lies in this process' slice (islice_selected_source
+ * equals myrank), its element matches the requested phase_is_inner flag and the
+ * element is flagged in ispec_is_elastic. For such a source the three components of
+ * sourcearrays at (isource,:,i,j,k), scaled by stf_pre_compute[isource], are
+ * atomically added to accel[3*iglob .. 3*iglob+2].
+ */
+__global__ void compute_add_sources_kernel(realw* accel,
+                                           int* ibool,
+                                           int* ispec_is_inner,
+                                           int phase_is_inner,
+                                           realw* sourcearrays,
+                                           double* stf_pre_compute,
+                                           int myrank,
+                                           int* islice_selected_source,
+                                           int* ispec_selected_source,
+                                           int* ispec_is_elastic,
+                                           int NSOURCES) {
+
+  // flattened block index selects the source
+  const int isource = blockIdx.x + gridDim.x*blockIdx.y;
+
+  // the folded 2D grid can contain more blocks than there are sources
+  if(isource >= NSOURCES) return;
+
+  // skips sources which are located in another slice
+  if(islice_selected_source[isource] != myrank) return;
+
+  // element holding the source (the stored index is shifted by -1 for C indexing)
+  const int ispec = ispec_selected_source[isource]-1;
+
+  // element must belong to the current inner/outer phase and be elastic
+  if(ispec_is_inner[ispec] != phase_is_inner) return;
+  if(!ispec_is_elastic[ispec]) return;
+
+  // point inside the element handled by this thread
+  const int i = threadIdx.x;
+  const int j = threadIdx.y;
+  const int k = threadIdx.z;
+
+  // source time function value, converted from double to realw
+  const realw stf = (realw) stf_pre_compute[isource];
+
+  // global point index (shifted by -1 for C indexing)
+  const int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+  // atomic updates: threads of different blocks may target the same global point
+  for(int icomp = 0; icomp < 3; icomp++) {
+    atomicAdd(&accel[iglob*3+icomp],
+              sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, icomp, i,j,k)]*stf);
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+ * Fortran-callable wrapper which adds the elastic source contributions to mp->d_accel.
+ *
+ * Mesh_pointer_f    : Fortran integer container holding the Mesh pointer
+ * phase_is_innerf   : inner/outer phase flag forwarded to the kernel
+ * NSOURCESf         : number of sources; also the number of doubles copied from h_stf_pre_compute
+ * h_stf_pre_compute : host array with one source time function value per source
+ * myrankf           : rank of this process, compared on the device against the source slices
+ *
+ * Returns immediately when mp->nsources_local is zero.
+ */
+extern "C"
+void FC_FUNC_(compute_add_sources_el_cuda,
+              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
+                                            int* phase_is_innerf,
+                                            int* NSOURCESf,
+                                            double* h_stf_pre_compute,
+                                            int* myrankf) {
+
+TRACE("compute_add_sources_el_cuda");
+
+  // recovers the mesh structure from the fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // nothing to do without local sources
+  if( mp->nsources_local == 0 ) return;
+
+  const int NSOURCES = *NSOURCESf;
+
+  // uploads the source time function values for this time step
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  // one block per source; halves x and doubles y until x fits below the 65535 limit
+  int nblocks_x = NSOURCES;
+  int nblocks_y = 1;
+  while(nblocks_x > 65535) {
+    nblocks_x = ceil(nblocks_x/2.0);
+    nblocks_y *= 2;
+  }
+
+  dim3 grid(nblocks_x,nblocks_y);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel,
+                                               mp->d_ibool,
+                                               mp->d_ispec_is_inner,
+                                               *phase_is_innerf,
+                                               mp->d_sourcearrays,
+                                               mp->d_stf_pre_compute,
+                                               *myrankf,
+                                               mp->d_islice_selected_source,
+                                               mp->d_ispec_selected_source,
+                                               mp->d_ispec_is_elastic,
+                                               NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_kernel");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+ * Fortran-callable wrapper which adds the elastic source contributions to mp->d_b_accel
+ * (NOTE(review): presumably the backward/reconstructed field of SIMULATION_TYPE 3 - confirm).
+ *
+ * Mesh_pointer      : Fortran integer container holding the Mesh pointer
+ * h_stf_pre_compute : host array with one source time function value per source
+ * NSOURCESf         : number of sources; also the number of doubles copied from h_stf_pre_compute
+ * phase_is_inner    : inner/outer phase flag forwarded to the kernel
+ * myrank            : rank of this process, compared on the device against the source slices
+ *
+ * Returns immediately when mp->nsources_local is zero, like compute_add_sources_el_cuda.
+ */
+extern "C"
+void FC_FUNC_(compute_add_sources_el_s3_cuda,
+              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
+                                              double* h_stf_pre_compute,
+                                              int* NSOURCESf,
+                                              int* phase_is_inner,
+                                              int* myrank) {
+  TRACE("compute_add_sources_el_s3_cuda");
+  // EPIK_TRACER("compute_add_sources_el_s3_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // check if anything to do:
+  // without local sources the kernel would not modify the field (it only acts on sources
+  // whose slice equals myrank), so the host-to-device copy and the launch are skipped
+  // NOTE(review): relies on nsources_local counting the sources in this slice - confirm
+  if( mp->nsources_local == 0 ) return;
+
+  int NSOURCES = *NSOURCESf;
+
+  // copies source time factors onto GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+#endif
+
+  // one block per source; halves x and doubles y until x fits below the 65535 limit
+  int num_blocks_x = NSOURCES;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel,mp->d_ibool,
+                                               mp->d_ispec_is_inner, *phase_is_inner,
+                                               mp->d_sourcearrays,
+                                               mp->d_stf_pre_compute,
+                                               *myrank,
+                                               mp->d_islice_selected_source,mp->d_ispec_selected_source,
+                                               mp->d_ispec_is_elastic,
+                                               NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// NOISE sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+ * Adds the noise source array of the master receiver at time index "it" to accel.
+ *
+ * Launch layout (see add_source_master_rec_noise_cu): a single block of NGLL3 threads,
+ * threadIdx.x being the flattened point index inside the master receiver's element.
+ * There is no bounds check: the block must not hold more than NGLL3 threads.
+ *
+ * irec_master_noise is used with a -1 shift, as is the element index read from
+ * ispec_selected_rec and the global index read from ibool.
+ */
+__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool,
+                                                        int* ispec_selected_rec,
+                                                        int irec_master_noise,
+                                                        realw* accel,
+                                                        realw* noise_sourcearray,
+                                                        int it) {
+  // point of the element handled by this thread
+  const int tx = threadIdx.x;
+
+  // element of the master receiver and global index of this point
+  const int ispec = ispec_selected_rec[irec_master_noise-1]-1;
+  const int iglob = ibool[tx + NGLL3*ispec]-1;
+
+  // start of this point's 3 components within time slice "it" of the noise array
+  const int offset = 3*tx + 3*NGLL3*it;
+
+  // accumulates the three components atomically
+  for(int icomp = 0; icomp < 3; icomp++) {
+    atomicAdd(&accel[iglob*3+icomp],noise_sourcearray[icomp + offset]);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+ * Fortran-callable wrapper which adds the master receiver's noise source to mp->d_accel.
+ *
+ * Mesh_pointer_f      : Fortran integer container holding the Mesh pointer
+ * myrank_f            : rank of this process
+ * it_f                : time step, shifted by -1 before being passed to the kernel
+ * irec_master_noise_f : index of the master receiver (used with a -1 shift)
+ * islice_selected_rec : host array giving the slice of each receiver
+ *
+ * The kernel is launched only when the master receiver's slice equals *myrank_f.
+ */
+extern "C"
+void FC_FUNC_(add_source_master_rec_noise_cu,
+              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
+                                                int* myrank_f,
+                                                int* it_f,
+                                                int* irec_master_noise_f,
+                                                int* islice_selected_rec) {
+
+TRACE("add_source_master_rec_noise_cu");
+
+  // recovers the mesh structure from the fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  const int irec_master_noise = *irec_master_noise_f;
+
+  // only the process holding the master receiver adds the source
+  if(*myrank_f != islice_selected_rec[irec_master_noise-1]) return;
+
+  // -1 for Fortran -> C indexing differences
+  const int it = *it_f-1;
+
+  // a single block with one thread per point of the element
+  dim3 grid(1,1,1);
+  dim3 threads(NGLL3,1,1);
+
+  add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool,
+                                                            mp->d_ispec_selected_rec,
+                                                            irec_master_noise,
+                                                            mp->d_accel,
+                                                            mp->d_noise_sourcearray,
+                                                            it);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_source_master_rec_noise_cuda_kernel");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ADJOINT sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+ * Adds the adjoint source slice of each local adjoint receiver to accel.
+ *
+ * Launch layout (see add_sources_el_sim_type_2_or_3): one block per local adjoint
+ * receiver, the receiver index being the flattened 2D block index, and a 5x5x5 thread
+ * block whose threadIdx.{x,y,z} select the point (i,j,k) inside the receiver element.
+ *
+ * pre_computed_irec maps the local receiver index to the index used for
+ * ispec_selected_rec. A receiver contributes only if its element is flagged in
+ * ispec_is_elastic and matches phase_is_inner. adj_sourcearrays is addressed as
+ * (i,j,k,component,irec_local). The nrec argument is not used by the kernel.
+ */
+__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
+                                                     int nrec,
+                                                     realw* adj_sourcearrays,
+                                                     int* ibool,
+                                                     int* ispec_is_inner,
+                                                     int* ispec_is_elastic,
+                                                     int* ispec_selected_rec,
+                                                     int phase_is_inner,
+                                                     int* pre_computed_irec,
+                                                     int nadj_rec_local) {
+
+  // flattened block index selects the local adjoint receiver
+  const int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
+
+  // the folded 2D grid can contain more blocks than there are local receivers
+  if(irec_local >= nadj_rec_local) return;
+
+  // receiver index and its element (shifted by -1 for C indexing)
+  const int irec = pre_computed_irec[irec_local];
+  const int ispec = ispec_selected_rec[irec]-1;
+
+  // element must be elastic and belong to the current inner/outer phase
+  if(!ispec_is_elastic[ispec]) return;
+  if(ispec_is_inner[ispec] != phase_is_inner) return;
+
+  // point inside the element handled by this thread
+  const int i = threadIdx.x;
+  const int j = threadIdx.y;
+  const int k = threadIdx.z;
+  const int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+  // atomic operations are absolutely necessary for correctness!
+  for(int icomp = 0; icomp < 3; icomp++) {
+    atomicAdd(&accel[icomp+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
+                                                            i,j,k,
+                                                            icomp,
+                                                            irec_local)]);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+ * Fortran-callable wrapper which adds the elastic adjoint sources of the current time
+ * step to mp->d_accel.
+ *
+ * Mesh_pointer               : Fortran integer container holding the Mesh pointer
+ * h_adj_sourcearrays         : host adjoint source array, read through
+ *                              INDEX6(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,...)
+ * phase_is_inner             : inner/outer phase flag
+ * h_ispec_is_inner           : host per-element inner flag
+ * h_ispec_is_elastic         : host per-element elastic flag
+ * h_ispec_selected_rec       : host element index of each receiver (used with a -1 shift)
+ * myrank                     : rank of this process
+ * nrec                       : number of receivers looped over
+ * time_index                 : time index inside h_adj_sourcearrays (used with a -1 shift)
+ * h_islice_selected_rec      : host slice of each receiver
+ * nadj_rec_local             : number of local adjoint receivers; must equal mp->nadj_rec_local
+ * NTSTEP_BETWEEN_READ_ADJSRC : second dimension of h_adj_sourcearrays
+ *
+ * Steps: extract the values of the requested time index into mp->h_adj_sourcearrays_slice,
+ * copy that slice to mp->d_adj_sourcearrays and launch add_sources_el_SIM_TYPE_2_OR_3_kernel.
+ * Calls exit_on_error if the receiver counts are inconsistent.
+ */
+extern "C"
+void FC_FUNC_(add_sources_el_sim_type_2_or_3,
+              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
+                                               realw* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_elastic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
+
+TRACE("add_sources_el_sim_type_2_or_3");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // checks
+  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n");
+
+  // build slice of adj_sourcearrays because full array is *very* large.
+  // note: this extracts array values for local adjoint sources at given time step "time_index"
+  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
+  int irec_local = 0;
+  for(int irec = 0; irec < *nrec; irec++) {
+
+    // only receivers located in this slice are counted
+    if(*myrank != h_islice_selected_rec[irec]) continue;
+    irec_local++;
+
+    // takes only elastic sources of the current inner/outer phase;
+    // the kernel applies the same test, so slice entries left untouched here are not read
+    const int ispec = h_ispec_selected_rec[irec]-1;
+    if( !h_ispec_is_elastic[ispec] ) continue;
+    if( h_ispec_is_inner[ispec] != *phase_is_inner) continue;
+
+    for(int k=0;k<5;k++) {
+      for(int j=0;j<5;j++) {
+        for(int i=0;i<5;i++) {
+          // copies the 3 components of point (i,j,k)
+          for(int icomp=0;icomp<3;icomp++) {
+            mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+                                                i,j,k,icomp,
+                                                irec_local-1)]
+                    = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+                                                *NTSTEP_BETWEEN_READ_ADJSRC,
+                                                3,5,5,
+                                                irec_local-1,
+                                                *time_index-1,
+                                                icomp,i,j,k)];
+          }
+        }
+      }
+    }
+  }
+  // check all local sources were added
+  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
+
+  // nothing to copy or launch without local adjoint receivers;
+  // a grid with zero blocks would be an invalid launch configuration
+  if( mp->nadj_rec_local == 0 ) return;
+
+  // copies extracted array values onto GPU
+  // (return code is checked like the other host-to-device copies in this file;
+  //  the second argument is the numeric tag handed to print_CUDA_error_if_any)
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+                                     (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),98001);
+
+  // make sure grid dimension is less than 65535 in x dimension
+  int num_blocks_x = mp->nadj_rec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(5,5,5);
+
+  // the irec_local variable needs to be precomputed (as
+  // h_pre_comp..), because normally it is in the loop updating accel,
+  // and due to how it's incremented, it cannot be parallelized
+
+  add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel,
+                                                         *nrec,
+                                                         mp->d_adj_sourcearrays,
+                                                         mp->d_ibool,
+                                                         mp->d_ispec_is_inner,
+                                                         mp->d_ispec_is_elastic,
+                                                         mp->d_ispec_selected_rec,
+                                                         *phase_is_inner,
+                                                         mp->d_pre_computed_irec,
+                                                         mp->nadj_rec_local);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
+#endif
+}
+

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
-#include <mpi.h>
 
 #include <sys/time.h>
 #include <sys/resource.h>
@@ -44,13 +43,13 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void compute_coupling_acoustic_el_kernel(float* displ,
-                                                    float* potential_dot_dot_acoustic,
+__global__ void compute_coupling_acoustic_el_kernel(realw* displ,
+                                                    realw* potential_dot_dot_acoustic,
                                                     int num_coupling_ac_el_faces,
                                                     int* coupling_ac_el_ispec,
                                                     int* coupling_ac_el_ijk,
-                                                    float* coupling_ac_el_normal,
-                                                    float* coupling_ac_el_jacobian2Dw,
+                                                    realw* coupling_ac_el_normal,
+                                                    realw* coupling_ac_el_jacobian2Dw,
                                                     int* ibool,
                                                     int* ispec_is_inner,
                                                     int phase_is_inner) {
@@ -130,8 +129,7 @@
   int SIMULATION_TYPE           = *SIMULATION_TYPEf;
 
   // way 1: exact blocksize to match NGLLSQUARE
-  int blocksize = 25;
-
+  int blocksize = NGLL2;
   int num_blocks_x = num_coupling_ac_el_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
@@ -183,13 +181,13 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void compute_coupling_elastic_ac_kernel(float* potential_dot_dot_acoustic,
-                                                    float* accel,
+__global__ void compute_coupling_elastic_ac_kernel(realw* potential_dot_dot_acoustic,
+                                                    realw* accel,
                                                     int num_coupling_ac_el_faces,
                                                     int* coupling_ac_el_ispec,
                                                     int* coupling_ac_el_ijk,
-                                                    float* coupling_ac_el_normal,
-                                                    float* coupling_ac_el_jacobian2Dw,
+                                                    realw* coupling_ac_el_normal,
+                                                    realw* coupling_ac_el_jacobian2Dw,
                                                     int* ibool,
                                                     int* ispec_is_inner,
                                                     int phase_is_inner) {

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
-#include <mpi.h>
 
 #include <sys/time.h>
 #include <sys/resource.h>
@@ -42,8 +41,8 @@
 
 // prepares a device array with with all inter-element edge-nodes -- this
 // is followed by a memcpy and MPI operations
-__global__ void prepare_boundary_potential_on_device(float* d_potential_dot_dot_acoustic,
-                                                     float* d_send_potential_dot_dot_buffer,
+__global__ void prepare_boundary_potential_on_device(realw* d_potential_dot_dot_acoustic,
+                                                     realw* d_send_potential_dot_dot_buffer,
                                                      int num_interfaces_ext_mesh,
                                                      int max_nibool_interfaces_ext_mesh,
                                                      int* d_nibool_interfaces_ext_mesh,
@@ -70,8 +69,8 @@
               TRANSFER_BOUN_POT_FROM_DEVICE)(
                                               int* size,
                                               long* Mesh_pointer_f,
-                                              float* potential_dot_dot_acoustic,
-                                              float* send_potential_dot_dot_buffer,
+                                              realw* potential_dot_dot_acoustic,
+                                              realw* send_potential_dot_dot_buffer,
                                               int* num_interfaces_ext_mesh,
                                               int* max_nibool_interfaces_ext_mesh,
                                               int* nibool_interfaces_ext_mesh,
@@ -84,8 +83,8 @@
 
   if( *num_interfaces_ext_mesh == 0 ) return;
 
-  int blocksize = 256;
-  int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_ext_mesh))/((double)blocksize)))*blocksize;
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
@@ -99,22 +98,22 @@
   if(*FORWARD_OR_ADJOINT == 1) {
     prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
                                                          mp->d_send_potential_dot_dot_buffer,
-                                                         *num_interfaces_ext_mesh,
-                                                         *max_nibool_interfaces_ext_mesh,
+                                                         mp->num_interfaces_ext_mesh,
+                                                         mp->max_nibool_interfaces_ext_mesh,
                                                          mp->d_nibool_interfaces_ext_mesh,
                                                          mp->d_ibool_interfaces_ext_mesh);
   }
   else if(*FORWARD_OR_ADJOINT == 3) {
     prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
                                                            mp->d_send_potential_dot_dot_buffer,
-                                                           *num_interfaces_ext_mesh,
-                                                           *max_nibool_interfaces_ext_mesh,
+                                                           mp->num_interfaces_ext_mesh,
+                                                           mp->max_nibool_interfaces_ext_mesh,
                                                            mp->d_nibool_interfaces_ext_mesh,
                                                            mp->d_ibool_interfaces_ext_mesh);
   }
 
-  cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_potential_dot_dot_buffer,
-        *max_nibool_interfaces_ext_mesh* *num_interfaces_ext_mesh*sizeof(realw),cudaMemcpyDeviceToHost);
+  print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_potential_dot_dot_buffer,
+      (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),cudaMemcpyDeviceToHost),98000);
 
   // finish timing of kernel+memcpy
   // cudaEventRecord( stop, 0 );
@@ -132,8 +131,8 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 
-__global__ void assemble_boundary_potential_on_device(float* d_potential_dot_dot_acoustic,
-                                                      float* d_send_potential_dot_dot_buffer,
+__global__ void assemble_boundary_potential_on_device(realw* d_potential_dot_dot_acoustic,
+                                                      realw* d_send_potential_dot_dot_buffer,
                                                       int num_interfaces_ext_mesh,
                                                       int max_nibool_interfaces_ext_mesh,
                                                       int* d_nibool_interfaces_ext_mesh,
@@ -182,18 +181,18 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
   //double start_time = get_time();
   // cudaEvent_t start, stop;
-  // float time;
+  // realw time;
   // cudaEventCreate(&start);
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
 
   // copies buffer onto GPU
   cudaMemcpy(mp->d_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,
-             *max_nibool_interfaces_ext_mesh* *num_interfaces_ext_mesh*sizeof(realw), cudaMemcpyHostToDevice);
+             (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw), cudaMemcpyHostToDevice);
 
   // assembles on GPU
-  int blocksize = 256;
-  int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
@@ -208,8 +207,8 @@
     //assemble forward field
     assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
                                                           mp->d_send_potential_dot_dot_buffer,
-                                                          *num_interfaces_ext_mesh,
-                                                          *max_nibool_interfaces_ext_mesh,
+                                                          mp->num_interfaces_ext_mesh,
+                                                          mp->max_nibool_interfaces_ext_mesh,
                                                           mp->d_nibool_interfaces_ext_mesh,
                                                           mp->d_ibool_interfaces_ext_mesh);
   }
@@ -217,8 +216,8 @@
     //assemble reconstructed/backward field
     assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
                                                             mp->d_send_potential_dot_dot_buffer,
-                                                            *num_interfaces_ext_mesh,
-                                                            *max_nibool_interfaces_ext_mesh,
+                                                            mp->num_interfaces_ext_mesh,
+                                                            mp->max_nibool_interfaces_ext_mesh,
                                                             mp->d_nibool_interfaces_ext_mesh,
                                                             mp->d_ibool_interfaces_ext_mesh);
   }
@@ -239,23 +238,6 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-//void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase, int SIMULATION_TYPE);
-
-//__global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,int* d_phase_ispec_inner_acoustic,
-//                                       int num_phase_ispec_acoustic, int d_iphase,
-//                                       float* d_potential_acoustic, float* d_potential_dot_dot_acoustic,
-//                                       float* d_xix, float* d_xiy, float* d_xiz, float* d_etax, float* d_etay, float* d_etaz,
-//                                       float* d_gammax, float* d_gammay, float* d_gammaz,
-//                                       float* hprime_xx, float* hprime_yy, float* hprime_zz,
-//                                       float* hprimewgll_xx, float* hprimewgll_yy, float* hprimewgll_zz,
-//                                       float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
-//                                       float* d_rhostore);
-
-
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
 /* KERNEL 2 */
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -267,20 +249,20 @@
                                        int num_phase_ispec_acoustic,
                                        int d_iphase,
                                        int use_mesh_coloring_gpu,
-                                       float* d_potential_acoustic, float* d_potential_dot_dot_acoustic,
-                                       float* d_xix, float* d_xiy, float* d_xiz,
-                                       float* d_etax, float* d_etay, float* d_etaz,
-                                       float* d_gammax, float* d_gammay, float* d_gammaz,
-                                       float* hprime_xx, float* hprime_yy, float* hprime_zz,
-                                       float* hprimewgll_xx, float* hprimewgll_yy, float* hprimewgll_zz,
-                                       float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
-                                       float* d_rhostore){
+                                       realw* d_potential_acoustic, realw* d_potential_dot_dot_acoustic,
+                                       realw* d_xix, realw* d_xiy, realw* d_xiz,
+                                       realw* d_etax, realw* d_etay, realw* d_etaz,
+                                       realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                                       realw* hprime_xx, realw* hprime_yy, realw* hprime_zz,
+                                       realw* hprimewgll_xx, realw* hprimewgll_yy, realw* hprimewgll_zz,
+                                       realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
+                                       realw* d_rhostore){
 
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
   int tx = threadIdx.x;
 
-  const int NGLL3 = 125;
-  const int NGLL3_ALIGN = 128;
+  //const int NGLL3 = NGLL3;
+  const int NGLL3_ALIGN = NGLL3_PADDED;
 
   int K = (tx/NGLL2);
   int J = ((tx-K*NGLL2)/NGLLX);
@@ -296,7 +278,7 @@
 
 #ifndef MANUALLY_UNROLLED_LOOPS
     int l;
-    float hp1,hp2,hp3;
+    realw hp1,hp2,hp3;
 #endif
 
     __shared__ reald s_dummy_loc[NGLL3];
@@ -326,7 +308,7 @@
 #endif
 
       // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
-      iglob = d_ibool[working_element*125 + tx]-1;
+      iglob = d_ibool[working_element*NGLL3 + tx]-1;
 
 #ifdef USE_TEXTURES
       s_dummy_loc[tx] = tex1Dfetch(tex_potential_acoustic, iglob);
@@ -516,16 +498,16 @@
 void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
                        int SIMULATION_TYPE,
                        int* d_ibool,
-                       float* d_xix,
-                       float* d_xiy,
-                       float* d_xiz,
-                       float* d_etax,
-                       float* d_etay,
-                       float* d_etaz,
-                       float* d_gammax,
-                       float* d_gammay,
-                       float* d_gammaz,
-                       float* d_rhostore)
+                       realw* d_xix,
+                       realw* d_xiy,
+                       realw* d_xiz,
+                       realw* d_etax,
+                       realw* d_etay,
+                       realw* d_etaz,
+                       realw* d_gammax,
+                       realw* d_gammay,
+                       realw* d_gammaz,
+                       realw* d_rhostore)
 {
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -543,13 +525,13 @@
     num_blocks_y = num_blocks_y*2;
   }
 
-  int threads_2 = 128;//BLOCK_SIZE_K2;
+  int threads_2 = NGLL3_PADDED;//BLOCK_SIZE_K2;
   dim3 grid_2(num_blocks_x,num_blocks_y);
 
 
   // Cuda timing
   // cudaEvent_t start, stop;
-  // float time;
+  // realw time;
   // cudaEventCreate(&start);
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
@@ -631,10 +613,6 @@
 
   if( num_elements == 0 ) return;
 
-  //int myrank;
-  /* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
-  /* if(myrank==0) { */
-
   // mesh coloring
   if( mp->use_mesh_coloring_gpu ){
 
@@ -654,7 +632,7 @@
 
       // array offsets (acoustic elements start after elastic ones)
       color_offset = mp->nspec_elastic * NGLL3_PADDED;
-      color_offset_nonpadded = mp->nspec_elastic * NGLL3_NONPADDED;
+      color_offset_nonpadded = mp->nspec_elastic * NGLL3;
     }else{
       // inner element colors (start after outer elements)
       nb_colors = mp->num_colors_outer_acoustic + mp->num_colors_inner_acoustic;
@@ -662,7 +640,7 @@
 
       // array offsets (inner elements start after outer ones)
       color_offset = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_PADDED;
-      color_offset_nonpadded = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_NONPADDED;
+      color_offset_nonpadded = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3;
     }
 
     // loops over colors
@@ -670,12 +648,6 @@
 
       nb_blocks_to_compute = mp->h_num_elem_colors_acoustic[icolor];
 
-      // checks
-      //if( nb_blocks_to_compute <= 0 ){
-      //  printf("error number of acoustic color blocks: %d -- color = %d \n",nb_blocks_to_compute,icolor);
-      //  exit(EXIT_FAILURE);
-      //}
-
       Kernel_2_acoustic(nb_blocks_to_compute,mp,*iphase,
                          *SIMULATION_TYPE,
                          mp->d_ibool + color_offset_nonpadded,
@@ -693,13 +665,12 @@
       // for padded and aligned arrays
       color_offset += nb_blocks_to_compute * NGLL3_PADDED;
       // for no-aligned arrays
-      color_offset_nonpadded += nb_blocks_to_compute * NGLL3_NONPADDED;
+      color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
     }
 
   }else{
 
     // no mesh coloring: uses atomic updates
-
     Kernel_2_acoustic(num_elements, mp, *iphase,
                       *SIMULATION_TYPE,
                       mp->d_ibool,
@@ -715,14 +686,6 @@
                       mp->d_rhostore);
 
   }
-
-  //cudaThreadSynchronize();
-
-  //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  /* MPI_Barrier(MPI_COMM_WORLD); */
-  //double end_time = get_time();
-  //printf("Elapsed time: %e\n",end_time-start_time);
-  //#endif
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -732,9 +695,9 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 
-__global__ void kernel_3_a_acoustic_cuda_device(float* potential_dot_dot_acoustic,
+__global__ void kernel_3_a_acoustic_cuda_device(realw* potential_dot_dot_acoustic,
                                                 int size,
-                                                float* rmass_acoustic) {
+                                                realw* rmass_acoustic) {
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
   /* because of block and grid sizing problems, there is a small */
@@ -747,11 +710,11 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void kernel_3_b_acoustic_cuda_device(float* potential_dot_acoustic,
-                                                float* potential_dot_dot_acoustic,
+__global__ void kernel_3_b_acoustic_cuda_device(realw* potential_dot_acoustic,
+                                                realw* potential_dot_dot_acoustic,
                                                 int size,
                                                 realw deltatover2,
-                                                float* rmass_acoustic) {
+                                                realw* rmass_acoustic) {
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
   /* because of block and grid sizing problems, there is a small */
@@ -775,7 +738,7 @@
    Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
    int size = *size_F;
 
-   int blocksize=128;
+   int blocksize = BLOCKSIZE_KERNEL3;
    int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
    int num_blocks_x = size_padded/blocksize;
    int num_blocks_y = 1;
@@ -808,9 +771,9 @@
 void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
                                                              long* Mesh_pointer,
                                                              int* size_F,
-                                                             float* deltatover2_F,
+                                                             realw* deltatover2_F,
                                                              int* SIMULATION_TYPE,
-                                                             float* b_deltatover2_F) {
+                                                             realw* b_deltatover2_F) {
 
 TRACE("kernel_3_b_acoustic_cuda");
 
@@ -819,7 +782,7 @@
   realw deltatover2 = *deltatover2_F;
   realw b_deltatover2 = *b_deltatover2_F;
 
-  int blocksize=128;
+  int blocksize = BLOCKSIZE_KERNEL3;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
@@ -858,9 +821,9 @@
 
 
 __global__ void enforce_free_surface_cuda_kernel(
-                                       float* potential_acoustic,
-                                       float* potential_dot_acoustic,
-                                       float* potential_dot_dot_acoustic,
+                                       realw* potential_acoustic,
+                                       realw* potential_dot_acoustic,
+                                       realw* potential_dot_dot_acoustic,
                                        int num_free_surface_faces,
                                        int* free_surface_ispec,
                                        int* free_surface_ijk,
@@ -874,20 +837,12 @@
 
     int ispec = free_surface_ispec[iface]-1;
 
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//  if( iface > 648-1 ){printf("device iface: %i \n",iface);}
-//#endif
-
     // checks if element is in acoustic domain
     if( ispec_is_acoustic[ispec] ){
 
       // gets global point index
       int igll = threadIdx.x + threadIdx.y*blockDim.x;
 
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//      if( igll > 25-1 ){printf("device igll: %i \n",igll);}
-//#endif
-
       int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
       int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
       int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
@@ -898,10 +853,6 @@
       potential_acoustic[iglob] = 0;
       potential_dot_acoustic[iglob] = 0;
       potential_dot_dot_acoustic[iglob] = 0;
-
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//    if( ispec == 160 && igll < 25 ){printf("device: %i %i %i %i %i \n",igll,i,j,k,iglob);}
-//#endif
     }
   }
 }
@@ -931,26 +882,8 @@
       num_blocks_y = num_blocks_y*2;
     }
     dim3 grid(num_blocks_x,num_blocks_y,1);
-    dim3 threads(25,1,1);
+    dim3 threads(NGLL2,1,1);
 
-    //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-    // debugging
-    //int* d_debug;
-    //printf("acoustic_enforce_free_surf_cuda ...\n");
-    //print_CUDA_error_if_any(cudaMalloc((void**)&d_debug,128*sizeof(int)),999);
-
-    //int* h_debug;
-    //h_debug = (int*) calloc(128,sizeof(int));
-    //for(int i=0;i<128;i++){h_debug[i] = 0;}
-    //cudaMemcpy(d_debug,h_debug,128*sizeof(int),cudaMemcpyHostToDevice);
-
-    //printf("acoustic_enforce_free_surf_cuda start...\n");
-    //doesnt' work...: printf("free_surface_ispec: %i %i %i \n",mp->d_free_surface_ispec[0],mp->d_free_surface_ispec[1],mp->d_free_surface_ispec[2]);
-    //printf("free_surface_ispec: %i \n",mp->num_free_surface_faces);
-
-    //cudaThreadSynchronize();
-    //#endif
-
     // sets potentials to zero at free surface
     enforce_free_surface_cuda_kernel<<<grid,threads>>>(mp->d_potential_acoustic,
                                                        mp->d_potential_dot_acoustic,
@@ -972,15 +905,6 @@
                                                          mp->d_ispec_is_acoustic);
 
     }
-    //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-    //cudaThreadSynchronize();
-    //cudaMemcpy(h_debug,d_debug,128*sizeof(int),cudaMemcpyDeviceToHost);
-    //for(int i=0;i<25;i++) {printf("ispec d_debug = %d \n",h_debug[i]);}
-    //cudaFree(d_debug);
-    //free(h_debug);
-    //exit(1);
-    //#endif
-
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
-#include <mpi.h>
 
 #include <sys/time.h>
 #include <sys/resource.h>
@@ -40,51 +39,29 @@
 
 
 //  cuda constant arrays
-__constant__ float d_hprime_xx[NGLL2];
-__constant__ float d_hprime_yy[NGLL2];
-__constant__ float d_hprime_zz[NGLL2];
-__constant__ float d_hprimewgll_xx[NGLL2];
-__constant__ float d_hprimewgll_yy[NGLL2];
-__constant__ float d_hprimewgll_zz[NGLL2];
-__constant__ float d_wgllwgll_xy[NGLL2];
-__constant__ float d_wgllwgll_xz[NGLL2];
-__constant__ float d_wgllwgll_yz[NGLL2];
+__constant__ realw d_hprime_xx[NGLL2];
+__constant__ realw d_hprime_yy[NGLL2];
+__constant__ realw d_hprime_zz[NGLL2];
+__constant__ realw d_hprimewgll_xx[NGLL2];
+__constant__ realw d_hprimewgll_yy[NGLL2];
+__constant__ realw d_hprimewgll_zz[NGLL2];
+__constant__ realw d_wgllwgll_xy[NGLL2];
+__constant__ realw d_wgllwgll_xz[NGLL2];
+__constant__ realw d_wgllwgll_yz[NGLL2];
 
 
-//void Kernel_2(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
-//        int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION);
-//__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
-//                            int num_phase_ispec_elastic, int d_iphase, int* d_ibool);
-//__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
-//                              int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,
-//                              float* d_displ, float* d_accel,
-//                              float* d_xix, float* d_xiy, float* d_xiz,
-//                              float* d_etax, float* d_etay, float* d_etaz,
-//                              float* d_gammax, float* d_gammay, float* d_gammaz,
-//                              float* d_kappav, float* d_muv,
-//                              //float* d_debug,
-//                              int COMPUTE_AND_STORE_STRAIN,
-//                              float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
-//                              float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,
-//                              int SIMULATION_TYPE,
-//                              int ATTENUATION,int NSPEC,
-//                              float* one_minus_sum_beta,float* factor_common,
-//                              float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
-//                              float* alphaval,float* betaval,float* gammaval);
 
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // prepares a device array with with all inter-element edge-nodes -- this
 // is followed by a memcpy and MPI operations
-__global__ void prepare_boundary_accel_on_device(float* d_accel, float* d_send_accel_buffer,
-             int num_interfaces_ext_mesh, int max_nibool_interfaces_ext_mesh,
-             int* d_nibool_interfaces_ext_mesh,
-             int* d_ibool_interfaces_ext_mesh) {
+__global__ void prepare_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
+                                                 int num_interfaces_ext_mesh, 
+                                                 int max_nibool_interfaces_ext_mesh,
+                                                 int* d_nibool_interfaces_ext_mesh,
+                                                 int* d_ibool_interfaces_ext_mesh) {
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-  //int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  //int tx = threadIdx.x;
   int iinterface=0;
 
   for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
@@ -106,8 +83,8 @@
 // (elements on boundary)
 extern "C"
 void FC_FUNC_(transfer_boun_accel_from_device,
-              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, float* accel,
-                                                    float* send_accel_buffer,
+              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
+                                                    realw* send_accel_buffer,
                                                     int* num_interfaces_ext_mesh,
                                                     int* max_nibool_interfaces_ext_mesh,
                                                     int* nibool_interfaces_ext_mesh,
@@ -119,8 +96,8 @@
 
   if( *num_interfaces_ext_mesh == 0 ) return;
 
-  int blocksize = 256;
-  int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
@@ -133,28 +110,28 @@
 
   //timing for memory xfer
   // cudaEvent_t start, stop;
-  // float time;
+  // realw time;
   // cudaEventCreate(&start);
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
   if(*FORWARD_OR_ADJOINT == 1) {
     prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel,mp->d_send_accel_buffer,
-                 *num_interfaces_ext_mesh,
-                 *max_nibool_interfaces_ext_mesh,
-                 mp->d_nibool_interfaces_ext_mesh,
-                 mp->d_ibool_interfaces_ext_mesh);
+                                                       mp->num_interfaces_ext_mesh,
+                                                       mp->max_nibool_interfaces_ext_mesh,
+                                                       mp->d_nibool_interfaces_ext_mesh,
+                                                       mp->d_ibool_interfaces_ext_mesh);
   }
   else if(*FORWARD_OR_ADJOINT == 3) {
     prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel,mp->d_send_accel_buffer,
-                 *num_interfaces_ext_mesh,
-                 *max_nibool_interfaces_ext_mesh,
-                 mp->d_nibool_interfaces_ext_mesh,
-                 mp->d_ibool_interfaces_ext_mesh);
+                                                       mp->num_interfaces_ext_mesh,
+                                                       mp->max_nibool_interfaces_ext_mesh,
+                                                       mp->d_nibool_interfaces_ext_mesh,
+                                                       mp->d_ibool_interfaces_ext_mesh);
   }
 
 
   cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer,
-             3* *max_nibool_interfaces_ext_mesh* *num_interfaces_ext_mesh*sizeof(realw),cudaMemcpyDeviceToHost);
+             3*mp->max_nibool_interfaces_ext_mesh*mp->num_interfaces_ext_mesh*sizeof(realw),cudaMemcpyDeviceToHost);
 
   // finish timing of kernel+memcpy
   // cudaEventRecord( stop, 0 );
@@ -170,7 +147,7 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void assemble_boundary_accel_on_device(float* d_accel, float* d_send_accel_buffer,
+__global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
                                                   int num_interfaces_ext_mesh,
                                                   int max_nibool_interfaces_ext_mesh,
                                                   int* d_nibool_interfaces_ext_mesh,
@@ -227,10 +204,10 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   cudaMemcpy(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
-             3*(*max_nibool_interfaces_ext_mesh)*(*num_interfaces_ext_mesh)*sizeof(realw), cudaMemcpyHostToDevice);
+             3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),cudaMemcpyHostToDevice);
 
-  int blocksize = 256;
-  int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
@@ -242,23 +219,23 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
   // cudaEvent_t start, stop;
-  // float time;
+  // realw time;
   // cudaEventCreate(&start);
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
   if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
     assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel, mp->d_send_accel_buffer,
-              *num_interfaces_ext_mesh,
-              *max_nibool_interfaces_ext_mesh,
-              mp->d_nibool_interfaces_ext_mesh,
-              mp->d_ibool_interfaces_ext_mesh);
+                                                        mp->num_interfaces_ext_mesh,
+                                                        mp->max_nibool_interfaces_ext_mesh,
+                                                        mp->d_nibool_interfaces_ext_mesh,
+                                                        mp->d_ibool_interfaces_ext_mesh);
   }
   else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
     assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel, mp->d_send_accel_buffer,
-              *num_interfaces_ext_mesh,
-              *max_nibool_interfaces_ext_mesh,
-              mp->d_nibool_interfaces_ext_mesh,
-              mp->d_ibool_interfaces_ext_mesh);
+                                                        mp->num_interfaces_ext_mesh,
+                                                        mp->max_nibool_interfaces_ext_mesh,
+                                                        mp->d_nibool_interfaces_ext_mesh,
+                                                        mp->d_ibool_interfaces_ext_mesh);
   }
 
   // cudaEventRecord( stop, 0 );
@@ -283,7 +260,7 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-//__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
+//__global__ void Kernel_test(realw* d_debug_output,int* d_phase_ispec_inner_elastic,
 //                            int num_phase_ispec_elastic, int d_iphase, int* d_ibool) {
 //  int bx = blockIdx.x;
 //  int tx = threadIdx.x;
@@ -310,11 +287,11 @@
 // updates stress
 
 __device__ void compute_element_att_stress(int tx,int working_element,int NSPEC,
-                                           float* R_xx,
-                                           float* R_yy,
-                                           float* R_xy,
-                                           float* R_xz,
-                                           float* R_yz,
+                                           realw* R_xx,
+                                           realw* R_yy,
+                                           realw* R_xy,
+                                           realw* R_xz,
+                                           realw* R_yz,
                                            reald* sigma_xx,
                                            reald* sigma_yy,
                                            reald* sigma_zz,
@@ -327,7 +304,7 @@
 
   for(i_sls = 0; i_sls < N_SLS; i_sls++){
     // index
-    offset_sls = tx + 125*(working_element + NSPEC*i_sls);
+    offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
 
     R_xx_val = R_xx[offset_sls]; //(i,j,k,ispec,i_sls)
     R_yy_val = R_yy[offset_sls];
@@ -347,12 +324,12 @@
 // updates R_memory
 
 __device__ void compute_element_att_memory(int tx,int working_element,int NSPEC,
-                                          float* d_muv,
-                                          float* factor_common,
-                                          float* alphaval,float* betaval,float* gammaval,
-                                          float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
-                                          float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
-                                          float* epsilondev_xz,float* epsilondev_yz,
+                                          realw* d_muv,
+                                          realw* factor_common,
+                                          realw* alphaval,realw* betaval,realw* gammaval,
+                                          realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                          realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                          realw* epsilondev_xz,realw* epsilondev_yz,
                                           reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc,
                                           reald epsilondev_xz_loc,reald epsilondev_yz_loc
                                           ){
@@ -365,8 +342,8 @@
   reald factor_loc,Sn,Snp1;
 
   // indices
-  offset_align = tx + 128 * working_element;
-  ijk_ispec = tx + 125 * working_element;
+  offset_align = tx + NGLL3_PADDED * working_element;
+  ijk_ispec = tx + NGLL3 * working_element;
 
   mul = d_muv[offset_align];
 
@@ -374,8 +351,8 @@
   for(i_sls = 0; i_sls < N_SLS; i_sls++){
 
     // indices
-    offset_common = i_sls + N_SLS*(tx + 125*working_element); // (i_sls,i,j,k,ispec)
-    offset_sls = tx + 125*(working_element + NSPEC*i_sls);   // (i,j,k,ispec,i_sls)
+    offset_common = i_sls + N_SLS*(tx + NGLL3*working_element); // (i_sls,i,j,k,ispec)
+    offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);   // (i,j,k,ispec,i_sls)
 
     factor_loc = mul * factor_common[offset_common]; //mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
 
@@ -423,7 +400,7 @@
 
 // double precision temporary variables leads to 10% performance
 // decrease in Kernel_2_impl (not very much..)
-//typedef float reald;
+//typedef realw reald;
 
 // doesn't seem to change the performance.
 // #define MANUALLY_UNROLLED_LOOPS
@@ -435,23 +412,43 @@
                               int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
                               int d_iphase,
                               int use_mesh_coloring_gpu,
-                              float* d_displ, float* d_accel,
-                              float* d_xix, float* d_xiy, float* d_xiz,
-                              float* d_etax, float* d_etay, float* d_etaz,
-                              float* d_gammax, float* d_gammay, float* d_gammaz,
-                              float* d_kappav, float* d_muv,
-                              //float* d_debug,
+                              realw* d_displ, realw* d_accel,
+                              realw* d_xix, realw* d_xiy, realw* d_xiz,
+                              realw* d_etax, realw* d_etay, realw* d_etaz,
+                              realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                              realw* d_kappav, realw* d_muv,
                               int COMPUTE_AND_STORE_STRAIN,
-                              float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
-                              float* epsilondev_xz,float* epsilondev_yz,
-                              float* epsilon_trace_over_3,
+                              realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                              realw* epsilondev_xz,realw* epsilondev_yz,
+                              realw* epsilon_trace_over_3,
                               int SIMULATION_TYPE,
                               int ATTENUATION,
                               int NSPEC,
-                              float* one_minus_sum_beta,float* factor_common,
-                              float* R_xx, float* R_yy, float* R_xy, float* R_xz, float* R_yz,
-                              float* alphaval,float* betaval,float* gammaval
-                              ){
+                              realw* one_minus_sum_beta,realw* factor_common,
+                              realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
+                              realw* alphaval,realw* betaval,realw* gammaval,
+                              int ANISOTROPY,
+                              realw* d_c11store,
+                              realw* d_c12store,
+                              realw* d_c13store,
+                              realw* d_c14store,
+                              realw* d_c15store,
+                              realw* d_c16store,
+                              realw* d_c22store,
+                              realw* d_c23store,
+                              realw* d_c24store,
+                              realw* d_c25store,
+                              realw* d_c26store,
+                              realw* d_c33store,
+                              realw* d_c34store,
+                              realw* d_c35store,
+                              realw* d_c36store,
+                              realw* d_c44store,
+                              realw* d_c45store,
+                              realw* d_c46store,
+                              realw* d_c55store,
+                              realw* d_c56store,
+                              realw* d_c66store){
 
   /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
@@ -460,8 +457,8 @@
 
   //const int NGLLX = 5;
   // const int NGLL2 = 25;
-  const int NGLL3 = 125;
-  const int NGLL3_ALIGN = 128;
+  // NGLL3 is no longer redefined locally; the NGLL3 defined outside this kernel is used
+  const int NGLL3_ALIGN = NGLL3_PADDED;
 
   int K = (tx/NGLL2);
   int J = ((tx-K*NGLL2)/NGLLX);
@@ -479,10 +476,11 @@
   reald fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal;
   reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
   reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+  reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
 
 #ifndef MANUALLY_UNROLLED_LOOPS
     int l;
-    float hp1,hp2,hp3;
+    realw hp1,hp2,hp3;
 #endif
 
     __shared__ reald s_dummyx_loc[NGLL3];
@@ -520,7 +518,7 @@
 #endif
 
       // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
-      iglob = d_ibool[working_element*125 + tx]-1;
+      iglob = d_ibool[working_element*NGLL3 + tx]-1;
 
 #ifdef USE_TEXTURES
       s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
@@ -576,13 +574,6 @@
           tempy3l += s_dummyy_loc[offset]*hp3;
           tempz3l += s_dummyz_loc[offset]*hp3;
 
-    // if(working_element == 169 && tx == 0) {
-    //   atomicAdd(&d_debug[0],1.0);
-    //   d_debug[1+3*l] = tempz3l;
-    //   d_debug[2+3*l] = s_dummyz_loc[offset];
-    //   d_debug[3+3*l] = hp3;
-    // }
-
       }
 #else
 
@@ -676,7 +667,7 @@
 
       // computes deviatoric strain attenuation and/or for kernel calculations
       if(COMPUTE_AND_STORE_STRAIN) {
-        float templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+        realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
         /*
         epsilondev_xx[offset] = duxdxl - templ;
         epsilondev_yy[offset] = duydyl - templ;
@@ -692,7 +683,7 @@
         epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
 
         if(SIMULATION_TYPE == 3) {
-          epsilon_trace_over_3[tx + working_element*125] = templ;
+          epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
         }
       }
 
@@ -703,22 +694,64 @@
       // attenuation
       if(ATTENUATION){
         // use unrelaxed parameters if attenuation
-        mul  = mul * one_minus_sum_beta[tx+working_element*125]; // (i,j,k,ispec)
+        mul  = mul * one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
       }
 
-      // isotropic case
-      lambdalplus2mul = kappal + 1.33333333333333333333f * mul;  // 4./3. = 1.3333333
-      lambdal = lambdalplus2mul - 2.0f * mul;
+      // full anisotropic case, stress calculations
+      if(ANISOTROPY){
 
-      // compute the six components of the stress tensor sigma
-      sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
-      sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
-      sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+        c11 = d_c11store[offset];
+        c12 = d_c12store[offset];
+        c13 = d_c13store[offset];
+        c14 = d_c14store[offset];
+        c15 = d_c15store[offset];
+        c16 = d_c16store[offset];
+        c22 = d_c22store[offset];
+        c23 = d_c23store[offset];
+        c24 = d_c24store[offset];
+        c25 = d_c25store[offset];
+        c26 = d_c26store[offset];
+        c33 = d_c33store[offset];
+        c34 = d_c34store[offset];
+        c35 = d_c35store[offset];
+        c36 = d_c36store[offset];
+        c44 = d_c44store[offset];
+        c45 = d_c45store[offset];
+        c46 = d_c46store[offset];
+        c55 = d_c55store[offset];
+        c56 = d_c56store[offset];
+        c66 = d_c66store[offset];
 
-      sigma_xy = mul*duxdyl_plus_duydxl;
-      sigma_xz = mul*duzdxl_plus_duxdzl;
-      sigma_yz = mul*duzdyl_plus_duydzl;
+        sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl +
+                   c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl;
+        sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl +
+                   c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl;
+        sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl +
+                   c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl;
+        sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl +
+                   c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl;
+        sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl +
+                   c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl;
+        sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl +
+                   c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
 
+      }else{
+
+        // isotropic case
+
+        lambdalplus2mul = kappal + 1.33333333333333333333f * mul;  // 4./3. = 1.3333333
+        lambdal = lambdalplus2mul - 2.0f * mul;
+
+        // compute the six components of the stress tensor sigma
+        sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+        sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+        sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+
+        sigma_xy = mul*duxdyl_plus_duydxl;
+        sigma_xz = mul*duzdxl_plus_duxdzl;
+        sigma_yz = mul*duzdyl_plus_duydzl;
+      }
+
       if(ATTENUATION){
         // subtract memory variables if attenuation
         compute_element_att_stress(tx,working_element,NSPEC,
@@ -784,14 +817,6 @@
         tempy3l += s_tempy3[offset]*fac3;
         tempz3l += s_tempz3[offset]*fac3;
 
-        //if(working_element == 169)
-        //  if(l==0)
-        //    if(I+J+K == 0) {
-              // atomicAdd(&d_debug[0],1.0);
-              // d_debug[0] = fac3;
-              // d_debug[1] = offset;
-              // d_debug[2] = s_tempz3[offset];
-        //    }
       }
 #else
 
@@ -876,24 +901,10 @@
         d_accel[iglob*3] -= (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
         d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
         d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
-        
 
+
       }else{
 
-        //if(iglob*3+2 == 41153) {
-          // int ot = d_debug[5];
-          // d_debug[0+1+ot] = d_accel[iglob*3+2];
-          // // d_debug[1+1+ot] = fac1*tempz1l;
-          // // d_debug[2+1+ot] = fac2*tempz2l;
-          // // d_debug[3+1+ot] = fac3*tempz3l;
-          // d_debug[1+1+ot] = fac1;
-          // d_debug[2+1+ot] = fac2;
-          // d_debug[3+1+ot] = fac3;
-          // d_debug[4+1+ot] = d_accel[iglob*3+2]-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
-          // atomicAdd(&d_debug[0],1.0);
-          // d_debug[6+ot] = d_displ[iglob*3+2];
-        //}
-
         atomicAdd(&d_accel[iglob*3],-(fac1*tempx1l + fac2*tempx2l + fac3*tempx3l));
         atomicAdd(&d_accel[iglob*3+1],-(fac1*tempy1l + fac2*tempy2l + fac3*tempy3l));
         atomicAdd(&d_accel[iglob*3+2],-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l));
@@ -914,7 +925,7 @@
 
       // save deviatoric strain for Runge-Kutta scheme
       if( COMPUTE_AND_STORE_STRAIN ){
-        int ijk_ispec = tx + working_element*125;
+        int ijk_ispec = tx + working_element*NGLL3;
 
         // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
         epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
@@ -936,43 +947,65 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,
-              int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION,
+              int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,
+              int ATTENUATION,int ANISOTROPY,
               int* d_ibool,
-              float* d_xix,
-              float* d_xiy,
-              float* d_xiz,
-              float* d_etax,
-              float* d_etay,
-              float* d_etaz,
-              float* d_gammax,
-              float* d_gammay,
-              float* d_gammaz,
-              float* d_kappav,
-              float* d_muv,
-              float* d_epsilondev_xx,
-              float* d_epsilondev_yy,
-              float* d_epsilondev_xy,
-              float* d_epsilondev_xz,
-              float* d_epsilondev_yz,
-              float* d_epsilon_trace_over_3,
-              float* d_one_minus_sum_beta,
-              float* d_factor_common,
-              float* d_R_xx,
-              float* d_R_yy,
-              float* d_R_xy,
-              float* d_R_xz,
-              float* d_R_yz,
-              float* d_b_epsilondev_xx,
-              float* d_b_epsilondev_yy,
-              float* d_b_epsilondev_xy,
-              float* d_b_epsilondev_xz,
-              float* d_b_epsilondev_yz,
-              float* d_b_epsilon_trace_over_3,
-              float* d_b_R_xx,
-              float* d_b_R_yy,
-              float* d_b_R_xy,
-              float* d_b_R_xz,
-              float* d_b_R_yz){
+              realw* d_xix,
+              realw* d_xiy,
+              realw* d_xiz,
+              realw* d_etax,
+              realw* d_etay,
+              realw* d_etaz,
+              realw* d_gammax,
+              realw* d_gammay,
+              realw* d_gammaz,
+              realw* d_kappav,
+              realw* d_muv,
+              realw* d_epsilondev_xx,
+              realw* d_epsilondev_yy,
+              realw* d_epsilondev_xy,
+              realw* d_epsilondev_xz,
+              realw* d_epsilondev_yz,
+              realw* d_epsilon_trace_over_3,
+              realw* d_one_minus_sum_beta,
+              realw* d_factor_common,
+              realw* d_R_xx,
+              realw* d_R_yy,
+              realw* d_R_xy,
+              realw* d_R_xz,
+              realw* d_R_yz,
+              realw* d_b_epsilondev_xx,
+              realw* d_b_epsilondev_yy,
+              realw* d_b_epsilondev_xy,
+              realw* d_b_epsilondev_xz,
+              realw* d_b_epsilondev_yz,
+              realw* d_b_epsilon_trace_over_3,
+              realw* d_b_R_xx,
+              realw* d_b_R_yy,
+              realw* d_b_R_xy,
+              realw* d_b_R_xz,
+              realw* d_b_R_yz,
+              realw* d_c11store,
+              realw* d_c12store,
+              realw* d_c13store,
+              realw* d_c14store,
+              realw* d_c15store,
+              realw* d_c16store,
+              realw* d_c22store,
+              realw* d_c23store,
+              realw* d_c24store,
+              realw* d_c25store,
+              realw* d_c26store,
+              realw* d_c33store,
+              realw* d_c34store,
+              realw* d_c35store,
+              realw* d_c36store,
+              realw* d_c44store,
+              realw* d_c45store,
+              realw* d_c46store,
+              realw* d_c55store,
+              realw* d_c56store,
+              realw* d_c66store){
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("before kernel Kernel 2");
@@ -989,29 +1022,17 @@
     num_blocks_y = num_blocks_y*2;
   }
 
-  //int threads_2 = 128;//BLOCK_SIZE_K2;
-  //dim3 grid_2(num_blocks_x,num_blocks_y);
-
-  int blocksize = 128;
+  int blocksize = NGLL3_PADDED;
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  // debugging
-  //printf("Starting with grid %dx%d for %d blocks\n",num_blocks_x,num_blocks_y,nb_blocks_to_compute);
-  //  float* d_debug;
-  //    float* h_debug;
-  //    h_debug = (float*)calloc(128,sizeof(float));
-  //    cudaMalloc((void**)&d_debug,128*sizeof(float));
-  //    cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-
   // Cuda timing
   // cudaEvent_t start, stop;
-  // float time;
+  // float time;  // must stay float: cudaEventElapsedTime() takes a float*, independent of realw
   // cudaEventCreate(&start);
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
 
-  //Kernel_2_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,mp->NGLOB_AB, mp->d_ibool,
   Kernel_2_impl<<<grid,threads>>>(nb_blocks_to_compute,
                                   mp->NGLOB_AB,
                                   d_ibool,
@@ -1024,7 +1045,6 @@
                                   d_etax, d_etay, d_etaz,
                                   d_gammax, d_gammay, d_gammaz,
                                   d_kappav, d_muv,
-                                  //d_debug,
                                   COMPUTE_AND_STORE_STRAIN,
                                   d_epsilondev_xx,
                                   d_epsilondev_yy,
@@ -1037,26 +1057,33 @@
                                   d_one_minus_sum_beta,
                                   d_factor_common,
                                   d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
-                                  mp->d_alphaval,mp->d_betaval,mp->d_gammaval
+                                  mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+                                  ANISOTROPY,
+                                  d_c11store,
+                                  d_c12store,
+                                  d_c13store,
+                                  d_c14store,
+                                  d_c15store,
+                                  d_c16store,
+                                  d_c22store,
+                                  d_c23store,
+                                  d_c24store,
+                                  d_c25store,
+                                  d_c26store,
+                                  d_c33store,
+                                  d_c34store,
+                                  d_c35store,
+                                  d_c36store,
+                                  d_c44store,
+                                  d_c45store,
+                                  d_c46store,
+                                  d_c55store,
+                                  d_c56store,
+                                  d_c66store
                                   );
 
 
-  // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
-  // int procid;
-  // MPI_Comm_rank(MPI_COMM_WORLD,&procid);
-  // if(procid==0) {
-  //   for(int i=0;i<17;i++) {
-  //  printf("cudadebug[%d] = %e\n",i,h_debug[i]);
-  //   }
-  // }
-  //    free(h_debug);
-  //    cudaFree(d_debug);
-  // #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //    exit_on_cuda_error("Kernel_2_impl");
-  // #endif
-
   if(SIMULATION_TYPE == 3) {
-    //Kernel_2_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,mp->NGLOB_AB, mp->d_ibool,
     Kernel_2_impl<<< grid,threads>>>(nb_blocks_to_compute,
                                      mp->NGLOB_AB,
                                      d_ibool,
@@ -1069,7 +1096,6 @@
                                      d_etax, d_etay, d_etaz,
                                      d_gammax, d_gammay, d_gammaz,
                                      d_kappav, d_muv,
-                                     //d_debug,
                                      COMPUTE_AND_STORE_STRAIN,
                                      d_b_epsilondev_xx,
                                      d_b_epsilondev_yy,
@@ -1082,7 +1108,29 @@
                                      d_one_minus_sum_beta,
                                      d_factor_common,
                                      d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
-                                     mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval
+                                     mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
+                                     ANISOTROPY,
+                                     d_c11store,
+                                     d_c12store,
+                                     d_c13store,
+                                     d_c14store,
+                                     d_c15store,
+                                     d_c16store,
+                                     d_c22store,
+                                     d_c23store,
+                                     d_c24store,
+                                     d_c25store,
+                                     d_c26store,
+                                     d_c33store,
+                                     d_c34store,
+                                     d_c35store,
+                                     d_c36store,
+                                     d_c44store,
+                                     d_c45store,
+                                     d_c46store,
+                                     d_c55store,
+                                     d_c56store,
+                                     d_c66store
                                      );
   }
 
@@ -1093,11 +1141,6 @@
   // cudaEventDestroy( stop );
   // printf("Kernel2 Execution Time: %f ms\n",time);
 
-  // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
-  // for(int i=0;i<10;i++) {
-  // printf("debug[%d]=%e\n",i,h_debug[i]);
-  // }
-
   /* cudaThreadSynchronize(); */
   /* LOG("Kernel 2 finished"); */
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1116,7 +1159,8 @@
                                            int* nspec_inner_elastic,
                                            int* SIMULATION_TYPE,
                                            int* COMPUTE_AND_STORE_STRAIN,
-                                           int* ATTENUATION) {
+                                           int* ATTENUATION,
+                                           int* ANISOTROPY) {
 
   TRACE("compute_forces_elastic_cuda");
   // EPIK_TRACER("compute_forces_elastic_cuda");
@@ -1135,10 +1179,6 @@
   // checks if anything to do
   if( num_elements == 0 ) return;
 
-  //int myrank;
-  /* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
-  /* if(myrank==0) { */
-
   // mesh coloring
   if( mp->use_mesh_coloring_gpu ){
 
@@ -1167,8 +1207,8 @@
 
       // array offsets
       color_offset = (*nspec_outer_elastic) * NGLL3_PADDED;
-      color_offset_nonpadded = (*nspec_outer_elastic) * NGLL3_NONPADDED;
-      color_offset_nonpadded_att2 = (*nspec_outer_elastic) * NGLL3_NONPADDED * N_SLS;
+      color_offset_nonpadded = (*nspec_outer_elastic) * NGLL3;
+      color_offset_nonpadded_att2 = (*nspec_outer_elastic) * NGLL3 * N_SLS;
     }
 
     // loops over colors
@@ -1183,7 +1223,8 @@
       //}
 
       Kernel_2(nb_blocks_to_compute,mp,*iphase,
-               *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION,
+               *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,
+               *ATTENUATION,*ANISOTROPY,
                mp->d_ibool + color_offset_nonpadded,
                mp->d_xix + color_offset,
                mp->d_xiy + color_offset,
@@ -1219,14 +1260,35 @@
                mp->d_b_R_yy + color_offset_nonpadded,
                mp->d_b_R_xy + color_offset_nonpadded,
                mp->d_b_R_xz + color_offset_nonpadded,
-               mp->d_b_R_yz + color_offset_nonpadded);
+               mp->d_b_R_yz + color_offset_nonpadded,
+               mp->d_c11store + color_offset,
+               mp->d_c12store + color_offset,
+               mp->d_c13store + color_offset,
+               mp->d_c14store + color_offset,
+               mp->d_c15store + color_offset,
+               mp->d_c16store + color_offset,
+               mp->d_c22store + color_offset,
+               mp->d_c23store + color_offset,
+               mp->d_c24store + color_offset,
+               mp->d_c25store + color_offset,
+               mp->d_c26store + color_offset,
+               mp->d_c33store + color_offset,
+               mp->d_c34store + color_offset,
+               mp->d_c35store + color_offset,
+               mp->d_c36store + color_offset,
+               mp->d_c44store + color_offset,
+               mp->d_c45store + color_offset,
+               mp->d_c46store + color_offset,
+               mp->d_c55store + color_offset,
+               mp->d_c56store + color_offset,
+               mp->d_c66store + color_offset);
 
       // for padded and aligned arrays
       color_offset += nb_blocks_to_compute * NGLL3_PADDED;
       // for no-aligned arrays
-      color_offset_nonpadded += nb_blocks_to_compute * NGLL3_NONPADDED;
+      color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
       // for factor_common array
-      color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3_NONPADDED * N_SLS;
+      color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
     }
 
   }else{
@@ -1234,7 +1296,8 @@
     // no mesh coloring: uses atomic updates
 
     Kernel_2(num_elements,mp,*iphase,
-             *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION,
+             *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,
+             *ATTENUATION,*ANISOTROPY,
              mp->d_ibool,
              mp->d_xix,
              mp->d_xiy,
@@ -1270,18 +1333,29 @@
              mp->d_b_R_yy,
              mp->d_b_R_xy,
              mp->d_b_R_xz,
-             mp->d_b_R_yz);
+             mp->d_b_R_yz,
+             mp->d_c11store,
+             mp->d_c12store,
+             mp->d_c13store,
+             mp->d_c14store,
+             mp->d_c15store,
+             mp->d_c16store,
+             mp->d_c22store,
+             mp->d_c23store,
+             mp->d_c24store,
+             mp->d_c25store,
+             mp->d_c26store,
+             mp->d_c33store,
+             mp->d_c34store,
+             mp->d_c35store,
+             mp->d_c36store,
+             mp->d_c44store,
+             mp->d_c45store,
+             mp->d_c46store,
+             mp->d_c55store,
+             mp->d_c56store,
+             mp->d_c66store);
   }
-
-
-
-  //cudaThreadSynchronize();
-
-  //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  /* MPI_Barrier(MPI_COMM_WORLD); */
-  //double end_time = get_time();
-  //printf("Elapsed time: %e\n",end_time-start_time);
-  //#endif
 }
 
 
@@ -1350,9 +1424,9 @@
 void FC_FUNC_(kernel_3_a_cuda,
               KERNEL_3_A_CUDA)(long* Mesh_pointer,
                                int* size_F,
-                               float* deltatover2_F,
+                               realw* deltatover2_F,
                                int* SIMULATION_TYPE_f,
-                               float* b_deltatover2_F,
+                               realw* b_deltatover2_F,
                                int* OCEANS) {
 TRACE("kernel_3_a_cuda");
 
@@ -1362,7 +1436,7 @@
    realw deltatover2 = *deltatover2_F;
    realw b_deltatover2 = *b_deltatover2_F;
 
-   int blocksize=128;
+   int blocksize = BLOCKSIZE_KERNEL3;
    int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
    int num_blocks_x = size_padded/blocksize;
@@ -1404,9 +1478,9 @@
 void FC_FUNC_(kernel_3_b_cuda,
               KERNEL_3_B_CUDA)(long* Mesh_pointer,
                              int* size_F,
-                             float* deltatover2_F,
+                             realw* deltatover2_F,
                              int* SIMULATION_TYPE_f,
-                             float* b_deltatover2_F) {
+                             realw* b_deltatover2_F) {
   TRACE("kernel_3_b_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
@@ -1415,7 +1489,7 @@
   realw deltatover2 = *deltatover2_F;
   realw b_deltatover2 = *b_deltatover2_F;
 
-  int blocksize=128;
+  int blocksize = BLOCKSIZE_KERNEL3;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
   int num_blocks_x = size_padded/blocksize;
@@ -1444,6 +1518,140 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+/* OCEANS load on free surface */
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void elastic_ocean_load_cuda_kernel(realw* accel,
+                                               realw* rmass,
+                                               realw* rmass_ocean_load,
+                                               int num_free_surface_faces,
+                                               int* free_surface_ispec,
+                                               int* free_surface_ijk,
+                                               realw* free_surface_normal,
+                                               int* ibool,
+                                               int* updated_dof_ocean_load) {
+  // gets spectral element face id
+  int igll = threadIdx.x ;  //  threadIdx.y*blockDim.x will be always = 0 for thread block (25,1,1)
+  int iface = blockIdx.x + gridDim.x*blockIdx.y;
+  realw nx,ny,nz;
+  realw force_normal_comp,additional_term;
+  
+  // for all faces on free surface
+  if( iface < num_free_surface_faces ){
+    
+    int ispec = free_surface_ispec[iface]-1;
+    
+    // gets global point index
+    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
+    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+    
+    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+    
+    //if(igll == 0 ) printf("igll %d %d %d %d\n",igll,i,j,k,iglob);
+    
+    // only update this global point once
+    
+    // daniel: TODO - there might be better ways to implement a mutex like below,
+    //            and find a workaround to not use the temporary update array.
+    //            atomicExch: returns the old value, i.e. 0 indicates that we still have to do this point
+    
+    if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){
+      
+      // get normal
+      nx = free_surface_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; //(1,igll,iface)
+      ny = free_surface_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
+      nz = free_surface_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];
+      
+      // make updated component of right-hand side
+      // we divide by rmass() which is 1 / M
+      // we use the total force which includes the Coriolis term above
+      force_normal_comp = ( accel[iglob*3]*nx + accel[iglob*3+1]*ny + accel[iglob*3+2]*nz ) / rmass[iglob];
+      
+      additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp;
+      
+      // probably wouldn't need atomicAdd anymore, but just to be sure...
+      atomicAdd(&accel[iglob*3], + additional_term * nx);
+      atomicAdd(&accel[iglob*3+1], + additional_term * ny);
+      atomicAdd(&accel[iglob*3+2], + additional_term * nz);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(elastic_ocean_load_cuda,
+              ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
+                                       int* SIMULATION_TYPE) {
+  
+  TRACE("elastic_ocean_load_cuda");
+  
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  
+  // checks if anything to do
+  if( mp->num_free_surface_faces == 0 ) return;
+  
+  // block sizes: exact blocksize to match NGLLSQUARE
+  int blocksize = NGLL2;
+  
+  int num_blocks_x = mp->num_free_surface_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+  
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+  
+  
+  // initializes temporary array to zero
+  print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
+                                     sizeof(int)*mp->NGLOB_AB),88501);
+  
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("before kernel elastic_ocean_load_cuda");
+#endif
+  
+  elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_accel,
+                                                   mp->d_rmass,
+                                                   mp->d_rmass_ocean_load,
+                                                   mp->num_free_surface_faces,
+                                                   mp->d_free_surface_ispec,
+                                                   mp->d_free_surface_ijk,
+                                                   mp->d_free_surface_normal,
+                                                   mp->d_ibool,
+                                                   mp->d_updated_dof_ocean_load);
+  // for backward/reconstructed potentials
+  if(*SIMULATION_TYPE == 3) {
+    // re-initializes array
+    print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
+                                       sizeof(int)*mp->NGLOB_AB),88502);
+    
+    elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
+                                                     mp->d_rmass,
+                                                     mp->d_rmass_ocean_load,
+                                                     mp->num_free_surface_faces,
+                                                     mp->d_free_surface_ispec,
+                                                     mp->d_free_surface_ijk,
+                                                     mp->d_free_surface_normal,
+                                                     mp->d_ibool,
+                                                     mp->d_updated_dof_ocean_load);
+    
+  }
+  
+  
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("elastic_ocean_load_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
 /* note:
  constant arrays when used in compute_forces_acoustic_cuda.cu routines stay zero,
  constant declaration and cudaMemcpyToSymbol would have to be in the same file...
@@ -1467,10 +1675,10 @@
 
 // constant arrays
 
-void setConst_hprime_xx(float* array,Mesh* mp)
+void setConst_hprime_xx(realw* array,Mesh* mp)
 {
 
-  cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(float));
+  cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
     fprintf(stderr, "Error in setConst_hprime_xx: %s\n", cudaGetErrorString(err));
@@ -1485,10 +1693,10 @@
   }
 }
 
-void setConst_hprime_yy(float* array,Mesh* mp)
+void setConst_hprime_yy(realw* array,Mesh* mp)
 {
 
-  cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(float));
+  cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
     fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
@@ -1503,10 +1711,10 @@
   }
 }
 
-void setConst_hprime_zz(float* array,Mesh* mp)
+void setConst_hprime_zz(realw* array,Mesh* mp)
 {
 
-  cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(float));
+  cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
     fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
@@ -1522,9 +1730,9 @@
 }
 
 
-void setConst_hprimewgll_xx(float* array,Mesh* mp)
+void setConst_hprimewgll_xx(realw* array,Mesh* mp)
 {
-  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(float));
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
     fprintf(stderr, "Error in setConst_hprimewgll_xx: %s\n", cudaGetErrorString(err));
@@ -1538,9 +1746,9 @@
   }
 }
 
-void setConst_hprimewgll_yy(float* array,Mesh* mp)
+void setConst_hprimewgll_yy(realw* array,Mesh* mp)
 {
-  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(float));
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
     fprintf(stderr, "Error in setConst_hprimewgll_yy: %s\n", cudaGetErrorString(err));
@@ -1554,9 +1762,9 @@
   }
 }
 
-void setConst_hprimewgll_zz(float* array,Mesh* mp)
+void setConst_hprimewgll_zz(realw* array,Mesh* mp)
 {
-  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(float));
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
     fprintf(stderr, "Error in setConst_hprimewgll_zz: %s\n", cudaGetErrorString(err));
@@ -1570,9 +1778,9 @@
   }
 }
 
-void setConst_wgllwgll_xy(float* array,Mesh* mp)
+void setConst_wgllwgll_xy(realw* array,Mesh* mp)
 {
-  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(float));
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
     fprintf(stderr, "Error in setConst_wgllwgll_xy: %s\n", cudaGetErrorString(err));
@@ -1587,9 +1795,9 @@
 
 }
 
-void setConst_wgllwgll_xz(float* array,Mesh* mp)
+void setConst_wgllwgll_xz(realw* array,Mesh* mp)
 {
-  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xz, array, NGLL2*sizeof(float));
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xz, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
     fprintf(stderr, "Error in  setConst_wgllwgll_xz: %s\n", cudaGetErrorString(err));
@@ -1604,9 +1812,9 @@
 
 }
 
-void setConst_wgllwgll_yz(float* array,Mesh* mp)
+void setConst_wgllwgll_yz(realw* array,Mesh* mp)
 {
-  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_yz, array, NGLL2*sizeof(float));
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_yz, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
     fprintf(stderr, "Error in setConst_wgllwgll_yz: %s\n", cudaGetErrorString(err));
@@ -1621,136 +1829,3 @@
 
 }
 
-
-/* ----------------------------------------------------------------------------------------------- */
-
-/* OCEANS load on free surface */
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-__global__ void elastic_ocean_load_cuda_kernel(float* accel,
-                                                 float* rmass,
-                                                 float* rmass_ocean_load,
-                                                 int num_free_surface_faces,
-                                                 int* free_surface_ispec,
-                                                 int* free_surface_ijk,
-                                                 float* free_surface_normal,
-                                                 int* ibool,
-                                                 int* updated_dof_ocean_load) {
-  // gets spectral element face id
-  int igll = threadIdx.x ;  //  threadIdx.y*blockDim.x will be always = 0 for thread block (25,1,1)
-  int iface = blockIdx.x + gridDim.x*blockIdx.y;
-  realw nx,ny,nz;
-  realw force_normal_comp,additional_term;
-
-  // for all faces on free surface
-  if( iface < num_free_surface_faces ){
-
-    int ispec = free_surface_ispec[iface]-1;
-
-    // gets global point index
-    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
-    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
-    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
-
-    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
-
-    //if(igll == 0 ) printf("igll %d %d %d %d\n",igll,i,j,k,iglob);
-
-    // only update this global point once
-
-    // daniel: TODO - there might be better ways to implement a mutex like below,
-    //            and find a workaround to not use the temporary update array.
-    //            atomicExch: returns the old value, i.e. 0 indicates that we still have to do this point
-
-    if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){
-
-      // get normal
-      nx = free_surface_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; //(1,igll,iface)
-      ny = free_surface_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
-      nz = free_surface_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];
-
-      // make updated component of right-hand side
-      // we divide by rmass() which is 1 / M
-      // we use the total force which includes the Coriolis term above
-      force_normal_comp = ( accel[iglob*3]*nx + accel[iglob*3+1]*ny + accel[iglob*3+2]*nz ) / rmass[iglob];
-
-      additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp;
-
-      // probably wouldn't need atomicAdd anymore, but just to be sure...
-      atomicAdd(&accel[iglob*3], + additional_term * nx);
-      atomicAdd(&accel[iglob*3+1], + additional_term * ny);
-      atomicAdd(&accel[iglob*3+2], + additional_term * nz);
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(elastic_ocean_load_cuda,
-              ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
-                                       int* SIMULATION_TYPE) {
-
-TRACE("elastic_ocean_load_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  // checks if anything to do
-  if( mp->num_free_surface_faces == 0 ) return;
-
-  // block sizes: exact blocksize to match NGLLSQUARE
-  int blocksize = 25;
-
-  int num_blocks_x = mp->num_free_surface_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-
-
-  // initializes temporary array to zero
-  print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
-                                     sizeof(int)*mp->NGLOB_AB),88501);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("before kernel elastic_ocean_load_cuda");
-#endif
-
-  elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_accel,
-                                                   mp->d_rmass,
-                                                   mp->d_rmass_ocean_load,
-                                                   mp->num_free_surface_faces,
-                                                   mp->d_free_surface_ispec,
-                                                   mp->d_free_surface_ijk,
-                                                   mp->d_free_surface_normal,
-                                                   mp->d_ibool,
-                                                   mp->d_updated_dof_ocean_load);
-  // for backward/reconstructed potentials
-  if(*SIMULATION_TYPE == 3) {
-    // re-initializes array
-    print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
-                                       sizeof(int)*mp->NGLOB_AB),88502);
-
-    elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
-                                                       mp->d_rmass,
-                                                       mp->d_rmass_ocean_load,
-                                                       mp->num_free_surface_faces,
-                                                       mp->d_free_surface_ispec,
-                                                       mp->d_free_surface_ijk,
-                                                       mp->d_free_surface_normal,
-                                                       mp->d_ibool,
-                                                       mp->d_updated_dof_ocean_load);
-
-  }
-
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("elastic_ocean_load_cuda");
-#endif
-}

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,7 @@
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
-#include <mpi.h>
+
 #include <sys/types.h>
 #include <unistd.h>
 #include <sys/time.h>
@@ -47,26 +47,25 @@
 
 __global__ void compute_kernels_cudakernel(int* ispec_is_elastic,
                                            int* ibool,
-                                           float* accel,
-                                           float* b_displ,
-                                           float* epsilondev_xx,
-                                           float* epsilondev_yy,
-                                           float* epsilondev_xy,
-                                           float* epsilondev_xz,
-                                           float* epsilondev_yz,
-                                           float* b_epsilondev_xx,
-                                           float* b_epsilondev_yy,
-                                           float* b_epsilondev_xy,
-                                           float* b_epsilondev_xz,
-                                           float* b_epsilondev_yz,
-                                           float* rho_kl,
-                                           float deltat,
-                                           float* mu_kl,
-                                           float* kappa_kl,
-                                           float* epsilon_trace_over_3,
-                                           float* b_epsilon_trace_over_3,
-                                           int NSPEC_AB //,float* d_debug
-                                           ) {
+                                           realw* accel,
+                                           realw* b_displ,
+                                           realw* epsilondev_xx,
+                                           realw* epsilondev_yy,
+                                           realw* epsilondev_xy,
+                                           realw* epsilondev_xz,
+                                           realw* epsilondev_yz,
+                                           realw* b_epsilondev_xx,
+                                           realw* b_epsilondev_yy,
+                                           realw* b_epsilondev_xy,
+                                           realw* b_epsilondev_xz,
+                                           realw* b_epsilondev_yz,
+                                           realw* rho_kl,
+                                           realw deltat,
+                                           realw* mu_kl,
+                                           realw* kappa_kl,
+                                           realw* epsilon_trace_over_3,
+                                           realw* b_epsilon_trace_over_3,
+                                           int NSPEC_AB) {
 
   int ispec = blockIdx.x + blockIdx.y*gridDim.x;
 
@@ -77,20 +76,9 @@
     if( ispec_is_elastic[ispec] ) {
 
       int ijk = threadIdx.x;
-      int ijk_ispec = ijk + 125*ispec;
+      int ijk_ispec = ijk + NGLL3*ispec;
       int iglob = ibool[ijk_ispec] - 1 ;
 
-      // debug
-//      if(ijk_ispec == 9480531) {
-//        d_debug[0] = rho_kl[ijk_ispec];
-//        d_debug[1] = accel[3*iglob];
-//        d_debug[2] = b_displ[3*iglob];
-//        d_debug[3] = deltat * (accel[3*iglob]*b_displ[3*iglob]+
-//                               accel[3*iglob+1]*b_displ[3*iglob+1]+
-//                               accel[3*iglob+2]*b_displ[3*iglob+2]);
-//      }
-
-
       // isotropic kernels:
       // density kernel
       rho_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_displ[3*iglob]+
@@ -98,18 +86,9 @@
                                      accel[3*iglob+2]*b_displ[3*iglob+2]);
 
 
-      // debug
-      // if(rho_kl[ijk_ispec] < 1.9983e+18) {
-      // atomicAdd(&d_debug[3],1.0);
-      // d_debug[4] = ijk_ispec;
-      // d_debug[0] = rho_kl[ijk_ispec];
-      // d_debug[1] = accel[3*iglob];
-      // d_debug[2] = b_displ[3*iglob];
-      // }
-
       // shear modulus kernel
-      mu_kl[ijk_ispec] += deltat * (epsilondev_xx[ijk_ispec]*b_epsilondev_xx[ijk_ispec]+ // 1*b1
-                                    epsilondev_yy[ijk_ispec]*b_epsilondev_yy[ijk_ispec]+ // 2*b2
+      mu_kl[ijk_ispec] += deltat * (epsilondev_xx[ijk_ispec]*b_epsilondev_xx[ijk_ispec]+
+                                    epsilondev_yy[ijk_ispec]*b_epsilondev_yy[ijk_ispec]+
                                     (epsilondev_xx[ijk_ispec]+epsilondev_yy[ijk_ispec])*
                                     (b_epsilondev_xx[ijk_ispec]+b_epsilondev_yy[ijk_ispec])+
                                     2*(epsilondev_xy[ijk_ispec]*b_epsilondev_xy[ijk_ispec]+
@@ -129,13 +108,13 @@
 extern "C"
 void FC_FUNC_(compute_kernels_elastic_cuda,
               COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
-                                            float* deltat_f) {
+                                            realw* deltat_f) {
 TRACE("compute_kernels_elastic_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  int blocksize = 125; // NGLLX*NGLLY*NGLLZ
-  float deltat = *deltat_f;
+  int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+  realw deltat = *deltat_f;
 
   int num_blocks_x = mp->NSPEC_AB;
   int num_blocks_y = 1;
@@ -147,12 +126,6 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  //float* d_debug;
-  //float* h_debug;
-  //h_debug = (float*)calloc(128,sizeof(float));
-  //cudaMalloc((void**)&d_debug,128*sizeof(float));
-  //cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-
   compute_kernels_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
                                                mp->d_accel, mp->d_b_displ,
                                                mp->d_epsilondev_xx,
@@ -171,31 +144,8 @@
                                                mp->d_kappa_kl,
                                                mp->d_epsilon_trace_over_3,
                                                mp->d_b_epsilon_trace_over_3,
-                                               mp->NSPEC_AB //,d_debug
-                                               );
-  /*
-  cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaFree(d_debug);
-  */
-  // for(int i=0;i<5;i++) {
-  // printf("d_debug[%d]=%e\n",i,h_debug[i]);
-  // }
-  /*
-  free(h_debug);
-  */
-  // float* h_rho = (float*)malloc(sizeof(float)*mp->NSPEC_AB*125);
-  // float maxval = 0;
-  // cudaMemcpy(h_rho,mp->d_rho_kl,sizeof(float)*mp->NSPEC_AB*125,cudaMemcpyDeviceToHost);
-  // int number_big_values = 0;
-  // for(int i=0;i<mp->NSPEC_AB*125;i++) {
-  // maxval = MAX(maxval,fabsf(h_rho[i]));
-  // if(fabsf(h_rho[i]) > 1e10) {
-  // number_big_values++;
-  // }
-  // }
+                                               mp->NSPEC_AB);
 
-  // printf("maval rho = %e, number>1e10 = %d vs. %d\n",maxval,number_big_values,mp->NSPEC_AB*125);
-
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_kernels_elastic_cuda");
 #endif
@@ -209,46 +159,34 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 
-__global__ void compute_kernels_strength_noise_cuda_kernel(float* displ,
+__global__ void compute_kernels_strength_noise_cuda_kernel(realw* displ,
                                                            int* free_surface_ispec,
                                                            int* free_surface_ijk,
                                                            int* ibool,
-                                                           float* noise_surface_movie,
-                                                           float* normal_x_noise,
-                                                           float* normal_y_noise,
-                                                           float* normal_z_noise,
-                                                           float* Sigma_kl,
-                                                           float deltat,
-                                                           int num_free_surface_faces //,float* d_debug
-                                                           ) {
+                                                           realw* noise_surface_movie,
+                                                           realw* normal_x_noise,
+                                                           realw* normal_y_noise,
+                                                           realw* normal_z_noise,
+                                                           realw* Sigma_kl,
+                                                           realw deltat,
+                                                           int num_free_surface_faces) {
   int iface = blockIdx.x + blockIdx.y*gridDim.x;
 
   if(iface < num_free_surface_faces) {
 
     int ispec = free_surface_ispec[iface]-1;
     int igll = threadIdx.x;
-    int ipoin = igll + 25*iface;
+    int ipoin = igll + NGLL2*iface;
     int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1 ;
     int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
     int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
 
     int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1 ;
 
-    float eta = ( noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x_noise[ipoin]+
+    realw eta = ( noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x_noise[ipoin]+
                  noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y_noise[ipoin]+
                  noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z_noise[ipoin]);
 
-    // if(ijk_ispec == 78496) {
-    //   d_debug[0] = Sigma_kl[ijk_ispec];
-    //   d_debug[1] = eta;
-    //   d_debug[2] = normal_x_noise[ipoin];
-    //   d_debug[3] = normal_y_noise[ipoin];
-    //   d_debug[4] = normal_z_noise[ipoin];
-    //   d_debug[5] = displ[3*iglob+2];
-    //   d_debug[6] = deltat*eta*normal_z_noise[ipoin]*displ[2+3*iglob];
-    //   d_debug[7] = 0.008*1.000000e-24*normal_z_noise[ipoin]*3.740546e-13;
-    // }
-
     Sigma_kl[INDEX4(5,5,5,i,j,k,ispec)] += deltat*eta*(normal_x_noise[ipoin]*displ[3*iglob]+
                                                        normal_y_noise[ipoin]*displ[1+3*iglob]+
                                                        normal_z_noise[ipoin]*displ[2+3*iglob]);
@@ -261,15 +199,15 @@
 extern "C"
 void FC_FUNC_(compute_kernels_strgth_noise_cu,
               COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
-                                                    float* h_noise_surface_movie,
-                                                    float* deltat) {
+                                                    realw* h_noise_surface_movie,
+                                                    realw* deltat) {
 
 TRACE("compute_kernels_strgth_noise_cu");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
-             3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice);
+             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
 
 
   int num_blocks_x = mp->num_free_surface_faces;
@@ -280,13 +218,8 @@
   }
 
   dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(25,1,1);
+  dim3 threads(NGLL2,1,1);
 
-  // float* h_debug = (float*)calloc(128,sizeof(float));
-  //float* d_debug;
-  // cudaMalloc((void**)&d_debug,128*sizeof(float));
-  // cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-
   compute_kernels_strength_noise_cuda_kernel<<<grid,threads>>>(mp->d_displ,
                                                                mp->d_free_surface_ispec,
                                                                mp->d_free_surface_ijk,
@@ -296,14 +229,8 @@
                                                                mp->d_normal_y_noise,
                                                                mp->d_normal_z_noise,
                                                                mp->d_Sigma_kl,*deltat,
-                                                               mp->num_free_surface_faces //,d_debug
-                                                               );
+                                                               mp->num_free_surface_faces);
 
-  // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
-  // for(int i=0;i<8;i++) {
-  //   printf("debug[%d]= %e\n",i,h_debug[i]);
-  // }
-
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_kernels_strength_noise_cuda_kernel");
 #endif
@@ -320,30 +247,30 @@
 
 __device__ void compute_gradient_kernel(int ijk,
                                         int ispec,
-                                        float* scalar_field,
-                                        float* vector_field_element,
-                                        float* hprime_xx,
-                                        float* hprime_yy,
-                                        float* hprime_zz,
-                                        float* d_xix,
-                                        float* d_xiy,
-                                        float* d_xiz,
-                                        float* d_etax,
-                                        float* d_etay,
-                                        float* d_etaz,
-                                        float* d_gammax,
-                                        float* d_gammay,
-                                        float* d_gammaz,
-                                        float rhol) {
+                                        realw* scalar_field,
+                                        realw* vector_field_element,
+                                        realw* hprime_xx,
+                                        realw* hprime_yy,
+                                        realw* hprime_zz,
+                                        realw* d_xix,
+                                        realw* d_xiy,
+                                        realw* d_xiz,
+                                        realw* d_etax,
+                                        realw* d_etay,
+                                        realw* d_etaz,
+                                        realw* d_gammax,
+                                        realw* d_gammay,
+                                        realw* d_gammaz,
+                                        realw rhol) {
 
-  float temp1l,temp2l,temp3l;
-  float hp1,hp2,hp3;
-  float xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl;
-  float rho_invl;
+  realw temp1l,temp2l,temp3l;
+  realw hp1,hp2,hp3;
+  realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl;
+  realw rho_invl;
   int l,offset,offset1,offset2,offset3;
 
   //const int NGLLX = 5;
-  const int NGLL3_ALIGN = 128;
+  const int NGLL3_ALIGN = NGLL3_PADDED;
 
   int K = (ijk/NGLL2);
   int J = ((ijk-K*NGLL2)/NGLLX);
@@ -400,26 +327,26 @@
 
 __global__ void compute_kernels_acoustic_kernel(int* ispec_is_acoustic,
                                                 int* ibool,
-                                                float* rhostore,
-                                                float* kappastore,
-                                                float* hprime_xx,
-                                                float* hprime_yy,
-                                                float* hprime_zz,
-                                                float* d_xix,
-                                                float* d_xiy,
-                                                float* d_xiz,
-                                                float* d_etax,
-                                                float* d_etay,
-                                                float* d_etaz,
-                                                float* d_gammax,
-                                                float* d_gammay,
-                                                float* d_gammaz,
-                                                float* potential_dot_dot_acoustic,
-                                                float* b_potential_acoustic,
-                                                float* b_potential_dot_dot_acoustic,
-                                                float* rho_ac_kl,
-                                                float* kappa_ac_kl,
-                                                float deltat,
+                                                realw* rhostore,
+                                                realw* kappastore,
+                                                realw* hprime_xx,
+                                                realw* hprime_yy,
+                                                realw* hprime_zz,
+                                                realw* d_xix,
+                                                realw* d_xiy,
+                                                realw* d_xiz,
+                                                realw* d_etax,
+                                                realw* d_etay,
+                                                realw* d_etaz,
+                                                realw* d_gammax,
+                                                realw* d_gammay,
+                                                realw* d_gammaz,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_dot_acoustic,
+                                                realw* rho_ac_kl,
+                                                realw* kappa_ac_kl,
+                                                realw deltat,
                                                 int NSPEC_AB) {
 
   int ispec = blockIdx.x + blockIdx.y*gridDim.x;
@@ -433,17 +360,17 @@
       int ijk = threadIdx.x;
 
       // local and global indices
-      int ijk_ispec = ijk + 125*ispec;
-      int ijk_ispec_padded = ijk + 128*ispec;
+      int ijk_ispec = ijk + NGLL3*ispec;
+      int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
       int iglob = ibool[ijk_ispec] - 1;
 
-      float accel_elm[3];
-      float b_displ_elm[3];
-      float rhol,kappal;
+      realw accel_elm[3];
+      realw b_displ_elm[3];
+      realw rhol,kappal;
 
       // shared memory between all threads within this block
-      __shared__ float scalar_field_displ[125];
-      __shared__ float scalar_field_accel[125];
+      __shared__ realw scalar_field_displ[NGLL3];
+      __shared__ realw scalar_field_accel[NGLL3];
 
       // copy field values
       scalar_field_displ[ijk] = b_potential_acoustic[iglob];
@@ -485,14 +412,14 @@
 void FC_FUNC_(compute_kernels_acoustic_cuda,
               COMPUTE_KERNELS_ACOUSTIC_CUDA)(
                                              long* Mesh_pointer,
-                                             float* deltat_f) {
+                                             realw* deltat_f) {
 
 TRACE("compute_kernels_acoustic_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  int blocksize = 125; // NGLLX*NGLLY*NGLLZ
-  float deltat = *deltat_f;
+  int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+  realw deltat = *deltat_f;
 
   int num_blocks_x = mp->NSPEC_AB;
   int num_blocks_y = 1;
@@ -541,10 +468,10 @@
 
 __global__ void compute_kernels_hess_el_cudakernel(int* ispec_is_elastic,
                                                    int* ibool,
-                                                   float* accel,
-                                                   float* b_accel,
-                                                   float* hess_kl,
-                                                   float deltat,
+                                                   realw* accel,
+                                                   realw* b_accel,
+                                                   realw* hess_kl,
+                                                   realw deltat,
                                                    int NSPEC_AB) {
 
   int ispec = blockIdx.x + blockIdx.y*gridDim.x;
@@ -556,7 +483,7 @@
     if( ispec_is_elastic[ispec] ) {
 
       int ijk = threadIdx.x;
-      int ijk_ispec = ijk + 125*ispec;
+      int ijk_ispec = ijk + NGLL3*ispec;
       int iglob = ibool[ijk_ispec] - 1 ;
 
       // approximate hessian
@@ -571,23 +498,23 @@
 
 __global__ void compute_kernels_hess_ac_cudakernel(int* ispec_is_acoustic,
                                                    int* ibool,
-                                                   float* potential_dot_dot_acoustic,
-                                                   float* b_potential_dot_dot_acoustic,
-                                                   float* rhostore,
-                                                   float* hprime_xx,
-                                                   float* hprime_yy,
-                                                   float* hprime_zz,
-                                                   float* d_xix,
-                                                   float* d_xiy,
-                                                   float* d_xiz,
-                                                   float* d_etax,
-                                                   float* d_etay,
-                                                   float* d_etaz,
-                                                   float* d_gammax,
-                                                   float* d_gammay,
-                                                   float* d_gammaz,
-                                                   float* hess_kl,
-                                                   float deltat,
+                                                   realw* potential_dot_dot_acoustic,
+                                                   realw* b_potential_dot_dot_acoustic,
+                                                   realw* rhostore,
+                                                   realw* hprime_xx,
+                                                   realw* hprime_yy,
+                                                   realw* hprime_zz,
+                                                   realw* d_xix,
+                                                   realw* d_xiy,
+                                                   realw* d_xiz,
+                                                   realw* d_etax,
+                                                   realw* d_etay,
+                                                   realw* d_etaz,
+                                                   realw* d_gammax,
+                                                   realw* d_gammay,
+                                                   realw* d_gammaz,
+                                                   realw* hess_kl,
+                                                   realw deltat,
                                                    int NSPEC_AB) {
 
   int ispec = blockIdx.x + blockIdx.y*gridDim.x;
@@ -600,18 +527,18 @@
 
       // local and global indices
       int ijk = threadIdx.x;
-      int ijk_ispec = ijk + 125*ispec;
+      int ijk_ispec = ijk + NGLL3*ispec;
       int iglob = ibool[ijk_ispec] - 1 ;
 
-      int ijk_ispec_padded = ijk + 128*ispec;
+      int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
 
-      float accel_elm[3];
-      float b_accel_elm[3];
-      float rhol;
+      realw accel_elm[3];
+      realw b_accel_elm[3];
+      realw rhol;
 
       // shared memory between all threads within this block
-      __shared__ float scalar_field_accel[125];
-      __shared__ float scalar_field_b_accel[125];
+      __shared__ realw scalar_field_accel[NGLL3];
+      __shared__ realw scalar_field_b_accel[NGLL3];
 
       // copy field values
       scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
@@ -649,15 +576,15 @@
 extern "C"
 void FC_FUNC_(compute_kernels_hess_cuda,
               COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
-                                         float* deltat_f,
+                                         realw* deltat_f,
                                          int* ELASTIC_SIMULATION,
                                          int* ACOUSTIC_SIMULATION) {
   TRACE("compute_kernels_hess_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  int blocksize = 125; // NGLLX*NGLLY*NGLLZ
-  float deltat = *deltat_f;
+  int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+  realw deltat = *deltat_f;
 
   int num_blocks_x = mp->NSPEC_AB;
   int num_blocks_y = 1;

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
-#include <mpi.h>
 
 #include <sys/time.h>
 #include <sys/resource.h>
@@ -40,22 +39,22 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void compute_stacey_acoustic_kernel(float* potential_dot_acoustic,
-                                               float* potential_dot_dot_acoustic,
+__global__ void compute_stacey_acoustic_kernel(realw* potential_dot_acoustic,
+                                               realw* potential_dot_dot_acoustic,
                                                int* abs_boundary_ispec,
                                                int* abs_boundary_ijk,
                                                realw* abs_boundary_jacobian2Dw,
                                                int* ibool,
-                                               float* rhostore,
-                                               float* kappastore,
+                                               realw* rhostore,
+                                               realw* kappastore,
                                                int* ispec_is_inner,
                                                int* ispec_is_acoustic,
                                                int phase_is_inner,
                                                int SIMULATION_TYPE, int SAVE_FORWARD,
                                                int num_abs_boundary_faces,
-                                               float* b_potential_dot_acoustic,
-                                               float* b_potential_dot_dot_acoustic,
-                                               float* b_absorb_potential
+                                               realw* b_potential_dot_acoustic,
+                                               realw* b_potential_dot_dot_acoustic,
+                                               realw* b_absorb_potential
                                                ) {
 
   int igll = threadIdx.x;
@@ -116,7 +115,7 @@
                                     int* phase_is_innerf,
                                     int* SIMULATION_TYPEf,
                                     int* SAVE_FORWARDf,
-                                    float* h_b_absorb_potential) {
+                                    realw* h_b_absorb_potential) {
 TRACE("compute_stacey_acoustic_cuda");
   //double start_time = get_time();
 
@@ -131,7 +130,7 @@
 
   // way 2: Elapsed time: 4.379034e-03
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
-  int blocksize = 25;
+  int blocksize = NGLL2;
 
   int num_blocks_x = mp->d_num_abs_boundary_faces;
   int num_blocks_y = 1;

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
-#include <mpi.h>
 
 #include <sys/time.h>
 #include <sys/resource.h>
@@ -56,8 +55,7 @@
                                               int SAVE_FORWARD,
                                               int num_abs_boundary_faces,
                                               realw* b_accel,
-                                              realw* b_absorb_field //,float* debug_val,int* debug_val_int
-                                              ) {
+                                              realw* b_absorb_field) {
 
   int igll = threadIdx.x; // tx
   int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx
@@ -136,7 +134,7 @@
                                            int* phase_is_innerf,
                                            int* SIMULATION_TYPEf,
                                            int* SAVE_FORWARDf,
-                                           float* h_b_absorb_field) {
+                                           realw* h_b_absorb_field) {
 
 TRACE("compute_stacey_elastic_cuda");
 
@@ -155,7 +153,7 @@
 
   // way 2: seems sligthly faster
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
-  int blocksize = 25;
+  int blocksize = NGLL2;
 
   int num_blocks_x = mp->d_num_abs_boundary_faces;
   int num_blocks_y = 1;
@@ -167,12 +165,7 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  //float* d_debug_val;
-  //int* d_debug_val_int;
-
   if(SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0) {
-    // int val = NSTEP-it+1;
-    // read_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&val);
     // The read is done in fortran
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,h_b_absorb_field,
                                        mp->d_b_reclen_field,cudaMemcpyHostToDevice),7700);
@@ -197,8 +190,7 @@
                                                   SIMULATION_TYPE,SAVE_FORWARD,
                                                   mp->d_num_abs_boundary_faces,
                                                   mp->d_b_accel,
-                                                  mp->d_b_absorb_field //,d_debug_val,d_debug_val_int
-                                                  );
+                                                  mp->d_b_absorb_field);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_stacey_elastic_kernel");

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
-#include <mpi.h>
 
 #include "config.h"
 #include "mesh_constants_cuda.h"
@@ -70,13 +69,13 @@
 void FC_FUNC_(it_update_displacement_cuda,
               IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
                                                  int* size_F,
-                                                 float* deltat_F,
-                                                 float* deltatsqover2_F,
-                                                 float* deltatover2_F,
+                                                 realw* deltat_F,
+                                                 realw* deltatsqover2_F,
+                                                 realw* deltatover2_F,
                                                  int* SIMULATION_TYPE,
-                                                 float* b_deltat_F,
-                                                 float* b_deltatsqover2_F,
-                                                 float* b_deltatover2_F) {
+                                                 realw* b_deltat_F,
+                                                 realw* b_deltatsqover2_F,
+                                                 realw* b_deltatover2_F) {
 
 TRACE("it_update_displacement_cuda");
 
@@ -93,7 +92,7 @@
   realw b_deltatover2 = *b_deltatover2_F;
   //cublasStatus status;
 
-  int blocksize = 128;
+  int blocksize = BLOCKSIZE_KERNEL1;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
   int num_blocks_x = size_padded/blocksize;
@@ -175,13 +174,13 @@
 void FC_FUNC_(it_update_displacement_ac_cuda,
               it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
                                                int* size_F,
-                                               float* deltat_F,
-                                               float* deltatsqover2_F,
-                                               float* deltatover2_F,
+                                               realw* deltat_F,
+                                               realw* deltatsqover2_F,
+                                               realw* deltatover2_F,
                                                int* SIMULATION_TYPE,
-                                               float* b_deltat_F,
-                                               float* b_deltatsqover2_F,
-                                               float* b_deltatover2_F) {
+                                               realw* b_deltat_F,
+                                               realw* b_deltatsqover2_F,
+                                               realw* b_deltatover2_F) {
 TRACE("it_update_displacement_ac_cuda");
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
 
@@ -195,7 +194,7 @@
   realw b_deltatover2 = *b_deltatover2_F;
   //cublasStatus status;
 
-  int blocksize = 128;
+  int blocksize = BLOCKSIZE_KERNEL1;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
   int num_blocks_x = size_padded/blocksize;

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2011-11-06 02:02:36 UTC (rev 19152)
@@ -44,6 +44,7 @@
 
 #ifndef GPU_MESH_
 #define GPU_MESH_
+
 #include <sys/types.h>
 #include <unistd.h>
 
@@ -74,22 +75,6 @@
 // error checking after cuda function calls
 #define ENABLE_VERY_SLOW_ERROR_CHECKING
 
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// indexing
-
-#define INDEX2(xsize,x,y) x + (y)*xsize
-#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
-#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
-#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*j)))
-#define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) x + xsize*(y + ysize*(z + zsize*(i + isize*(j + jsize*k))))
-
-#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*z) + (i)*128
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
 #define MAX(x,y)                    (((x) < (y)) ? (y) : (x))
 
 double get_time();
@@ -108,14 +93,22 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// dimensions
 #define NDIM 3
+
+// Gauss-Lobatto-Legendre 
 #define NGLLX 5
 #define NGLL2 25
-#define N_SLS 3
+#define NGLL3 125 // no padding: requires same size as in fortran for NGLLX * NGLLY * NGLLZ
 
-#define NGLL3_NONPADDED 125
+// padding: 128 == 2**7 might improve on older graphics cards w/ coalescent memory accesses:
 #define NGLL3_PADDED 128
+// no padding: 125 == 5*5*5 to avoid allocation of extra memory
+//#define NGLL3_PADDED 125
 
+// number of standard linear solids
+#define N_SLS 3
+
 //typedef float real;   // type of variables passed into function
 typedef float realw;  // type of "working" variables
 
@@ -127,8 +120,33 @@
 // leads up to ~ 5% performance increase
 //#define USE_MESH_COLORING_GPU
 
+// cuda kernel block size for updating displacements/potential (newmark time scheme)
+#define BLOCKSIZE_KERNEL1 128
+#define BLOCKSIZE_KERNEL3 128
+#define BLOCKSIZE_TRANSFER 256
+
 /* ----------------------------------------------------------------------------------------------- */
 
+// indexing
+
+#define INDEX2(xsize,x,y) x + (y)*xsize
+
+#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
+//#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
+
+#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
+//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
+
+#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*(j))))
+//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize
+
+#define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) x + xsize*(y + ysize*(z + zsize*(i + isize*(j + jsize*k))))
+
+#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*z) + (i)*NGLL3_PADDED
+//#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*NGLL3_PADDED
+
+/* ----------------------------------------------------------------------------------------------- */
+
 // mesh pointer wrapper structure
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -140,12 +158,12 @@
   int NGLOB_AB;
 
   // interpolators
-  float* d_xix; float* d_xiy; float* d_xiz;
-  float* d_etax; float* d_etay; float* d_etaz;
-  float* d_gammax; float* d_gammay; float* d_gammaz;
+  realw* d_xix; realw* d_xiy; realw* d_xiz;
+  realw* d_etax; realw* d_etay; realw* d_etaz;
+  realw* d_gammax; realw* d_gammay; realw* d_gammaz;
 
   // model parameters
-  float* d_kappav; float* d_muv;
+  realw* d_kappav; realw* d_muv;
 
   // global indexing
   int* d_ibool;
@@ -157,18 +175,22 @@
   int use_mesh_coloring_gpu;
 
   // pointers to constant memory arrays
-  float* d_hprime_xx; float* d_hprime_yy; float* d_hprime_zz;
-  float* d_hprimewgll_xx; float* d_hprimewgll_yy; float* d_hprimewgll_zz;
-  float* d_wgllwgll_xy; float* d_wgllwgll_xz; float* d_wgllwgll_yz;
+  realw* d_hprime_xx; realw* d_hprime_yy; realw* d_hprime_zz;
+  realw* d_hprimewgll_xx; realw* d_hprimewgll_yy; realw* d_hprimewgll_zz;
+  realw* d_wgllwgll_xy; realw* d_wgllwgll_xz; realw* d_wgllwgll_yz;
 
+  // mpi buffers
+  int num_interfaces_ext_mesh;
+  int max_nibool_interfaces_ext_mesh;
+  
   // ------------------------------------------------------------------ //
   // elastic wavefield parameters
   // ------------------------------------------------------------------ //
 
   // displacement, velocity, acceleration
-  float* d_displ; float* d_veloc; float* d_accel;
+  realw* d_displ; realw* d_veloc; realw* d_accel;
   // backward/reconstructed elastic wavefield
-  float* d_b_displ; float* d_b_veloc; float* d_b_accel;
+  realw* d_b_displ; realw* d_b_veloc; realw* d_b_accel;
 
   // elastic elements
   int* d_ispec_is_elastic;
@@ -182,8 +204,10 @@
   int num_colors_outer_elastic,num_colors_inner_elastic;
   int nspec_elastic;
 
-  float* d_rmass;
-  float* d_send_accel_buffer;
+  realw* d_rmass;
+  
+  // mpi buffer
+  realw* d_send_accel_buffer;
 
   // interfaces
   int* d_nibool_interfaces_ext_mesh;
@@ -193,18 +217,18 @@
   int d_num_abs_boundary_faces;
   int* d_abs_boundary_ispec;
   int* d_abs_boundary_ijk;
-  float* d_abs_boundary_normal;
-  float* d_abs_boundary_jacobian2Dw;
+  realw* d_abs_boundary_normal;
+  realw* d_abs_boundary_jacobian2Dw;
 
-  float* d_b_absorb_field;
+  realw* d_b_absorb_field;
   int d_b_reclen_field;
 
-  float* d_rho_vp;
-  float* d_rho_vs;
+  realw* d_rho_vp;
+  realw* d_rho_vs;
 
   // sources
   int nsources_local;
-  float* d_sourcearrays;
+  realw* d_sourcearrays;
   double* d_stf_pre_compute;
   int* d_islice_selected_source;
   int* d_ispec_selected_source;
@@ -214,12 +238,13 @@
   int* d_ispec_selected_rec;
   int* d_islice_selected_rec;
   int nrec_local;
-  float* d_station_seismo_field;
-  float* h_station_seismo_field;
+  realw* d_station_seismo_field;
+  realw* h_station_seismo_field;
 
+  // adjoint receivers/sources
   int nadj_rec_local;
-  float* d_adj_sourcearrays;
-  float* h_adj_sourcearrays_slice;
+  realw* d_adj_sourcearrays;
+  realw* h_adj_sourcearrays_slice;
   int* d_pre_computed_irec;
 
   // surface elements (to save for noise tomography and acoustic simulations)
@@ -228,80 +253,103 @@
   int num_free_surface_faces;
 
   // surface movie elements to save for noise tomography
-  float* d_noise_surface_movie;
+  realw* d_noise_surface_movie;
 
   // attenuation
-  float* d_R_xx;
-  float* d_R_yy;
-  float* d_R_xy;
-  float* d_R_xz;
-  float* d_R_yz;
+  realw* d_R_xx;
+  realw* d_R_yy;
+  realw* d_R_xy;
+  realw* d_R_xz;
+  realw* d_R_yz;
 
-  float* d_one_minus_sum_beta;
-  float* d_factor_common;
+  realw* d_one_minus_sum_beta;
+  realw* d_factor_common;
 
-  float* d_alphaval;
-  float* d_betaval;
-  float* d_gammaval;
+  realw* d_alphaval;
+  realw* d_betaval;
+  realw* d_gammaval;
 
   // attenuation & kernel
-  float* d_epsilondev_xx;
-  float* d_epsilondev_yy;
-  float* d_epsilondev_xy;
-  float* d_epsilondev_xz;
-  float* d_epsilondev_yz;
-  float* d_epsilon_trace_over_3;
+  realw* d_epsilondev_xx;
+  realw* d_epsilondev_yy;
+  realw* d_epsilondev_xy;
+  realw* d_epsilondev_xz;
+  realw* d_epsilondev_yz;
+  realw* d_epsilon_trace_over_3;
 
+  // anisotropy
+  realw* d_c11store;
+  realw* d_c12store;
+  realw* d_c13store;
+  realw* d_c14store;
+  realw* d_c15store;
+  realw* d_c16store;
+  realw* d_c22store;
+  realw* d_c23store;
+  realw* d_c24store;
+  realw* d_c25store;
+  realw* d_c26store;
+  realw* d_c33store;
+  realw* d_c34store;
+  realw* d_c35store;
+  realw* d_c36store;
+  realw* d_c44store;
+  realw* d_c45store;
+  realw* d_c46store;
+  realw* d_c55store;
+  realw* d_c56store;
+  realw* d_c66store;
+
   // noise
-  float* d_normal_x_noise;
-  float* d_normal_y_noise;
-  float* d_normal_z_noise;
-  float* d_mask_noise;
-  float* d_free_surface_jacobian2Dw;
+  realw* d_normal_x_noise;
+  realw* d_normal_y_noise;
+  realw* d_normal_z_noise;
+  realw* d_mask_noise;
+  realw* d_free_surface_jacobian2Dw;
 
-  float* d_noise_sourcearray;
+  realw* d_noise_sourcearray;
 
   // attenuation & kernel backward fields
-  float* d_b_R_xx;
-  float* d_b_R_yy;
-  float* d_b_R_xy;
-  float* d_b_R_xz;
-  float* d_b_R_yz;
+  realw* d_b_R_xx;
+  realw* d_b_R_yy;
+  realw* d_b_R_xy;
+  realw* d_b_R_xz;
+  realw* d_b_R_yz;
 
-  float* d_b_epsilondev_xx;
-  float* d_b_epsilondev_yy;
-  float* d_b_epsilondev_xy;
-  float* d_b_epsilondev_xz;
-  float* d_b_epsilondev_yz;
-  float* d_b_epsilon_trace_over_3;
+  realw* d_b_epsilondev_xx;
+  realw* d_b_epsilondev_yy;
+  realw* d_b_epsilondev_xy;
+  realw* d_b_epsilondev_xz;
+  realw* d_b_epsilondev_yz;
+  realw* d_b_epsilon_trace_over_3;
 
-  float* d_b_alphaval;
-  float* d_b_betaval;
-  float* d_b_gammaval;
+  realw* d_b_alphaval;
+  realw* d_b_betaval;
+  realw* d_b_gammaval;
 
   // sensitivity kernels
-  float* d_rho_kl;
-  float* d_mu_kl;
-  float* d_kappa_kl;
+  realw* d_rho_kl;
+  realw* d_mu_kl;
+  realw* d_kappa_kl;
 
   // noise sensitivity kernel
-  float* d_Sigma_kl;
+  realw* d_Sigma_kl;
 
   // approximative hessian for preconditioning kernels
-  float* d_hess_el_kl;
+  realw* d_hess_el_kl;
 
   // oceans
-  float* d_rmass_ocean_load;
-  float* d_free_surface_normal;
+  realw* d_rmass_ocean_load;
+  realw* d_free_surface_normal;
   int* d_updated_dof_ocean_load;
 
   // ------------------------------------------------------------------ //
   // acoustic wavefield
   // ------------------------------------------------------------------ //
   // potential and first and second time derivative
-  float* d_potential_acoustic; float* d_potential_dot_acoustic; float* d_potential_dot_dot_acoustic;
+  realw* d_potential_acoustic; realw* d_potential_dot_acoustic; realw* d_potential_dot_dot_acoustic;
   // backward/reconstructed wavefield
-  float* d_b_potential_acoustic; float* d_b_potential_dot_acoustic; float* d_b_potential_dot_dot_acoustic;
+  realw* d_b_potential_acoustic; realw* d_b_potential_dot_acoustic; realw* d_b_potential_dot_dot_acoustic;
 
   // acoustic domain parameters
   int* d_ispec_is_acoustic;
@@ -314,34 +362,33 @@
   int num_colors_outer_acoustic,num_colors_inner_acoustic;
   int nspec_acoustic;
 
-  float* d_rhostore;
-  float* d_kappastore;
-  float* d_rmass_acoustic;
+  realw* d_rhostore;
+  realw* d_kappastore;
+  realw* d_rmass_acoustic;
+  
+  // mpi buffer
+  realw* d_send_potential_dot_dot_buffer;
 
-  float* d_send_potential_dot_dot_buffer;
-
-  float* d_b_absorb_potential;
+  realw* d_b_absorb_potential;
   int d_b_reclen_potential;
 
   // for writing seismograms
-  float* d_station_seismo_potential;
-  float* h_station_seismo_potential;
+  realw* d_station_seismo_potential;
+  realw* h_station_seismo_potential;
 
   // sensitivity kernels
-  float* d_rho_ac_kl;
-  float* d_kappa_ac_kl;
+  realw* d_rho_ac_kl;
+  realw* d_kappa_ac_kl;
 
   // approximative hessian for preconditioning kernels
-  float* d_hess_ac_kl;
+  realw* d_hess_ac_kl;
 
   // coupling acoustic-elastic
   int* d_coupling_ac_el_ispec;
   int* d_coupling_ac_el_ijk;
-  float* d_coupling_ac_el_normal;
-  float* d_coupling_ac_el_jacobian2Dw;
+  realw* d_coupling_ac_el_normal;
+  realw* d_coupling_ac_el_jacobian2Dw;
 
-
-
 } Mesh;
 
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -63,7 +63,7 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(float* val) {
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {
 TRACE("fortranprintf");
 
   int procid;
@@ -86,15 +86,15 @@
 
 // randomize displ for testing
 extern "C"
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,float* h_displ) {
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {
 TRACE("make_displ_rand");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-  // float* displ_rnd = (float*)malloc(mp->NGLOB_AB*3*sizeof(float));
+  // realw* displ_rnd = (realw*)malloc(mp->NGLOB_AB*3*sizeof(realw));
   for(int i=0;i<mp->NGLOB_AB*3;i++) {
     h_displ[i] = rand();
   }
-  cudaMemcpy(mp->d_displ,h_displ,mp->NGLOB_AB*3*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_displ,h_displ,mp->NGLOB_AB*3*sizeof(realw),cudaMemcpyHostToDevice);
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -142,7 +142,7 @@
     num_blocks_y = num_blocks_y*2;
   }
   dim3 grid(num_blocks_x,num_blocks_y,1);
-  dim3 threads(25,1,1);
+  dim3 threads(NGLL2,1,1);
 
   transfer_surface_to_host_kernel<<<grid,threads>>>(mp->d_free_surface_ispec,
                                                     mp->d_free_surface_ijk,
@@ -152,7 +152,7 @@
                                                     mp->d_noise_surface_movie);
 
   cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie,
-             3*25*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
+             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("transfer_surface_to_host");
@@ -170,8 +170,7 @@
                                                          realw* normal_y_noise,
                                                          realw* normal_z_noise,
                                                          realw* mask_noise,
-                                                         realw* free_surface_jacobian2Dw //,float* d_debug
-                                                         ) {
+                                                         realw* free_surface_jacobian2Dw) {
 
   int iface = blockIdx.x + gridDim.x*blockIdx.y; // surface element id
 
@@ -181,7 +180,7 @@
 
     int igll = threadIdx.x;
 
-    int ipoin = 25*iface + igll;
+    int ipoin = NGLL2*iface + igll;
     int i=free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
     int j=free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
     int k=free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
@@ -208,9 +207,9 @@
     // 0x203000c8 is out of bounds
 
     // non atomic version for speed testing -- atomic updates are needed for correctness
-    // accel[3*iglob] +=   eta*mask_noise[ipoin] * normal_x * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + 25*ispec2D];
-    // accel[3*iglob+1] += eta*mask_noise[ipoin] * normal_y * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + 25*ispec2D];
-    // accel[3*iglob+2] += eta*mask_noise[ipoin] * normal_z * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + 25*ispec2D];
+    // accel[3*iglob] +=   eta*mask_noise[ipoin] * normal_x * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
+    // accel[3*iglob+1] += eta*mask_noise[ipoin] * normal_y * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
+    // accel[3*iglob+2] += eta*mask_noise[ipoin] * normal_z * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
 
     // Fortran version in SVN -- note deletion of wgllwgll_xy?
     // accel(1,iglob) = accel(1,iglob) + eta * mask_noise(ipoin) * normal_x_noise(ipoin) &
@@ -220,13 +219,13 @@
     // accel(3,iglob) = accel(3,iglob) + eta * mask_noise(ipoin) * normal_z_noise(ipoin) &
     // * free_surface_jacobian2Dw(igll,iface) ! wgllwgll_xy(i,j) * jacobian2D_top(i,j,iface)
 
-    // atomicAdd(&accel[iglob*3]  ,eta*mask_noise[ipoin]*normal_x*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+25*iface]);
-    // atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+25*iface]);
-    // atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+25*iface]);
+    // atomicAdd(&accel[iglob*3]  ,eta*mask_noise[ipoin]*normal_x*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    // atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    // atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
 
-    atomicAdd(&accel[iglob*3]  ,eta*mask_noise[ipoin]*normal_x*free_surface_jacobian2Dw[igll+25*iface]);
-    atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*free_surface_jacobian2Dw[igll+25*iface]);
-    atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*free_surface_jacobian2Dw[igll+25*iface]);
+    atomicAdd(&accel[iglob*3]  ,eta*mask_noise[ipoin]*normal_x*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*free_surface_jacobian2Dw[igll+NGLL2*iface]);
 
   }
 }
@@ -245,13 +244,8 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   int NOISE_TOMOGRAPHY = *NOISE_TOMOGRAPHYf;
 
-  //float* d_noise_surface_movie;
-  //cudaMalloc((void**)&d_noise_surface_movie,3*25*num_free_surface_faces*sizeof(float));
-  //cudaMemcpy(d_noise_surface_movie, h_noise_surface_movie,
-  //           3*25*num_free_surface_faces*sizeof(realw),cudaMemcpyHostToDevice);
-
   cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
-             3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice);
+             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
 
   int num_blocks_x = mp->num_free_surface_faces;
   int num_blocks_y = 1;
@@ -260,13 +254,8 @@
     num_blocks_y = num_blocks_y*2;
   }
   dim3 grid(num_blocks_x,num_blocks_y,1);
-  dim3 threads(25,1,1);
+  dim3 threads(NGLL2,1,1);
 
-  // float* h_debug = (float*)calloc(128,sizeof(float));
-  //float* d_debug;
-  // cudaMalloc((void**)&d_debug,128*sizeof(float));
-  // cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-
   if(NOISE_TOMOGRAPHY == 2) { // add surface source to forward field
     noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_accel,
                                                                mp->d_ibool,
@@ -278,8 +267,7 @@
                                                                mp->d_normal_y_noise,
                                                                mp->d_normal_z_noise,
                                                                mp->d_mask_noise,
-                                                               mp->d_free_surface_jacobian2Dw //,d_debug
-                                                               );
+                                                               mp->d_free_surface_jacobian2Dw);
   }
   else if(NOISE_TOMOGRAPHY == 3) { // add surface source to adjoint (backward) field
     noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
@@ -292,16 +280,9 @@
                                                                mp->d_normal_y_noise,
                                                                mp->d_normal_z_noise,
                                                                mp->d_mask_noise,
-                                                               mp->d_free_surface_jacobian2Dw //,d_debug
-                                                               );
+                                                               mp->d_free_surface_jacobian2Dw);
   }
 
-  // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
-  // for(int i=0;i<8;i++) {
-  // printf("debug[%d]= %e\n",i,h_debug[i]);
-  // }
-  // MPI_Abort(MPI_COMM_WORLD,1);
-  //cudaFree(d_noise_surface_movie);
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("noise_read_add_surface_movie_cuda_kernel");
 #endif

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,22 +29,24 @@
 #ifndef CUDA_HEADER_H
 #define CUDA_HEADER_H
 
+typedef float realw;  // type of "working" variables
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // setters for these const arrays (very ugly hack, but will have to do)
 
 // elastic
-void setConst_hprime_xx(float* array,Mesh* mp);
-void setConst_hprime_yy(float* array,Mesh* mp);
-void setConst_hprime_zz(float* array,Mesh* mp);
+void setConst_hprime_xx(realw* array,Mesh* mp);
+void setConst_hprime_yy(realw* array,Mesh* mp);
+void setConst_hprime_zz(realw* array,Mesh* mp);
 
-void setConst_hprimewgll_xx(float* array,Mesh* mp);
-void setConst_hprimewgll_yy(float* array,Mesh* mp);
-void setConst_hprimewgll_zz(float* array,Mesh* mp);
+void setConst_hprimewgll_xx(realw* array,Mesh* mp);
+void setConst_hprimewgll_yy(realw* array,Mesh* mp);
+void setConst_hprimewgll_zz(realw* array,Mesh* mp);
 
-void setConst_wgllwgll_xy(float* array,Mesh* mp);
-void setConst_wgllwgll_xz(float* array, Mesh* mp);
-void setConst_wgllwgll_yz(float* array, Mesh* mp);
+void setConst_wgllwgll_xy(realw* array,Mesh* mp);
+void setConst_wgllwgll_xz(realw* array, Mesh* mp);
+void setConst_wgllwgll_yz(realw* array, Mesh* mp);
 
 /* ----------------------------------------------------------------------------------------------- */
 
@@ -52,21 +54,21 @@
 
 #ifdef USE_TEXTURES
   // declaration of textures
-  texture<float, 1, cudaReadModeElementType> tex_displ;
-  texture<float, 1, cudaReadModeElementType> tex_accel;
+  texture<realw, 1, cudaReadModeElementType> tex_displ;
+  texture<realw, 1, cudaReadModeElementType> tex_accel;
 
-  texture<float, 1, cudaReadModeElementType> tex_potential_acoustic;
-  texture<float, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
+  texture<realw, 1, cudaReadModeElementType> tex_potential_acoustic;
+  texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
 
   // for binding the textures
 
-  void bindTexturesDispl(float* d_displ)
+  void bindTexturesDispl(realw* d_displ)
   {
     cudaError_t err;
 
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<float>();
+    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
 
-    err = cudaBindTexture(NULL,tex_displ, d_displ, channelDescFloat, NDIM*NGLOB*sizeof(float));
+    err = cudaBindTexture(NULL,tex_displ, d_displ, channelDescFloat, NDIM*NGLOB*sizeof(realw));
     if (err != cudaSuccess)
     {
       fprintf(stderr, "Error in bindTexturesDispl for displ: %s\n", cudaGetErrorString(err));
@@ -74,13 +76,13 @@
     }
   }
 
-  void bindTexturesAccel(float* d_accel)
+  void bindTexturesAccel(realw* d_accel)
   {
     cudaError_t err;
 
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<float>();
+    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
 
-    err = cudaBindTexture(NULL,tex_accel, d_accel, channelDescFloat, NDIM*NGLOB*sizeof(float));
+    err = cudaBindTexture(NULL,tex_accel, d_accel, channelDescFloat, NDIM*NGLOB*sizeof(realw));
     if (err != cudaSuccess)
     {
       fprintf(stderr, "Error in bindTexturesAccel for accel: %s\n", cudaGetErrorString(err));
@@ -88,14 +90,14 @@
     }
   }
 
-  void bindTexturesPotential(float* d_potential_acoustic)
+  void bindTexturesPotential(realw* d_potential_acoustic)
   {
     cudaError_t err;
 
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<float>();
+    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
 
     err = cudaBindTexture(NULL,tex_potential_acoustic, d_potential_acoustic,
-                          channelDescFloat, NGLOB*sizeof(float));
+                          channelDescFloat, NGLOB*sizeof(realw));
     if (err != cudaSuccess)
     {
       fprintf(stderr, "Error in bindTexturesPotential for potential_acoustic: %s\n", cudaGetErrorString(err));
@@ -103,14 +105,14 @@
     }
   }
 
-  void bindTexturesPotential_dot_dot(float* d_potential_dot_dot_acoustic)
+  void bindTexturesPotential_dot_dot(realw* d_potential_dot_dot_acoustic)
   {
     cudaError_t err;
 
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<float>();
+    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
 
     err = cudaBindTexture(NULL,tex_potential_dot_dot_acoustic, d_potential_dot_dot_acoustic,
-                          channelDescFloat, NGLOB*sizeof(float));
+                          channelDescFloat, NGLOB*sizeof(realw));
     if (err != cudaSuccess)
     {
       fprintf(stderr, "Error in bindTexturesPotential_dot_dot for potential_dot_dot_acoustic: %s\n", cudaGetErrorString(err));

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -205,7 +205,7 @@
 
 extern "C"
 void FC_FUNC_(get_free_device_memory,
-              get_FREE_DEVICE_MEMORY)(float* free, float* used, float* total ) {
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {
 TRACE("get_free_device_memory");
 
   double free_db,used_db,total_db;
@@ -213,9 +213,9 @@
   get_free_memory(&free_db,&used_db,&total_db);
 
   // converts to MB
-  *free = (float) free_db/1024.0/1024.0;
-  *used = (float) used_db/1024.0/1024.0;
-  *total = (float) total_db/1024.0/1024.0;
+  *free = (realw) free_db/1024.0/1024.0;
+  *used = (realw) used_db/1024.0/1024.0;
+  *total = (realw) total_db/1024.0/1024.0;
   return;
 }
 
@@ -424,27 +424,27 @@
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
                                         int* h_NGLLX,
                                         int* NSPEC_AB, int* NGLOB_AB,
-                                        float* h_xix, float* h_xiy, float* h_xiz,
-                                        float* h_etax, float* h_etay, float* h_etaz,
-                                        float* h_gammax, float* h_gammay, float* h_gammaz,
-                                        float* h_kappav, float* h_muv,
+                                        realw* h_xix, realw* h_xiy, realw* h_xiz,
+                                        realw* h_etax, realw* h_etay, realw* h_etaz,
+                                        realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+                                        realw* h_kappav, realw* h_muv,
                                         int* h_ibool,
                                         int* num_interfaces_ext_mesh,
                                         int* max_nibool_interfaces_ext_mesh,
                                         int* h_nibool_interfaces_ext_mesh,
                                         int* h_ibool_interfaces_ext_mesh,
-                                        float* h_hprime_xx,float* h_hprime_yy,float* h_hprime_zz,
-                                        float* h_hprimewgll_xx,float* h_hprimewgll_yy,float* h_hprimewgll_zz,
-                                        float* h_wgllwgll_xy,float* h_wgllwgll_xz,float* h_wgllwgll_yz,
+                                        realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
+                                        realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
+                                        realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
                                         int* ABSORBING_CONDITIONS,
                                         int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
-                                        float* h_abs_boundary_normal,
-                                        float* h_abs_boundary_jacobian2Dw,
+                                        realw* h_abs_boundary_normal,
+                                        realw* h_abs_boundary_jacobian2Dw,
                                         int* h_num_abs_boundary_faces,
                                         int* h_ispec_is_inner,
                                         int* NSOURCES,
                                         int* nsources_local_f,
-                                        float* h_sourcearrays,
+                                        realw* h_sourcearrays,
                                         int* h_islice_selected_source,
                                         int* h_ispec_selected_source,
                                         int* h_number_receiver_global,
@@ -454,11 +454,10 @@
                                         int* SIMULATION_TYPE,
                                         int* USE_MESH_COLORING_GPU_f,
                                         int* nspec_acoustic,int* nspec_elastic,
-                                        int* ncuda_devices) {
+                                        int* myrank_f,int* ncuda_devices) {
 
 TRACE("prepare_constants_device");
 
-  int procid;
   int device_count = 0;
 
   // cuda initialization (needs -lcuda library)
@@ -473,12 +472,12 @@
   *ncuda_devices = device_count;
 
   // Gets rank number of MPI process
-  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
+  int myrank = *myrank_f;
 
   // Sets the active device
   if(device_count > 1) {
     // generalized for more GPUs per node
-    cudaSetDevice((procid)%device_count);
+    cudaSetDevice( myrank%device_count );
     exit_on_cuda_error("cudaSetDevice");
   }
 
@@ -508,65 +507,67 @@
   setConst_wgllwgll_yz(h_wgllwgll_yz,mp);
 
   /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
-  int size_padded = 128 * (mp->NSPEC_AB);
-  int size = 125 * (mp->NSPEC_AB);
+  int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
+  //int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
 
   // mesh
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(float)),1001);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(float)),1002);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(float)),1003);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax, size_padded*sizeof(float)),1004);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay, size_padded*sizeof(float)),1005);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz, size_padded*sizeof(float)),1006);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax, size_padded*sizeof(float)),1007);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay, size_padded*sizeof(float)),1008);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz, size_padded*sizeof(float)),1009);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappav, size_padded*sizeof(float)),1010);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muv, size_padded*sizeof(float)),1011);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(realw)),1001);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(realw)),1002);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(realw)),1003);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax, size_padded*sizeof(realw)),1004);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay, size_padded*sizeof(realw)),1005);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz, size_padded*sizeof(realw)),1006);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax, size_padded*sizeof(realw)),1007);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay, size_padded*sizeof(realw)),1008);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz, size_padded*sizeof(realw)),1009);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappav, size_padded*sizeof(realw)),1010);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muv, size_padded*sizeof(realw)),1011);
 
   // transfer constant element data with padding
   for(int i=0;i < mp->NSPEC_AB;i++) {
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_xix + i*128, &h_xix[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),1501);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy+i*128,   &h_xiy[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),1502);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiz+i*128,   &h_xiz[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),1503);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_etax+i*128,  &h_etax[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),1504);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_etay+i*128,  &h_etay[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),1505);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_etaz+i*128,  &h_etaz[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),1506);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammax+i*128,&h_gammax[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),1507);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammay+i*128,&h_gammay[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),1508);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaz+i*128,&h_gammaz[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),1509);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_kappav+i*128,&h_kappav[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),1510);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_muv+i*128,   &h_muv[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),1511);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xix + i*NGLL3_PADDED, &h_xix[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1501);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy+i*NGLL3_PADDED,   &h_xiy[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1502);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiz+i*NGLL3_PADDED,   &h_xiz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1503);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etax+i*NGLL3_PADDED,  &h_etax[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1504);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etay+i*NGLL3_PADDED,  &h_etay[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1505);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etaz+i*NGLL3_PADDED,  &h_etaz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1506);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammax+i*NGLL3_PADDED,&h_gammax[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1507);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammay+i*NGLL3_PADDED,&h_gammay[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1508);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaz+i*NGLL3_PADDED,&h_gammaz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1509);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_kappav+i*NGLL3_PADDED,&h_kappav[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1510);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_muv+i*NGLL3_PADDED,   &h_muv[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1511);
   }
 
   // global indexing
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool,size_padded*sizeof(int)),1021);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool, h_ibool,
-                                     size*sizeof(int),cudaMemcpyHostToDevice),1022);
+                                     NGLL3*(mp->NSPEC_AB)*sizeof(int),cudaMemcpyHostToDevice),1022);
 
 
   // prepare interprocess-edge exchange information
-  if( *num_interfaces_ext_mesh > 0 ){
+  mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh;
+  mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh;
+  if( mp->num_interfaces_ext_mesh > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_ext_mesh,
-                                       (*num_interfaces_ext_mesh)*sizeof(int)),1201);
+                                       (mp->num_interfaces_ext_mesh)*sizeof(int)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
-                                       (*num_interfaces_ext_mesh)*sizeof(int),cudaMemcpyHostToDevice),1202);
+                                       (mp->num_interfaces_ext_mesh)*sizeof(int),cudaMemcpyHostToDevice),1202);
 
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_ext_mesh,
-                                       (*num_interfaces_ext_mesh)*(*max_nibool_interfaces_ext_mesh)*sizeof(int)),1203);
+                                       (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int)),1203);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
-                                       (*num_interfaces_ext_mesh)*(*max_nibool_interfaces_ext_mesh)*sizeof(int),
+                                       (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int),
                                        cudaMemcpyHostToDevice),1204);
   }
 
@@ -592,21 +593,21 @@
 
 
     print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ijk),
-                                       3*25*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103);
+                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk,
-                                       3*25*(mp->d_num_abs_boundary_faces)*sizeof(int),
+                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int),
                                        cudaMemcpyHostToDevice),1104);
 
     print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_normal),
-                                       3*25*(mp->d_num_abs_boundary_faces)*sizeof(float)),1105);
+                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1105);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal,
-                                       3*25*(mp->d_num_abs_boundary_faces)*sizeof(float),
+                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw),
                                        cudaMemcpyHostToDevice),1106);
 
     print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_jacobian2Dw),
-                                       25*(mp->d_num_abs_boundary_faces)*sizeof(float)),1107);
+                                       NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1107);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw,
-                                       25*(mp->d_num_abs_boundary_faces)*sizeof(float),
+                                       NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw),
                                        cudaMemcpyHostToDevice),1108);
   }
 
@@ -615,9 +616,9 @@
   if (*SIMULATION_TYPE == 1  || *SIMULATION_TYPE == 3){
     // not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2)
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_sourcearrays,
-                                       sizeof(float)* *NSOURCES*3*125),1301);
+                                       sizeof(realw)* *NSOURCES*3*NGLL3),1301);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_sourcearrays, h_sourcearrays,
-                                       sizeof(float)* *NSOURCES*3*125,cudaMemcpyHostToDevice),1302);
+                                       sizeof(realw)* *NSOURCES*3*NGLL3,cudaMemcpyHostToDevice),1302);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_stf_pre_compute,
                                        *NSOURCES*sizeof(double)),1303);
@@ -700,7 +701,8 @@
   mp->nadj_rec_local = *nadj_rec_local;
   if( mp->nadj_rec_local > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
-                                       (mp->nadj_rec_local)*3*125*sizeof(float)),7003);
+                                       (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),7003);
+                                       
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
                                        (mp->nadj_rec_local)*sizeof(int)),7004);
 
@@ -725,7 +727,7 @@
     free(h_pre_computed_irec);
 
     // temporary array to prepare extracted source array values
-    mp->h_adj_sourcearrays_slice = (float*) malloc( (mp->nadj_rec_local)*3*125*sizeof(float) );
+    mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
     if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
 
   }
@@ -744,9 +746,9 @@
 extern "C"
 void FC_FUNC_(prepare_fields_acoustic_device,
               PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
-                                              float* rmass_acoustic,
-                                              float* rhostore,
-                                              float* kappastore,
+                                              realw* rmass_acoustic,
+                                              realw* rhostore,
+                                              realw* kappastore,
                                               int* num_phase_ispec_acoustic,
                                               int* phase_ispec_inner_acoustic,
                                               int* ispec_is_acoustic,
@@ -756,13 +758,13 @@
                                               int* free_surface_ijk,
                                               int* ABSORBING_CONDITIONS,
                                               int* b_reclen_potential,
-                                              float* b_absorb_potential,
+                                              realw* b_absorb_potential,
                                               int* ELASTIC_SIMULATION,
                                               int* num_coupling_ac_el_faces,
                                               int* coupling_ac_el_ispec,
                                               int* coupling_ac_el_ijk,
-                                              float* coupling_ac_el_normal,
-                                              float* coupling_ac_el_jacobian2Dw,
+                                              realw* coupling_ac_el_normal,
+                                              realw* coupling_ac_el_jacobian2Dw,
                                               int* num_colors_outer_acoustic,
                                               int* num_colors_inner_acoustic,
                                               int* num_elem_colors_acoustic) {
@@ -771,32 +773,36 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
   /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
-  int size_padded = 128 * mp->NSPEC_AB;
-  int size_nonpadded = 125 * mp->NSPEC_AB;
-  int size = mp->NGLOB_AB;
+  int size_padded = NGLL3_PADDED * mp->NSPEC_AB;
+  int size_nonpadded = NGLL3 * mp->NSPEC_AB;
+  int size_glob = mp->NGLOB_AB;
 
   // allocates arrays on device (GPU)
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(float)*size),9001);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(float)*size),9002);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(float)*size),9003);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),sizeof(float)*size),9004);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(float)*size),9005);
-  // padded array
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(float)),9006);
-  // non-padded array
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(float)),9007);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size_glob),9001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size_glob),9002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),9003);
 
-  // transfer element data
+  // mpi buffer
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer), 
+                      (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),9004);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),9005);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
-                                     sizeof(float)*size,cudaMemcpyHostToDevice),9100);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
-                                     size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),9105);
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),9100);
+
+  // padded array
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),9006);
   // transfer constant element data with padding
   for(int i=0; i < mp->NSPEC_AB; i++) {
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*128, &rhostore[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),9106);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*NGLL3_PADDED, &rhostore[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),9106);
   }
 
+  // non-padded array
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),9007);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),9105);
+
   // phase elements
   mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic),
@@ -820,9 +826,9 @@
                                        mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9203);
 
       print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
-                                       3*25*mp->num_free_surface_faces*sizeof(int)),9202);
+                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int)),9202);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-                                       3*25*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9204);
+                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9204);
     }
   }
 
@@ -838,8 +844,9 @@
   // for seismograms
   if( mp->nrec_local > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
-                                       mp->nrec_local*125*sizeof(float)),9107);
-    mp->h_station_seismo_potential = (float*) malloc( mp->nrec_local*125*sizeof(float) );
+                                       mp->nrec_local*NGLL3*sizeof(realw)),9107);
+
+    mp->h_station_seismo_potential = (realw*) malloc( mp->nrec_local*NGLL3*sizeof(realw) );
     if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
   }
 
@@ -852,19 +859,19 @@
                                        (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9602);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk),
-                                       3*25*(*num_coupling_ac_el_faces)*sizeof(int)),9603);
+                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),9603);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,
-                                       3*25*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9604);
+                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9604);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal),
-                                        3*25*(*num_coupling_ac_el_faces)*sizeof(float)),9605);
+                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),9605);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
-                                        3*25*(*num_coupling_ac_el_faces)*sizeof(float),cudaMemcpyHostToDevice),9606);
+                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),9606);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw),
-                                        25*(*num_coupling_ac_el_faces)*sizeof(float)),9607);
+                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),9607);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
-                                        25*(*num_coupling_ac_el_faces)*sizeof(float),cudaMemcpyHostToDevice),9608);
+                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),9608);
 
   }
 
@@ -893,32 +900,32 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
 
-  int size = mp->NGLOB_AB;
+  int size_glob = mp->NGLOB_AB;
 
   // kernel simulations
   if( *SIMULATION_TYPE != 3 ) return;
 
   // allocates backward/reconstructed arrays on device (GPU)
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(float)*size),9014);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(float)*size),9015);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(float)*size),9016);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),9014);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size_glob),9015);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size_glob),9016);
 
   // allocates kernels
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),125*mp->NSPEC_AB*sizeof(float)),9017);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),125*mp->NSPEC_AB*sizeof(float)),9018);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9017);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9018);
 
   // initializes kernel values to zero
   print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,
-                                     125*mp->NSPEC_AB*sizeof(float)),9019);
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),9019);
   print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,
-                                     125*mp->NSPEC_AB*sizeof(float)),9020);
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),9020);
 
   // preconditioner
   if( *APPROXIMATE_HESS_KL ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),125*mp->NSPEC_AB*sizeof(float)),9030);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9030);
     // initializes with zeros
     print_CUDA_error_if_any(cudaMemset(mp->d_hess_ac_kl,0,
-                                       125*mp->NSPEC_AB*sizeof(float)),9031);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),9031);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -937,54 +944,78 @@
 void FC_FUNC_(prepare_fields_elastic_device,
               PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
                                              int* size,
-                                             float* rmass,
-                                             float* rho_vp,
-                                             float* rho_vs,
+                                             realw* rmass,
+                                             realw* rho_vp,
+                                             realw* rho_vs,
                                              int* num_phase_ispec_elastic,
                                              int* phase_ispec_inner_elastic,
                                              int* ispec_is_elastic,
                                              int* ABSORBING_CONDITIONS,
-                                             float* h_b_absorb_field,
+                                             realw* h_b_absorb_field,
                                              int* h_b_reclen_field,
                                              int* SIMULATION_TYPE,int* SAVE_FORWARD,
                                              int* COMPUTE_AND_STORE_STRAIN,
-                                             float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
-                                             float* epsilondev_xz,float* epsilondev_yz,
+                                             realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                             realw* epsilondev_xz,realw* epsilondev_yz,
                                              int* ATTENUATION,
                                              int* R_size,
-                                             float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
-                                             float* one_minus_sum_beta,float* factor_common,
-                                             float* alphaval,float* betaval,float* gammaval,
+                                             realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                             realw* one_minus_sum_beta,realw* factor_common,
+                                             realw* alphaval,realw* betaval,realw* gammaval,
                                              int* OCEANS,
-                                             float* rmass_ocean_load,
+                                             realw* rmass_ocean_load,
                                              int* NOISE_TOMOGRAPHY,
-                                             float* free_surface_normal,
+                                             realw* free_surface_normal,
                                              int* free_surface_ispec,
                                              int* free_surface_ijk,
                                              int* num_free_surface_faces,
                                              int* ACOUSTIC_SIMULATION,
                                              int* num_colors_outer_elastic,
                                              int* num_colors_inner_elastic,
-                                             int* num_elem_colors_elastic){
+                                             int* num_elem_colors_elastic,
+                                             int* ANISOTROPY,
+                                             realw *c11store,
+                                             realw *c12store,
+                                             realw *c13store,
+                                             realw *c14store,
+                                             realw *c15store,
+                                             realw *c16store,
+                                             realw *c22store,
+                                             realw *c23store,
+                                             realw *c24store,
+                                             realw *c25store,
+                                             realw *c26store,
+                                             realw *c33store,
+                                             realw *c34store,
+                                             realw *c35store,
+                                             realw *c36store,
+                                             realw *c44store,
+                                             realw *c45store,
+                                             realw *c46store,
+                                             realw *c55store,
+                                             realw *c56store,
+                                             realw *c66store){
 
 TRACE("prepare_fields_elastic_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
   /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
-  //int size_padded = 128 * mp->NSPEC_AB;
-  int size_nonpadded = 125 * mp->NSPEC_AB;
+  int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
+  int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(float)*(*size)),8001);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(float)*(*size)),8002);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(float)*(*size)),8003);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),8001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),8002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),8003);
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),sizeof(float)*(*size)),8004);
+  // mpi buffer
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
+                        3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),8004);
 
   // mass matrix
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(float)*mp->NGLOB_AB),8005);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),8005);
   // transfer element data
   print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
-                                     sizeof(float)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8010);
+                                     sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8010);
 
 
   // element indices
@@ -1008,22 +1039,23 @@
   // for seismograms
   if( mp->nrec_local > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
-                                     3*125*(mp->nrec_local)*sizeof(float)),8015);
-    mp->h_station_seismo_field = (float*) malloc( 3*125*(mp->nrec_local)*sizeof(float) );
+                                     3*NGLL3*(mp->nrec_local)*sizeof(realw)),8015);
+
+    mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
     if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
   }
 
   // absorbing conditions
   if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){
     // non-padded arrays
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(float)),8006);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(float)),8007);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),8006);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),8007);
 
     // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
     print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
-                                       size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),8013);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8013);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
-                                       size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),8014);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8014);
 
     // absorb_field array used for file i/o
     if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )){
@@ -1038,27 +1070,27 @@
   // strains used for attenuation and kernel simulations
   if( *COMPUTE_AND_STORE_STRAIN ){
     // strains
-    int epsilondev_size = 125*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+    int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
 
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
-                                       epsilondev_size*sizeof(float)),8301);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(float),
+                                       epsilondev_size*sizeof(realw)),8301);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw),
                                        cudaMemcpyHostToDevice),8302);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
-                                       epsilondev_size*sizeof(float)),8302);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(float),
+                                       epsilondev_size*sizeof(realw)),8302);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw),
                                        cudaMemcpyHostToDevice),8303);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
-                                       epsilondev_size*sizeof(float)),8304);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(float),
+                                       epsilondev_size*sizeof(realw)),8304);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw),
                                        cudaMemcpyHostToDevice),8305);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
-                                       epsilondev_size*sizeof(float)),8306);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(float),
+                                       epsilondev_size*sizeof(realw)),8306);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw),
                                        cudaMemcpyHostToDevice),8307);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
-                                       epsilondev_size*sizeof(float)),8308);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(float),
+                                       epsilondev_size*sizeof(realw)),8308);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw),
                                        cudaMemcpyHostToDevice),8309);
 
   }
@@ -1067,74 +1099,167 @@
   if( *ATTENUATION ){
     // memory arrays
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
-                                       (*R_size)*sizeof(float)),8401);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(float),
+                                       (*R_size)*sizeof(realw)),8401);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw),
                                        cudaMemcpyHostToDevice),8402);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
-                                       (*R_size)*sizeof(float)),8403);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(float),
+                                       (*R_size)*sizeof(realw)),8403);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw),
                                        cudaMemcpyHostToDevice),8404);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
-                                       (*R_size)*sizeof(float)),8405);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(float),
+                                       (*R_size)*sizeof(realw)),8405);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw),
                                        cudaMemcpyHostToDevice),8406);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
-                                       (*R_size)*sizeof(float)),8407);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(float),
+                                       (*R_size)*sizeof(realw)),8407);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw),
                                        cudaMemcpyHostToDevice),8408);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
-                                       (*R_size)*sizeof(float)),8409);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(float),
+                                       (*R_size)*sizeof(realw)),8409);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw),
                                        cudaMemcpyHostToDevice),8410);
 
     // attenuation factors
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
-                                       125*mp->NSPEC_AB*sizeof(float)),8430);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),8430);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
-                                       125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8431);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8431);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
-                                       N_SLS*125*mp->NSPEC_AB*sizeof(float)),8432);
+                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),8432);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
-                                       N_SLS*125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8433);
+                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8433);
 
     // alpha,beta,gamma factors
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
-                                       N_SLS*sizeof(float)),8434);
+                                       N_SLS*sizeof(realw)),8434);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
-                                       N_SLS*sizeof(float),cudaMemcpyHostToDevice),8435);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8435);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
-                                       N_SLS*sizeof(float)),8436);
+                                       N_SLS*sizeof(realw)),8436);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
-                                       N_SLS*sizeof(float),cudaMemcpyHostToDevice),8437);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8437);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
-                                       N_SLS*sizeof(float)),8438);
+                                       N_SLS*sizeof(realw)),8438);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
-                                       N_SLS*sizeof(float),cudaMemcpyHostToDevice),8439);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8439);
 
   }
 
+  // anisotropy
+  if( *ANISOTROPY ){
+    // allocates memory on GPU
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),
+                                       size_padded*sizeof(realw)),8700);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),
+                                       size_padded*sizeof(realw)),8701);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),
+                                       size_padded*sizeof(realw)),8702);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),
+                                       size_padded*sizeof(realw)),8703);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),
+                                       size_padded*sizeof(realw)),8704);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),
+                                       size_padded*sizeof(realw)),8705);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),
+                                       size_padded*sizeof(realw)),8706);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),
+                                       size_padded*sizeof(realw)),8707);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),
+                                       size_padded*sizeof(realw)),8708);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),
+                                       size_padded*sizeof(realw)),8709);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),
+                                       size_padded*sizeof(realw)),8710);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),
+                                       size_padded*sizeof(realw)),8711);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),
+                                       size_padded*sizeof(realw)),8712);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),
+                                       size_padded*sizeof(realw)),8713);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),
+                                       size_padded*sizeof(realw)),8714);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),
+                                       size_padded*sizeof(realw)),8715);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),
+                                       size_padded*sizeof(realw)),8716);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),
+                                       size_padded*sizeof(realw)),8717);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),
+                                       size_padded*sizeof(realw)),8718);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),
+                                       size_padded*sizeof(realw)),8719);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),
+                                       size_padded*sizeof(realw)),8720);
 
+    // transfer constant element data with padding
+    for(int i=0;i < mp->NSPEC_AB;i++) {
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c11store + i*NGLL3_PADDED, &c11store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8800);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c12store + i*NGLL3_PADDED, &c12store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8801);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c13store + i*NGLL3_PADDED, &c13store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8802);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c14store + i*NGLL3_PADDED, &c14store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8803);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c15store + i*NGLL3_PADDED, &c15store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8804);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c16store + i*NGLL3_PADDED, &c16store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8805);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c22store + i*NGLL3_PADDED, &c22store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8806);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c23store + i*NGLL3_PADDED, &c23store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8807);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c24store + i*NGLL3_PADDED, &c24store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8808);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c25store + i*NGLL3_PADDED, &c25store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8809);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c26store + i*NGLL3_PADDED, &c26store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8810);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c33store + i*NGLL3_PADDED, &c33store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8811);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c34store + i*NGLL3_PADDED, &c34store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8812);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c35store + i*NGLL3_PADDED, &c35store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8813);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c36store + i*NGLL3_PADDED, &c36store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8814);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c44store + i*NGLL3_PADDED, &c44store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8815);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c45store + i*NGLL3_PADDED, &c45store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8816);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c46store + i*NGLL3_PADDED, &c46store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8817);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c55store + i*NGLL3_PADDED, &c55store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8818);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c56store + i*NGLL3_PADDED, &c56store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8819);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c66store + i*NGLL3_PADDED, &c66store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8820);
+    }
+  }
+
+  // ocean load approximation
   if( *OCEANS ){
     // oceans needs a free surface
     mp->num_free_surface_faces = *num_free_surface_faces;
     if( mp->num_free_surface_faces > 0 ){
       // mass matrix
       print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),
-                                         sizeof(float)*mp->NGLOB_AB),8501);
+                                         sizeof(realw)*mp->NGLOB_AB),8501);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
-                                         sizeof(float)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8502);
+                                         sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8502);
       // surface normal
       print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
-                                         3*25*(mp->num_free_surface_faces)*sizeof(float)),8503);
+                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),8503);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
-                                         3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice),8504);
+                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),8504);
 
       // temporary global array: used to synchronize updates on global accel array
       print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load),
@@ -1142,13 +1267,13 @@
 
       if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){
         print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
-                                          mp->num_free_surface_faces*sizeof(int)),9201);
+                                          mp->num_free_surface_faces*sizeof(int)),8601);
         print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
-                                          mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9203);
+                                          mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),8603);
         print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
-                                          3*25*mp->num_free_surface_faces*sizeof(int)),9202);
+                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int)),8602);
         print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-                                          3*25*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9204);
+                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),8604);
       }
     }
   }
@@ -1173,14 +1298,14 @@
                                              int* size,
                                              int* SIMULATION_TYPE,
                                              int* COMPUTE_AND_STORE_STRAIN,
-                                             float* epsilon_trace_over_3,
-                                             float* b_epsilondev_xx,float* b_epsilondev_yy,float* b_epsilondev_xy,
-                                             float* b_epsilondev_xz,float* b_epsilondev_yz,
-                                             float* b_epsilon_trace_over_3,
+                                             realw* epsilon_trace_over_3,
+                                             realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                             realw* b_epsilon_trace_over_3,
                                              int* ATTENUATION,
                                              int* R_size,
-                                             float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
-                                             float* b_alphaval,float* b_betaval,float* b_gammaval,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                             realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
                                              int* APPROXIMATE_HESS_KL){
 
   TRACE("prepare_fields_elastic_adj_dev");
@@ -1192,111 +1317,111 @@
 
   // kernel simulations
   // allocates backward/reconstructed arrays on device (GPU)
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(float)*(*size)),8201);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(float)*(*size)),8202);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(float)*(*size)),8203);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*(*size)),8201);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*(*size)),8202);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*(*size)),8203);
 
   // allocates kernels
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),125*mp->NSPEC_AB*sizeof(float)),8204);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),125*mp->NSPEC_AB*sizeof(float)),8205);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),125*mp->NSPEC_AB*sizeof(float)),8206);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8204);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8205);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8206);
 
   // initializes kernel values to zero
   print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,
-                                     125*mp->NSPEC_AB*sizeof(float)),8207);
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),8207);
   print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,
-                                     125*mp->NSPEC_AB*sizeof(float)),8208);
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),8208);
   print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,
-                                     125*mp->NSPEC_AB*sizeof(float)),8209);
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),8209);
 
   // strains used for attenuation and kernel simulations
   if( *COMPUTE_AND_STORE_STRAIN ){
     // strains
-    int epsilondev_size = 125*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+    int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
 
     // solid pressure
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
-                                       125*mp->NSPEC_AB*sizeof(float)),8310);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),8310);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
-                                       125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8311);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8311);
     // backward solid pressure
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
-                                       125*mp->NSPEC_AB*sizeof(float)),8312);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),8312);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
-                                       125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8313);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8313);
     // prepares backward strains
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
-                                       epsilondev_size*sizeof(float)),8321);
+                                       epsilondev_size*sizeof(realw)),8321);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
-                                       epsilondev_size*sizeof(float)),8322);
+                                       epsilondev_size*sizeof(realw)),8322);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
-                                       epsilondev_size*sizeof(float)),8323);
+                                       epsilondev_size*sizeof(realw)),8323);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
-                                       epsilondev_size*sizeof(float)),8324);
+                                       epsilondev_size*sizeof(realw)),8324);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
-                                       epsilondev_size*sizeof(float)),8325);
+                                       epsilondev_size*sizeof(realw)),8325);
 
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
-                                       epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8326);
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8326);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
-                                       epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8327);
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8327);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
-                                       epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8328);
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8328);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
-                                       epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8329);
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8329);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
-                                       epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8330);
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8330);
   }
 
   // attenuation memory variables
   if( *ATTENUATION ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx),
-                                       (*R_size)*sizeof(float)),8421);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(float),
+                                       (*R_size)*sizeof(realw)),8421);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(realw),
                                        cudaMemcpyHostToDevice),8422);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy),
-                                       (*R_size)*sizeof(float)),8423);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(float),
+                                       (*R_size)*sizeof(realw)),8423);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(realw),
                                        cudaMemcpyHostToDevice),8424);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy),
-                                       (*R_size)*sizeof(float)),8425);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(float),
+                                       (*R_size)*sizeof(realw)),8425);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(realw),
                                        cudaMemcpyHostToDevice),8426);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz),
-                                       (*R_size)*sizeof(float)),8427);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(float),
+                                       (*R_size)*sizeof(realw)),8427);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(realw),
                                        cudaMemcpyHostToDevice),8428);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz),
-                                       (*R_size)*sizeof(float)),8429);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(float),
+                                       (*R_size)*sizeof(realw)),8429);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(realw),
                                        cudaMemcpyHostToDevice),8420);
 
     // alpha,beta,gamma factors for backward fields
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval),
-                                       N_SLS*sizeof(float)),8434);
+                                       N_SLS*sizeof(realw)),8434);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
-                                       N_SLS*sizeof(float),cudaMemcpyHostToDevice),8435);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8435);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval),
-                                       N_SLS*sizeof(float)),8436);
+                                       N_SLS*sizeof(realw)),8436);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
-                                       N_SLS*sizeof(float),cudaMemcpyHostToDevice),8437);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8437);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval),
-                                       N_SLS*sizeof(float)),8438);
+                                       N_SLS*sizeof(realw)),8438);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
-                                       N_SLS*sizeof(float),cudaMemcpyHostToDevice),8439);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8439);
   }
 
   if( *APPROXIMATE_HESS_KL ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),125*mp->NSPEC_AB*sizeof(float)),8450);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8450);
     // initializes with zeros
     print_CUDA_error_if_any(cudaMemset(mp->d_hess_el_kl,0,
-                                       125*mp->NSPEC_AB*sizeof(float)),8451);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),8451);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1322,12 +1447,12 @@
                                            int* SIMULATION_TYPE,
                                            int* NOISE_TOMOGRAPHY,
                                            int* NSTEP,
-                                           float* noise_sourcearray,
-                                           float* normal_x_noise,
-                                           float* normal_y_noise,
-                                           float* normal_z_noise,
-                                           float* mask_noise,
-                                           float* free_surface_jacobian2Dw) {
+                                           realw* noise_sourcearray,
+                                           realw* normal_x_noise,
+                                           realw* normal_y_noise,
+                                           realw* normal_z_noise,
+                                           realw* mask_noise,
+                                           realw* free_surface_jacobian2Dw) {
 
   TRACE("prepare_fields_noise_device");
 
@@ -1342,56 +1467,56 @@
                                      mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4002);
 
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk,
-                                     3*25*mp->num_free_surface_faces*sizeof(int)),4003);
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4003);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-                                     3*25*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4004);
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4004);
 
   // alloc storage for the surface buffer to be copied
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie,
-                                     3*25*mp->num_free_surface_faces*sizeof(float)),4005);
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(realw)),4005);
 
   // prepares noise source array
   if( *NOISE_TOMOGRAPHY == 1 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
-                                       3*125*(*NSTEP)*sizeof(float)),4101);
+                                       3*NGLL3*(*NSTEP)*sizeof(realw)),4101);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
-                                       3*125*(*NSTEP)*sizeof(float),cudaMemcpyHostToDevice),4102);
+                                       3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),4102);
   }
 
   // prepares noise directions
   if( *NOISE_TOMOGRAPHY > 1 ){
-    int nface_size = 25*(*num_free_surface_faces);
+    int nface_size = NGLL2*(*num_free_surface_faces);
     // allocates memory on GPU
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
-                                       nface_size*sizeof(float)),4301);
+                                       nface_size*sizeof(realw)),4301);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
-                                       nface_size*sizeof(float)),4302);
+                                       nface_size*sizeof(realw)),4302);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
-                                       nface_size*sizeof(float)),4303);
+                                       nface_size*sizeof(realw)),4303);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise,
-                                       nface_size*sizeof(float)),4304);
+                                       nface_size*sizeof(realw)),4304);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
-                                       nface_size*sizeof(float)),4305);
+                                       nface_size*sizeof(realw)),4305);
     // transfers data onto GPU
     print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise,
-                                       nface_size*sizeof(float),cudaMemcpyHostToDevice),4306);
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),4306);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise,
-                                       nface_size*sizeof(float),cudaMemcpyHostToDevice),4307);
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),4307);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise,
-                                       nface_size*sizeof(float),cudaMemcpyHostToDevice),4308);
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),4308);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise,
-                                       nface_size*sizeof(float),cudaMemcpyHostToDevice),4309);
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),4309);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw,
-                                       nface_size*sizeof(float),cudaMemcpyHostToDevice),4310);
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),4310);
   }
 
   // prepares noise strength kernel
   if( *NOISE_TOMOGRAPHY == 3 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
-                                       125*(mp->NSPEC_AB)*sizeof(float)),4401);
+                                       NGLL3*(mp->NSPEC_AB)*sizeof(realw)),4401);
     // initializes kernel values to zero
     print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,
-                                       125*mp->NSPEC_AB*sizeof(float)),4403);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),4403);
 
   }
 
@@ -1419,6 +1544,7 @@
                                       int* NOISE_TOMOGRAPHY,
                                       int* COMPUTE_AND_STORE_STRAIN,
                                       int* ATTENUATION,
+                                      int* ANISOTROPY,
                                       int* OCEANS,
                                       int* APPROXIMATE_HESS_KL) {
 
@@ -1579,6 +1705,30 @@
       }
     }
 
+    if( *ANISOTROPY ){
+      cudaFree(mp->d_c11store);
+      cudaFree(mp->d_c12store);
+      cudaFree(mp->d_c13store);
+      cudaFree(mp->d_c14store);
+      cudaFree(mp->d_c15store);
+      cudaFree(mp->d_c16store);
+      cudaFree(mp->d_c22store);
+      cudaFree(mp->d_c23store);
+      cudaFree(mp->d_c24store);
+      cudaFree(mp->d_c25store);
+      cudaFree(mp->d_c26store);
+      cudaFree(mp->d_c33store);
+      cudaFree(mp->d_c34store);
+      cudaFree(mp->d_c35store);
+      cudaFree(mp->d_c36store);
+      cudaFree(mp->d_c44store);
+      cudaFree(mp->d_c45store);
+      cudaFree(mp->d_c46store);
+      cudaFree(mp->d_c55store);
+      cudaFree(mp->d_c56store);
+      cudaFree(mp->d_c66store);
+    }
+
     if( *OCEANS ){
       if( mp->num_free_surface_faces > 0 ){
         cudaFree(mp->d_rmass_ocean_load);

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2011-11-06 02:02:36 UTC (rev 19152)
@@ -1,67 +1,64 @@
 #include "config.h"
 #include <stdio.h>
 
-typedef float realw;
+typedef float realw; /* NOTE(review): stub-side definition of realw; confirm it matches the precision used by the CUDA sources */
 
 /* from check_fields_cuda.cu */
 void FC_FUNC_(check_max_norm_displ_gpu,
-              CHECK_MAX_NORM_DISPL_GPU)(int* size, float* displ,long* Mesh_pointer_f,int* announceID){}
+              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID){}
 				       
 void FC_FUNC_(check_max_norm_vector,
-              CHECK_MAX_NORM_VECTOR)(int* size, float* vector1, int* announceID){}				       
+              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID){}				       
 void FC_FUNC_(check_max_norm_displ,
-              CHECK_MAX_NORM_DISPL)(int* size, float* displ, int* announceID){}
+              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID){}
 
 void FC_FUNC_(check_max_norm_b_displ_gpu,
-              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, float* b_displ,long* Mesh_pointer_f,int* announceID){}
+              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID){}
 
 void FC_FUNC_(check_max_norm_b_accel_gpu,
-              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, float* b_accel,long* Mesh_pointer_f,int* announceID){}
+              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID){}
 
 void FC_FUNC_(check_max_norm_b_veloc_gpu,
-              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, float* b_veloc,long* Mesh_pointer_f,int* announceID){}
+              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID){}
 
 void FC_FUNC_(check_max_norm_b_displ,
-              CHECK_MAX_NORM_B_DISPL)(int* size, float* b_displ,int* announceID){}
+              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID){}
 
 void FC_FUNC_(check_max_norm_b_accel,
-              CHECK_MAX_NORM_B_ACCEL)(int* size, float* b_accel,int* announceID){}
+              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID){}
 
 void FC_FUNC_(check_error_vectors,
-              CHECK_ERROR_VECTORS)(int* sizef, float* vector1,float* vector2){}
+              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2){}
 
 void FC_FUNC_(get_max_accel,
               GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer){}
 
 void FC_FUNC_(get_norm_acoustic_from_device,
-              GET_NORM_ACOUSTIC_FROM_DEVICE)(float* norm, 
+              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm, 
                                                   long* Mesh_pointer_f,
                                                   int* SIMULATION_TYPE){}
 
 void FC_FUNC_(get_norm_elastic_from_device,
-              GET_NORM_ELASTIC_FROM_DEVICE)(float* norm, 
+              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm, 
                                                  long* Mesh_pointer_f,
                                                  int* SIMULATION_TYPE){}
 
 						
-/* from file compute_add_sources_cuda.cu */
+/* from file compute_add_sources_elastic_cuda.cu */
 
 void FC_FUNC_(compute_add_sources_el_cuda,
               COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
-                                           int* NSPEC_ABf, int* NGLOB_ABf,
-                                           int* phase_is_innerf,int* NSOURCESf,
-                                           int* itf, float* dtf, float* t0f,
-                                           int* SIMULATION_TYPEf,int* NSTEPf,
-                                           int* NOISE_TOMOGRAPHYf,
-                                           int* USE_FORCE_POINT_SOURCEf,
-                                           double* h_stf_pre_compute, int* myrankf){}
+                                           int* phase_is_innerf,
+                                           int* NSOURCESf,
+                                           double* h_stf_pre_compute, 
+                                           int* myrankf){}
 
 void FC_FUNC_(compute_add_sources_el_s3_cuda,
               COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
-                                              int* USE_FORCE_POINT_SOURCE,
                                               double* h_stf_pre_compute,
                                               int* NSOURCES,
-                                              int* phase_is_inner,int* myrank){}
+                                              int* phase_is_inner,
+                                              int* myrank){}
 
 void FC_FUNC_(add_source_master_rec_noise_cu,
               ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f, 
@@ -72,7 +69,7 @@
 
 void FC_FUNC_(add_sources_el_sim_type_2_or_3,
               ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer, 
-                                           float* h_adj_sourcearrays,
+                                           realw* h_adj_sourcearrays,
                                            int* phase_is_inner,
                                            int* h_ispec_is_inner,
                                            int* h_ispec_is_elastic,                                            
@@ -84,12 +81,13 @@
                                            int* nadj_rec_local,
                                            int* NTSTEP_BETWEEN_READ_ADJSRC){}
 
+/* from file compute_add_sources_acoustic_cuda.cu */
+
 void FC_FUNC_(compute_add_sources_ac_cuda,
               COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f, 
                                                  int* phase_is_innerf,
                                                  int* NSOURCESf, 
                                                  int* SIMULATION_TYPEf,
-                                                 int* USE_FORCE_POINT_SOURCEf, 
                                                  double* h_stf_pre_compute, 
                                                  int* myrankf){}
 
@@ -98,13 +96,12 @@
                                                       int* phase_is_innerf,
                                                       int* NSOURCESf, 
                                                       int* SIMULATION_TYPEf,
-                                                      int* USE_FORCE_POINT_SOURCEf, 
                                                       double* h_stf_pre_compute, 
                                                       int* myrankf){}
 
 void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
               ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer, 
-                                               float* h_adj_sourcearrays,
+                                               realw* h_adj_sourcearrays,
                                                int* phase_is_inner,
                                                int* h_ispec_is_inner,
                                                int* h_ispec_is_acoustic,
@@ -138,8 +135,8 @@
               TRANSFER_BOUN_POT_FROM_DEVICE)(
                                               int* size, 
                                               long* Mesh_pointer_f, 
-                                              float* potential_dot_dot_acoustic, 
-                                              float* send_potential_dot_dot_buffer,
+                                              realw* potential_dot_dot_acoustic, 
+                                              realw* send_potential_dot_dot_buffer,
                                               int* num_interfaces_ext_mesh, 
                                               int* max_nibool_interfaces_ext_mesh,
                                               int* nibool_interfaces_ext_mesh, 
@@ -172,9 +169,9 @@
 void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
                                                              long* Mesh_pointer,
                                                              int* size_F, 
-                                                             float* deltatover2_F, 
+                                                             realw* deltatover2_F, 
                                                              int* SIMULATION_TYPE, 
-                                                             float* b_deltatover2_F){}
+                                                             realw* b_deltatover2_F){}
 
 void FC_FUNC_(acoustic_enforce_free_surf_cuda,
               ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f, 
@@ -184,8 +181,8 @@
 
 /* from compute_forces_elastic_cuda.cu */
 void FC_FUNC_(transfer_boun_accel_from_device,
-              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, float* accel,
-						   float* send_accel_buffer,
+              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
+						   realw* send_accel_buffer,
 						   int* num_interfaces_ext_mesh,
 						   int* max_nibool_interfaces_ext_mesh,
 						   int* nibool_interfaces_ext_mesh,
@@ -209,22 +206,23 @@
                                            int* nspec_inner_elastic,
                                            int* SIMULATION_TYPE,
                                            int* COMPUTE_AND_STORE_STRAIN,
-                                           int* ATTENUATION){}
+                                           int* ATTENUATION,
+                                           int* ANISOTROPY){}
 
 void FC_FUNC_(kernel_3_a_cuda,
               KERNEL_3_A_CUDA)(long* Mesh_pointer,
                                int* size_F, 
-                               float* deltatover2_F, 
+                               realw* deltatover2_F, 
                                int* SIMULATION_TYPE_f, 
-                               float* b_deltatover2_F,
+                               realw* b_deltatover2_F,
                                int* OCEANS){}
 
 void FC_FUNC_(kernel_3_b_cuda,
               KERNEL_3_B_CUDA)(long* Mesh_pointer,
                              int* size_F, 
-                             float* deltatover2_F, 
+                             realw* deltatover2_F, 
                              int* SIMULATION_TYPE_f, 
-			       float* b_deltatover2_F){}
+			       realw* b_deltatover2_F){}
 
 void FC_FUNC_(elastic_ocean_load_cuda,
               ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f, 
@@ -234,21 +232,21 @@
 				      
 void FC_FUNC_(compute_kernels_elastic_cuda,
               COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
-                                            float* deltat_f){}
+                                            realw* deltat_f){}
 
 void FC_FUNC_(compute_kernels_strgth_noise_cu,
               COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer, 
-                                               float* h_noise_surface_movie,
-                                               float* deltat){}
+                                               realw* h_noise_surface_movie,
+                                               realw* deltat){}
 
 void FC_FUNC_(compute_kernels_acoustic_cuda,
               COMPUTE_KERNELS_ACOUSTIC_CUDA)(
                                              long* Mesh_pointer, 
-                                             float* deltat_f){}
+                                             realw* deltat_f){}
 
 void FC_FUNC_(compute_kernels_hess_cuda,
               COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
-                                         float* deltat_f) {}
+                                         realw* deltat_f) {}
                                          
 /* from file compute_stacey_acoustic_cuda.cu */
 void FC_FUNC_(compute_stacey_acoustic_cuda,
@@ -257,7 +255,7 @@
                                     int* phase_is_innerf, 
                                     int* SIMULATION_TYPEf, 
                                     int* SAVE_FORWARDf,
-                                    float* h_b_absorb_potential){}
+                                    realw* h_b_absorb_potential){}
 
 
 /* from file compute_stacey_elastic_cuda.cu */
@@ -267,42 +265,42 @@
                                            int* phase_is_innerf, 
                                            int* SIMULATION_TYPEf, 
                                            int* SAVE_FORWARDf,
-                                           float* h_b_absorb_field){}
+                                           realw* h_b_absorb_field){}
 
 /* from file it_update_displacement_cuda.cu */
 					  
 void FC_FUNC_(it_update_displacement_cuda,
               it_update_displacement_cuda)(long* Mesh_pointer_f,
                                                  int* size_F, 
-                                                 float* deltat_F, 
-                                                 float* deltatsqover2_F, 
-                                                 float* deltatover2_F,
+                                                 realw* deltat_F, 
+                                                 realw* deltatsqover2_F, 
+                                                 realw* deltatover2_F,
                                                  int* SIMULATION_TYPE, 
-                                                 float* b_deltat_F, 
-                                                 float* b_deltatsqover2_F, 
-                                                 float* b_deltatover2_F){}
+                                                 realw* b_deltat_F, 
+                                                 realw* b_deltatsqover2_F, 
+                                                 realw* b_deltatover2_F){}
 
 void FC_FUNC_(it_update_displacement_ac_cuda,
               IT_UPDATE_DISPLACEMENT_AC_CUDA)(long* Mesh_pointer_f, 
                                                            int* size_F,
-                                                           float* deltat_F, 
-                                                           float* deltatsqover2_F, 
-                                                           float* deltatover2_F,
+                                                           realw* deltat_F, 
+                                                           realw* deltatsqover2_F, 
+                                                           realw* deltatover2_F,
                                                            int* SIMULATION_TYPE, 
-                                                           float* b_deltat_F, 
-                                                           float* b_deltatsqover2_F, 
-                                                           float* b_deltatover2_F){}
+                                                           realw* b_deltat_F, 
+                                                           realw* b_deltatsqover2_F, 
+                                                           realw* b_deltatover2_F){}
 
 /* from file noise_tomography_cuda.cu */
 							  
 void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}							  
 void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id){}					
 
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(float* val){}
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val){}
 
 void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val){}
 
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,float* h_displ){}
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ){}
 
 void FC_FUNC_(transfer_surface_to_host,
               TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
@@ -324,31 +322,31 @@
               SHOW_FREE_DEVICE_MEMORY)(){}
 
 void FC_FUNC_(get_free_device_memory,
-              get_FREE_DEVICE_MEMORY)(float* free, float* used, float* total ){}
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ){}
 
 void FC_FUNC_(prepare_constants_device,
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
                                         int* h_NGLLX, 
                                         int* NSPEC_AB, int* NGLOB_AB,
-                                        float* h_xix, float* h_xiy, float* h_xiz,
-                                        float* h_etax, float* h_etay, float* h_etaz,
-                                        float* h_gammax, float* h_gammay, float* h_gammaz,
-                                        float* h_kappav, float* h_muv,
+                                        realw* h_xix, realw* h_xiy, realw* h_xiz,
+                                        realw* h_etax, realw* h_etay, realw* h_etaz,
+                                        realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+                                        realw* h_kappav, realw* h_muv,
                                         int* h_ibool, 
                                         int* num_interfaces_ext_mesh, int* max_nibool_interfaces_ext_mesh,
                                         int* h_nibool_interfaces_ext_mesh, int* h_ibool_interfaces_ext_mesh,
-                                        float* h_hprime_xx,float* h_hprime_yy,float* h_hprime_zz, 
-                                        float* h_hprimewgll_xx,float* h_hprimewgll_yy,float* h_hprimewgll_zz,
-                                        float* h_wgllwgll_xy,float* h_wgllwgll_xz,float* h_wgllwgll_yz,        
+                                        realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz, 
+                                        realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
+                                        realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,        
                                         int* ABSORBING_CONDITIONS,    
                                         int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
-                                        float* h_abs_boundary_normal,
-                                        float* h_abs_boundary_jacobian2Dw,
+                                        realw* h_abs_boundary_normal,
+                                        realw* h_abs_boundary_jacobian2Dw,
                                         int* h_num_abs_boundary_faces,
                                         int* h_ispec_is_inner, 
                                         int* NSOURCES,
                                         int* nsources_local,
-                                        float* h_sourcearrays,
+                                        realw* h_sourcearrays,
                                         int* h_islice_selected_source,
                                         int* h_ispec_selected_source,
                                         int* h_number_receiver_global,
@@ -358,7 +356,7 @@
                                         int* SIMULATION_TYPE,
                                         int* USE_MESH_COLORING_GPU,
                                         int* nspec_acoustic,int* nspec_elastic,                                        
-                                        int* ncuda_devices)
+                                        int* myrank_f, int* ncuda_devices)
 {
   fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
   exit(1);
@@ -375,9 +373,9 @@
 
 void FC_FUNC_(prepare_fields_acoustic_device,
               PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f, 
-                                              float* rmass_acoustic, 
-                                              float* rhostore,
-                                              float* kappastore,
+                                              realw* rmass_acoustic, 
+                                              realw* rhostore,
+                                              realw* kappastore,
                                               int* num_phase_ispec_acoustic, 
                                               int* phase_ispec_inner_acoustic,
                                               int* ispec_is_acoustic,
@@ -387,13 +385,13 @@
                                               int* free_surface_ijk,
                                               int* ABSORBING_CONDITIONS,
                                               int* b_reclen_potential,
-                                              float* b_absorb_potential,
+                                              realw* b_absorb_potential,
                                               int* ELASTIC_SIMULATION,
                                               int* num_coupling_ac_el_faces,
                                               int* coupling_ac_el_ispec,
                                               int* coupling_ac_el_ijk,
-                                              float* coupling_ac_el_normal,
-                                              float* coupling_ac_el_jacobian2Dw,
+                                              realw* coupling_ac_el_normal,
+                                              realw* coupling_ac_el_jacobian2Dw,
                                               int* num_colors_outer_acoustic,
                                               int* num_colors_inner_acoustic,
                                               int* num_elem_colors_acoustic){}							 
@@ -405,49 +403,71 @@
 void FC_FUNC_(prepare_fields_elastic_device,
               PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
                                              int* size,
-                                             float* rmass,
-                                             float* rho_vp,
-                                             float* rho_vs,
+                                             realw* rmass,
+                                             realw* rho_vp,
+                                             realw* rho_vs,
                                              int* num_phase_ispec_elastic,
                                              int* phase_ispec_inner_elastic,
                                              int* ispec_is_elastic,
                                              int* ABSORBING_CONDITIONS,
-                                             float* h_b_absorb_field,
+                                             realw* h_b_absorb_field,
                                              int* h_b_reclen_field,
                                              int* SIMULATION_TYPE,int* SAVE_FORWARD,
                                              int* COMPUTE_AND_STORE_STRAIN,
-                                             float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
-                                             float* epsilondev_xz,float* epsilondev_yz,
+                                             realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                             realw* epsilondev_xz,realw* epsilondev_yz,
                                              int* ATTENUATION, 
                                              int* R_size,
-                                             float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
-                                             float* one_minus_sum_beta,float* factor_common,
-                                             float* alphaval,float* betaval,float* gammaval,
+                                             realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                             realw* one_minus_sum_beta,realw* factor_common,
+                                             realw* alphaval,realw* betaval,realw* gammaval,
                                              int* OCEANS,
-                                             float* rmass_ocean_load,
+                                             realw* rmass_ocean_load,
                                              int* NOISE_TOMOGRAPHY,
-                                             float* free_surface_normal,
+                                             realw* free_surface_normal,
                                              int* free_surface_ispec,
                                              int* free_surface_ijk,                                             
                                              int* num_free_surface_faces,
                                              int* ACOUSTIC_SIMULATION,
                                              int* num_colors_outer_elastic,
                                              int* num_colors_inner_elastic,
-                                             int* num_elem_colors_elastic){}
+                                             int* num_elem_colors_elastic,
+                                             int* ANISOTROY,
+                                             realw *c11store,
+                                             realw *c12store,
+                                             realw *c13store,
+                                             realw *c14store,
+                                             realw *c15store,
+                                             realw *c16store,
+                                             realw *c22store,
+                                             realw *c23store,
+                                             realw *c24store,                                             
+                                             realw *c25store,
+                                             realw *c26store,
+                                             realw *c33store,
+                                             realw *c34store,
+                                             realw *c35store,
+                                             realw *c36store,
+                                             realw *c44store,
+                                             realw *c45store,
+                                             realw *c46store,
+                                             realw *c55store,
+                                             realw *c56store,
+                                             realw *c66store){}
   
 void FC_FUNC_(prepare_fields_elastic_adj_dev,
               PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
                                               int* size,
                                               int* SIMULATION_TYPE,
                                               int* COMPUTE_AND_STORE_STRAIN,
-                                              float* epsilon_trace_over_3,                                             
-                                              float* b_epsilondev_xx,float* b_epsilondev_yy,float* b_epsilondev_xy,
-                                              float* b_epsilondev_xz,float* b_epsilondev_yz,
-                                              float* b_epsilon_trace_over_3,
+                                              realw* epsilon_trace_over_3,                                             
+                                              realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                              realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                              realw* b_epsilon_trace_over_3,
                                               int* ATTENUATION, 
                                               int* R_size,
-                                              float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
-                                              float* b_alphaval,float* b_betaval,float* b_gammaval,
+                                              realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                              realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
                                               int* APPROXIMATE_HESS_KL){}
   
 
@@ -460,12 +480,12 @@
                                            int* SIMULATION_TYPE,
                                            int* NOISE_TOMOGRAPHY,
                                            int* NSTEP,
-                                           float* noise_sourcearray,
-                                           float* normal_x_noise,
-                                           float* normal_y_noise,
-                                           float* normal_z_noise,
-                                           float* mask_noise,
-                                           float* free_surface_jacobian2Dw){}
+                                           realw* noise_sourcearray,
+                                           realw* normal_x_noise,
+                                           realw* normal_y_noise,
+                                           realw* normal_z_noise,
+                                           realw* mask_noise,
+                                           realw* free_surface_jacobian2Dw){}
 
 void FC_FUNC_(prepare_cleanup_device,
               PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
@@ -477,178 +497,179 @@
                                       int* NOISE_TOMOGRAPHY,
                                       int* COMPUTE_AND_STORE_STRAIN,
                                       int* ATTENUATION,
+                                      int* ANISOTROPY,
                                       int* OCEANS,
                                       int* APPROXIMATE_HESS_KL){}
 
 /* from file transfer_fields_cuda.cu				      */
 
 void FC_FUNC_(transfer_fields_el_to_device,
-              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f){}
+              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_fields_el_from_device,
-              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f){}
+              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_b_fields_to_device,
-              TRANSFER_B_FIELDS_TO_DEVICE)(int* size, float* b_displ, float* b_veloc, float* b_accel,
+              TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
                                            long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_b_fields_from_device,
-              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, float* b_displ, float* b_veloc, float* b_accel,long* Mesh_pointer_f){}
+              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_accel_to_device,
-              TRNASFER_ACCEL_TO_DEVICE)(int* size, float* accel,long* Mesh_pointer_f){}
+              TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_accel_from_device,
-              TRANSFER_ACCEL_FROM_DEVICE)(int* size, float* accel,long* Mesh_pointer_f){}
+              TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f){}
 void FC_FUNC_(transfer_b_accel_from_device,
-              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, float* b_accel,long* Mesh_pointer_f){}
+              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_sigma_from_device,
-              TRANSFER_SIGMA_FROM_DEVICE)(int* size, float* sigma_kl,long* Mesh_pointer_f){}
+              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_b_displ_from_device,
-              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, float* displ,long* Mesh_pointer_f){}
+              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_displ_from_device,
-              TRANSFER_DISPL_FROM_DEVICE)(int* size, float* displ,long* Mesh_pointer_f){}
+              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f){}
 
 /*
 void FC_FUNC_(transfer_compute_kernel_answers_from_device,
               TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
-                                                           float* rho_kl,int* size_rho,
-                                                           float* mu_kl, int* size_mu,
-                                                           float* kappa_kl, int* size_kappa){}
+                                                           realw* rho_kl,int* size_rho,
+                                                           realw* mu_kl, int* size_mu,
+                                                           realw* kappa_kl, int* size_kappa){}
 */
 
 /*
 void FC_FUNC_(transfer_compute_kernel_fields_from_device,
               TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
-                                                          float* accel, int* size_accel,
-                                                          float* b_displ, int* size_b_displ,
-                                                          float* epsilondev_xx,
-                                                          float* epsilondev_yy,
-                                                          float* epsilondev_xy,
-                                                          float* epsilondev_xz,
-                                                          float* epsilondev_yz,
+                                                          realw* accel, int* size_accel,
+                                                          realw* b_displ, int* size_b_displ,
+                                                          realw* epsilondev_xx,
+                                                          realw* epsilondev_yy,
+                                                          realw* epsilondev_xy,
+                                                          realw* epsilondev_xz,
+                                                          realw* epsilondev_yz,
                                                           int* size_epsilondev,
-                                                          float* b_epsilondev_xx,
-                                                          float* b_epsilondev_yy,
-                                                          float* b_epsilondev_xy,
-                                                          float* b_epsilondev_xz,
-                                                          float* b_epsilondev_yz,
+                                                          realw* b_epsilondev_xx,
+                                                          realw* b_epsilondev_yy,
+                                                          realw* b_epsilondev_xy,
+                                                          realw* b_epsilondev_xz,
+                                                          realw* b_epsilondev_yz,
                                                           int* size_b_epsilondev,
-                                                          float* rho_kl,int* size_rho,
-                                                          float* mu_kl, int* size_mu,
-                                                          float* kappa_kl, int* size_kappa,
-                                                          float* epsilon_trace_over_3,
-                                                          float* b_epsilon_trace_over_3,
+                                                          realw* rho_kl,int* size_rho,
+                                                          realw* mu_kl, int* size_mu,
+                                                          realw* kappa_kl, int* size_kappa,
+                                                          realw* epsilon_trace_over_3,
+                                                          realw* b_epsilon_trace_over_3,
                                                           int* size_epsilon_trace_over_3) {}
 */
                                                           
 void FC_FUNC_(transfer_b_fields_att_to_device,
               TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
-                                             float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
                                              int* size_R,
-                                             float* b_epsilondev_xx,
-                                             float* b_epsilondev_yy,
-                                             float* b_epsilondev_xy,
-                                             float* b_epsilondev_xz,
-                                             float* b_epsilondev_yz,
+                                             realw* b_epsilondev_xx,
+                                             realw* b_epsilondev_yy,
+                                             realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,
+                                             realw* b_epsilondev_yz,
                                              int* size_epsilondev){}
 
 void FC_FUNC_(transfer_fields_att_from_device,
               TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
-                                               float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+                                               realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
                                                int* size_R,
-                                               float* epsilondev_xx,
-                                               float* epsilondev_yy,
-                                               float* epsilondev_xy,
-                                               float* epsilondev_xz,
-                                               float* epsilondev_yz,
+                                               realw* epsilondev_xx,
+                                               realw* epsilondev_yy,
+                                               realw* epsilondev_xy,
+                                               realw* epsilondev_xz,
+                                               realw* epsilondev_yz,
                                                int* size_epsilondev){}
 
 void FC_FUNC_(transfer_kernels_el_to_host,
               TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer, 
-                                                    float* h_rho_kl,
-                                                    float* h_mu_kl, 
-                                                    float* h_kappa_kl,
+                                                    realw* h_rho_kl,
+                                                    realw* h_mu_kl, 
+                                                    realw* h_kappa_kl,
                                                     int* NSPEC_AB){}
 
 void FC_FUNC_(transfer_kernels_noise_to_host,
               TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer, 
-                                              float* h_Sigma_kl,
+                                              realw* h_Sigma_kl,
                                               int* NSPEC_AB){}
 
 							 
 void FC_FUNC_(transfer_fields_ac_to_device,
               TRANSFER_FIELDS_AC_TO_DEVICE)(
                                                   int* size, 
-                                                  float* potential_acoustic, 
-                                                  float* potential_dot_acoustic, 
-                                                  float* potential_dot_dot_acoustic,
+                                                  realw* potential_acoustic, 
+                                                  realw* potential_dot_acoustic, 
+                                                  realw* potential_dot_dot_acoustic,
                                                   long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_b_fields_ac_to_device,
               TRANSFER_B_FIELDS_AC_TO_DEVICE)(
                                                     int* size, 
-                                                    float* b_potential_acoustic, 
-                                                    float* b_potential_dot_acoustic, 
-                                                    float* b_potential_dot_dot_acoustic,
+                                                    realw* b_potential_acoustic, 
+                                                    realw* b_potential_dot_acoustic, 
+                                                    realw* b_potential_dot_dot_acoustic,
                                                     long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_fields_ac_from_device,TRANSFER_FIELDS_AC_FROM_DEVICE)(
                                                                              int* size, 
-                                                                             float* potential_acoustic, 
-                                                                             float* potential_dot_acoustic, 
-                                                                             float* potential_dot_dot_acoustic,
+                                                                             realw* potential_acoustic, 
+                                                                             realw* potential_dot_acoustic, 
+                                                                             realw* potential_dot_dot_acoustic,
                                                                              long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_b_fields_ac_from_device,
               TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
                                                       int* size, 
-                                                      float* b_potential_acoustic, 
-                                                      float* b_potential_dot_acoustic, 
-                                                      float* b_potential_dot_dot_acoustic,
+                                                      realw* b_potential_acoustic, 
+                                                      realw* b_potential_dot_acoustic, 
+                                                      realw* b_potential_dot_dot_acoustic,
                                                       long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_dot_dot_from_device,
-              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, float* potential_dot_dot_acoustic,long* Mesh_pointer_f){}
+              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_b_dot_dot_from_device,
-              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, float* b_potential_dot_dot_acoustic,long* Mesh_pointer_f){}
+              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_kernels_ac_to_host,
               TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer, 
-                                                             float* h_rho_ac_kl,
-                                                             float* h_kappa_ac_kl,
+                                                             realw* h_rho_ac_kl,
+                                                             realw* h_kappa_ac_kl,
                                                              int* NSPEC_AB){}
 
 void FC_FUNC_(transfer_kernels_hess_el_tohost,
               TRANSFER_KERNELS_HESS_TO_HOST)(long* Mesh_pointer,
-                                             float* h_hess_kl,
+                                             realw* h_hess_kl,
                                              int* NSPEC_AB) {}
 void FC_FUNC_(transfer_kernels_hess_ac_tohost,
               TRANSFER_KERNELS_HESS_TO_HOST)(long* Mesh_pointer,
-                                             float* h_hess_ac_kl,
+                                             realw* h_hess_ac_kl,
                                              int* NSPEC_AB) {}
 
 /* from file write_seismograms_cuda.cu */
 
 void FC_FUNC_(transfer_station_el_from_device,
-              TRANSFER_STATION_EL_FROM_DEVICE)(float* displ,float* veloc,float* accel,
-                                                   float* b_displ, float* b_veloc, float* b_accel,
+              TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
+                                                   realw* b_displ, realw* b_veloc, realw* b_accel,
                                                    long* Mesh_pointer_f,int* number_receiver_global,
                                                    int* ispec_selected_rec,int* ispec_selected_source,
                                                    int* ibool,int* SIMULATION_TYPEf){}
 
 void FC_FUNC_(transfer_station_ac_from_device,
               TRANSFER_STATION_AC_FROM_DEVICE)(
-                                                float* potential_acoustic,
-                                                float* potential_dot_acoustic,
-                                                float* potential_dot_dot_acoustic,
-                                                float* b_potential_acoustic, 
-                                                float* b_potential_dot_acoustic, 
-                                                float* b_potential_dot_dot_acoustic,
+                                                realw* potential_acoustic,
+                                                realw* potential_dot_acoustic,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* b_potential_acoustic, 
+                                                realw* b_potential_dot_acoustic, 
+                                                realw* b_potential_dot_dot_acoustic,
                                                 long* Mesh_pointer_f,
                                                 int* number_receiver_global,
                                                 int* ispec_selected_rec,

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
-#include <mpi.h>
 
 #include <sys/time.h>
 #include <sys/resource.h>
@@ -54,15 +53,15 @@
 
 extern "C"
 void FC_FUNC_(transfer_fields_el_to_device,
-              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f) {
+              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
 
 TRACE("transfer_fields_el_to_device_");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(float)*(*size),cudaMemcpyHostToDevice),40003);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(float)*(*size),cudaMemcpyHostToDevice),40004);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),40005);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40004);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40005);
 
 }
 
@@ -70,38 +69,31 @@
 
 extern "C"
 void FC_FUNC_(transfer_fields_el_from_device,
-              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f) {
+              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
 
   TRACE("transfer_fields_el_from_device_");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40006);
-  print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40007);
-  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40008);
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+  print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
+  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40008);
 
-  // printf("Transfered Fields From Device\n");
-  // int procid;
-  // MPI_Comm_rank(MPI_COMM_WORLD,&procid);
-  // printf("Quick check of answer for p:%d in transfer_fields_el_from_device\n",procid);
-  // for(int i=0;i<5;i++) {
-  // printf("accel[%d]=%2.20e\n",i,accel[i]);
-  // }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
 void FC_FUNC_(transfer_b_fields_to_device,
-              TRANSFER_B_FIELDS_TO_DEVICE)(int* size, float* b_displ, float* b_veloc, float* b_accel,
+              TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
                                            long* Mesh_pointer_f) {
 
   TRACE("transfer_b_fields_to_device_");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  cudaMemcpy(mp->d_b_displ,b_displ,sizeof(float)*(*size),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_veloc,b_veloc,sizeof(float)*(*size),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_accel,b_accel,sizeof(float)*(*size),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_displ,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_veloc,b_veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_accel,b_accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
 
 }
 
@@ -109,15 +101,15 @@
 
 extern "C"
 void FC_FUNC_(transfer_b_fields_from_device,
-              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, float* b_displ, float* b_veloc, float* b_accel,long* Mesh_pointer_f) {
+              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {
 
 TRACE("transfer_b_fields_from_device_");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  cudaMemcpy(b_displ,mp->d_b_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_veloc,mp->d_b_veloc,sizeof(float)*(*size),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_accel,mp->d_b_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_veloc,mp->d_b_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
 
 }
 
@@ -126,13 +118,13 @@
 
 extern "C"
 void FC_FUNC_(transfer_accel_to_device,
-              TRNASFER_ACCEL_TO_DEVICE)(int* size, float* accel,long* Mesh_pointer_f) {
+              TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
 
 TRACE("transfer_accel_to_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),40016);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40016);
 
 }
 
@@ -140,13 +132,13 @@
 
 extern "C"
 void FC_FUNC_(transfer_accel_from_device,
-              TRANSFER_ACCEL_FROM_DEVICE)(int* size, float* accel,long* Mesh_pointer_f) {
+              TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
 
 TRACE("transfer_accel_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40026);
+  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40026);
 
 }
 
@@ -154,13 +146,13 @@
 
 extern "C"
 void FC_FUNC_(transfer_b_accel_from_device,
-              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, float* b_accel,long* Mesh_pointer_f) {
+              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {
 
 TRACE("transfer_b_accel_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40036);
+  print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40036);
 
 }
 
@@ -168,13 +160,13 @@
 
 extern "C"
 void FC_FUNC_(transfer_sigma_from_device,
-              TRANSFER_SIGMA_FROM_DEVICE)(int* size, float* sigma_kl,long* Mesh_pointer_f) {
+              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {
 
 TRACE("transfer_sigma_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40046);
+  print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40046);
 
 }
 
@@ -182,13 +174,13 @@
 
 extern "C"
 void FC_FUNC_(transfer_b_displ_from_device,
-              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, float* displ,long* Mesh_pointer_f) {
+              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
 
 TRACE("transfer_b_displ_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40056);
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40056);
 
 }
 
@@ -196,13 +188,13 @@
 
 extern "C"
 void FC_FUNC_(transfer_displ_from_device,
-              TRANSFER_DISPL_FROM_DEVICE)(int* size, float* displ,long* Mesh_pointer_f) {
+              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
 
 TRACE("transfer_displ_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40066);
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40066);
 
 }
 
@@ -211,15 +203,15 @@
 extern "C"
 void FC_FUNC_(transfer_compute_kernel_answers_from_device,
               TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
-                                                           float* rho_kl,int* size_rho,
-                                                           float* mu_kl, int* size_mu,
-                                                           float* kappa_kl, int* size_kappa) {
+                                                           realw* rho_kl,int* size_rho,
+                                                           realw* mu_kl, int* size_mu,
+                                                           realw* kappa_kl, int* size_kappa) {
 TRACE("transfer_compute_kernel_answers_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
 
 }
 */
@@ -229,47 +221,47 @@
 extern "C"
 void FC_FUNC_(transfer_compute_kernel_fields_from_device,
               TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
-                                                          float* accel, int* size_accel,
-                                                          float* b_displ, int* size_b_displ,
-                                                          float* epsilondev_xx,
-                                                          float* epsilondev_yy,
-                                                          float* epsilondev_xy,
-                                                          float* epsilondev_xz,
-                                                          float* epsilondev_yz,
+                                                          realw* accel, int* size_accel,
+                                                          realw* b_displ, int* size_b_displ,
+                                                          realw* epsilondev_xx,
+                                                          realw* epsilondev_yy,
+                                                          realw* epsilondev_xy,
+                                                          realw* epsilondev_xz,
+                                                          realw* epsilondev_yz,
                                                           int* size_epsilondev,
-                                                          float* b_epsilondev_xx,
-                                                          float* b_epsilondev_yy,
-                                                          float* b_epsilondev_xy,
-                                                          float* b_epsilondev_xz,
-                                                          float* b_epsilondev_yz,
+                                                          realw* b_epsilondev_xx,
+                                                          realw* b_epsilondev_yy,
+                                                          realw* b_epsilondev_xy,
+                                                          realw* b_epsilondev_xz,
+                                                          realw* b_epsilondev_yz,
                                                           int* size_b_epsilondev,
-                                                          float* rho_kl,int* size_rho,
-                                                          float* mu_kl, int* size_mu,
-                                                          float* kappa_kl, int* size_kappa,
-                                                          float* epsilon_trace_over_3,
-                                                          float* b_epsilon_trace_over_3,
+                                                          realw* rho_kl,int* size_rho,
+                                                          realw* mu_kl, int* size_mu,
+                                                          realw* kappa_kl, int* size_kappa,
+                                                          realw* epsilon_trace_over_3,
+                                                          realw* b_epsilon_trace_over_3,
                                                           int* size_epsilon_trace_over_3) {
 TRACE("transfer_compute_kernel_fields_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-  cudaMemcpy(accel,mp->d_accel,*size_accel*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_displ,mp->d_b_displ,*size_b_displ*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilondev_xx,mp->d_b_epsilondev_xx,*size_b_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilondev_yy,mp->d_b_epsilondev_yy,*size_b_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilondev_xy,mp->d_b_epsilondev_xy,*size_b_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilondev_xz,mp->d_b_epsilondev_xz,*size_b_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilondev_yz,mp->d_b_epsilondev_yz,*size_b_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilon_trace_over_3,mp->d_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(float),
+  cudaMemcpy(accel,mp->d_accel,*size_accel*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_displ,mp->d_b_displ,*size_b_displ*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xx,mp->d_b_epsilondev_xx,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_yy,mp->d_b_epsilondev_yy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xy,mp->d_b_epsilondev_xy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xz,mp->d_b_epsilondev_xz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_yz,mp->d_b_epsilondev_yz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilon_trace_over_3,mp->d_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
        cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilon_trace_over_3,mp->d_b_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(float),
+  cudaMemcpy(b_epsilon_trace_over_3,mp->d_b_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
        cudaMemcpyDeviceToHost);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -285,29 +277,29 @@
 extern "C"
 void FC_FUNC_(transfer_b_fields_att_to_device,
               TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
-                                             float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
                                              int* size_R,
-                                             float* b_epsilondev_xx,
-                                             float* b_epsilondev_yy,
-                                             float* b_epsilondev_xy,
-                                             float* b_epsilondev_xz,
-                                             float* b_epsilondev_yz,
+                                             realw* b_epsilondev_xx,
+                                             realw* b_epsilondev_yy,
+                                             realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,
+                                             realw* b_epsilondev_yz,
                                              int* size_epsilondev) {
   TRACE("transfer_b_fields_att_to_device");
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
 
-  cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
 
-  cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
 
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -322,29 +314,29 @@
 extern "C"
 void FC_FUNC_(transfer_fields_att_from_device,
               TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
-                                               float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+                                               realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
                                                int* size_R,
-                                               float* epsilondev_xx,
-                                               float* epsilondev_yy,
-                                               float* epsilondev_xy,
-                                               float* epsilondev_xz,
-                                               float* epsilondev_yz,
+                                               realw* epsilondev_xx,
+                                               realw* epsilondev_yy,
+                                               realw* epsilondev_xy,
+                                               realw* epsilondev_xz,
+                                               realw* epsilondev_yz,
                                                int* size_epsilondev) {
   TRACE("transfer_fields_att_from_device");
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
 
-  cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
 
-  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
 
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -358,19 +350,19 @@
 extern "C"
 void FC_FUNC_(transfer_kernels_el_to_host,
               TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
-                                                    float* h_rho_kl,
-                                                    float* h_mu_kl,
-                                                    float* h_kappa_kl,
+                                                    realw* h_rho_kl,
+                                                    realw* h_mu_kl,
+                                                    realw* h_kappa_kl,
                                                     int* NSPEC_AB) {
 TRACE("transfer_kernels_el_to_host");
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
 
-  print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*125*sizeof(float),
+  print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*NGLL3*sizeof(realw),
                                      cudaMemcpyDeviceToHost),40101);
-  print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*125*sizeof(float),
+  print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*NGLL3*sizeof(realw),
                                      cudaMemcpyDeviceToHost),40102);
-  print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*125*sizeof(float),
+  print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*NGLL3*sizeof(realw),
                                      cudaMemcpyDeviceToHost),40103);
 
 }
@@ -384,13 +376,13 @@
 extern "C"
 void FC_FUNC_(transfer_kernels_noise_to_host,
               TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
-                                                          float* h_Sigma_kl,
+                                                          realw* h_Sigma_kl,
                                                           int* NSPEC_AB) {
 TRACE("transfer_kernels_noise_to_host");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,125*(*NSPEC_AB)*sizeof(float),
+  print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
                                      cudaMemcpyDeviceToHost),40201);
 
 }
@@ -406,20 +398,20 @@
 void FC_FUNC_(transfer_fields_ac_to_device,
               TRANSFER_FIELDS_AC_TO_DEVICE)(
                                                   int* size,
-                                                  float* potential_acoustic,
-                                                  float* potential_dot_acoustic,
-                                                  float* potential_dot_dot_acoustic,
+                                                  realw* potential_acoustic,
+                                                  realw* potential_dot_acoustic,
+                                                  realw* potential_dot_dot_acoustic,
                                                   long* Mesh_pointer_f) {
 TRACE("transfer_fields_ac_to_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_acoustic,potential_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),50110);
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),50110);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_acoustic,potential_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),50120);
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),50120);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),50130);
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),50130);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after transfer_fields_ac_to_device");
@@ -432,20 +424,20 @@
 void FC_FUNC_(transfer_b_fields_ac_to_device,
               TRANSFER_B_FIELDS_AC_TO_DEVICE)(
                                                     int* size,
-                                                    float* b_potential_acoustic,
-                                                    float* b_potential_dot_acoustic,
-                                                    float* b_potential_dot_dot_acoustic,
+                                                    realw* b_potential_acoustic,
+                                                    realw* b_potential_dot_acoustic,
+                                                    realw* b_potential_dot_dot_acoustic,
                                                     long* Mesh_pointer_f) {
 TRACE("transfer_b_fields_ac_to_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_acoustic,b_potential_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),51110);
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),51110);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),51120);
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),51120);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_dot_acoustic,b_potential_dot_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),51130);
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),51130);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after transfer_b_fields_ac_to_device");
@@ -458,20 +450,20 @@
 extern "C"
 void FC_FUNC_(transfer_fields_ac_from_device,TRANSFER_FIELDS_AC_FROM_DEVICE)(
                                                                                          int* size,
-                                                                                         float* potential_acoustic,
-                                                                                         float* potential_dot_acoustic,
-                                                                                         float* potential_dot_dot_acoustic,
+                                                                                         realw* potential_acoustic,
+                                                                                         realw* potential_dot_acoustic,
+                                                                                         realw* potential_dot_dot_acoustic,
                                                                                          long* Mesh_pointer_f) {
 TRACE("transfer_fields_ac_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(potential_acoustic,mp->d_potential_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),52111);
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52111);
   print_CUDA_error_if_any(cudaMemcpy(potential_dot_acoustic,mp->d_potential_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),52121);
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52121);
   print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),52131);
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52131);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after transfer_fields_ac_from_device");
@@ -484,20 +476,20 @@
 void FC_FUNC_(transfer_b_fields_ac_from_device,
               TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
                                                       int* size,
-                                                      float* b_potential_acoustic,
-                                                      float* b_potential_dot_acoustic,
-                                                      float* b_potential_dot_dot_acoustic,
+                                                      realw* b_potential_acoustic,
+                                                      realw* b_potential_dot_acoustic,
+                                                      realw* b_potential_dot_dot_acoustic,
                                                       long* Mesh_pointer_f) {
 TRACE("transfer_b_fields_ac_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(b_potential_acoustic,mp->d_b_potential_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),53111);
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53111);
   print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_acoustic,mp->d_b_potential_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),53121);
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53121);
   print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),53131);
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53131);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after transfer_b_fields_ac_from_device");
@@ -508,14 +500,14 @@
 
 extern "C"
 void FC_FUNC_(transfer_dot_dot_from_device,
-              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, float* potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {
 
   TRACE("transfer_dot_dot_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),50041);
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50041);
 
 }
 
@@ -523,14 +515,14 @@
 
 extern "C"
 void FC_FUNC_(transfer_b_dot_dot_from_device,
-              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, float* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {
 
   TRACE("transfer_b_dot_dot_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),50042);
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50042);
 
 }
 
@@ -540,20 +532,20 @@
 extern "C"
 void FC_FUNC_(transfer_kernels_ac_to_host,
               TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
-                                                             float* h_rho_ac_kl,
-                                                             float* h_kappa_ac_kl,
+                                                             realw* h_rho_ac_kl,
+                                                             realw* h_kappa_ac_kl,
                                                              int* NSPEC_AB) {
 
   TRACE("transfer_kernels_ac_to_host");
 
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
-  int size = *NSPEC_AB*125;
+  int size = *NSPEC_AB*NGLL3;
 
   // copies kernel values over to CPU host
-  print_CUDA_error_if_any(cudaMemcpy(h_rho_ac_kl,mp->d_rho_ac_kl,size*sizeof(float),
+  print_CUDA_error_if_any(cudaMemcpy(h_rho_ac_kl,mp->d_rho_ac_kl,size*sizeof(realw),
                                      cudaMemcpyDeviceToHost),54101);
-  print_CUDA_error_if_any(cudaMemcpy(h_kappa_ac_kl,mp->d_kappa_ac_kl,size*sizeof(float),
+  print_CUDA_error_if_any(cudaMemcpy(h_kappa_ac_kl,mp->d_kappa_ac_kl,size*sizeof(realw),
                                      cudaMemcpyDeviceToHost),54102);
 }
 
@@ -566,13 +558,13 @@
 extern "C"
 void FC_FUNC_(transfer_kernels_hess_el_tohost,
               TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
-                                              float* h_hess_kl,
+                                              realw* h_hess_kl,
                                               int* NSPEC_AB) {
 TRACE("transfer_kernels_hess_el_tohost");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(h_hess_kl,mp->d_hess_el_kl,125*(*NSPEC_AB)*sizeof(float),
+  print_CUDA_error_if_any(cudaMemcpy(h_hess_kl,mp->d_hess_el_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
                                      cudaMemcpyDeviceToHost),70201);
 }
 
@@ -581,13 +573,13 @@
 extern "C"
 void FC_FUNC_(transfer_kernels_hess_ac_tohost,
               TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
-                                             float* h_hess_ac_kl,
+                                             realw* h_hess_ac_kl,
                                              int* NSPEC_AB) {
   TRACE("transfer_kernels_hess_ac_tohost");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(h_hess_ac_kl,mp->d_hess_ac_kl,125*(*NSPEC_AB)*sizeof(float),
+  print_CUDA_error_if_any(cudaMemcpy(h_hess_ac_kl,mp->d_hess_ac_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
                                      cudaMemcpyDeviceToHost),70202);
 }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2011-11-06 02:02:36 UTC (rev 19152)
@@ -46,36 +46,25 @@
 __global__ void transfer_stations_fields_from_device_kernel(int* number_receiver_global,
                                                             int* ispec_selected_rec,
                                                             int* ibool,
-                                                            float* station_seismo_field,
-                                                            float* desired_field,
-                                                            int nrec_local //,int* debug_index
-                                                            ) {
+                                                            realw* station_seismo_field,
+                                                            realw* desired_field,
+                                                            int nrec_local) {
   int blockID = blockIdx.x + blockIdx.y*gridDim.x;
   if(blockID<nrec_local) {
-    //int nodeID = threadIdx.x + blockID*blockDim.x;
     int irec = number_receiver_global[blockID]-1;
-    int ispec = ispec_selected_rec[irec]-1; // ispec==0 before -1???
-    // if(threadIdx.x==1 && blockID < 125) {
-    //   // debug_index[threadIdx.x] = threadIdx.x + 125*ispec;
-    //   debug_index[blockID] = ispec;
-    //   debug_index[blockID + 4] = irec;
-    //   debug_index[blockID + 8] = ispec_selected_rec[0];
-    //   debug_index[blockID + 9] = ispec_selected_rec[1];
-    //   debug_index[blockID +10] = ispec_selected_rec[2];
-    //   debug_index[blockID +11] = ispec_selected_rec[3];
-    //   debug_index[blockID +12] = ispec_selected_rec[4];
-    // }
-    int iglob = ibool[threadIdx.x + 125*ispec]-1;
-    station_seismo_field[3*125*blockID + 3*threadIdx.x+0] = desired_field[3*iglob];
-    station_seismo_field[3*125*blockID + 3*threadIdx.x+1] = desired_field[3*iglob+1];
-    station_seismo_field[3*125*blockID + 3*threadIdx.x+2] = desired_field[3*iglob+2];
+    int ispec = ispec_selected_rec[irec]-1;
+    int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
+
+    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+0] = desired_field[3*iglob];
+    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+1] = desired_field[3*iglob+1];
+    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+2] = desired_field[3*iglob+2];
   }
 }
 
 
 /* ----------------------------------------------------------------------------------------------- */
 
-void transfer_field_from_device(Mesh* mp, float* d_field,float* h_field,
+void transfer_field_from_device(Mesh* mp, realw* d_field,realw* h_field,
                                           int* number_receiver_global,
                                           int* d_ispec_selected,
                                           int* h_ispec_selected,
@@ -86,11 +75,9 @@
   // checks if anything to do
   if( mp->nrec_local == 0 ) return;
 
-  int blocksize = 125;
+  int blocksize = NGLL3;
   int num_blocks_x = mp->nrec_local;
   int num_blocks_y = 1;
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
   while(num_blocks_x > 65535) {
     num_blocks_x = ceil(num_blocks_x/2.0);
     num_blocks_y = num_blocks_y*2;
@@ -99,59 +86,41 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  //int* d_debug_index;
-  //int* h_debug_index;
-  //cudaMalloc((void**)&d_debug_index,125*sizeof(int));
-  //h_debug_index = (int*)calloc(125,sizeof(int));
-  //cudaMemcpy(d_debug_index,h_debug_index,125*sizeof(int),cudaMemcpyHostToDevice);
-
-
   // prepare field transfer array on device
   transfer_stations_fields_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
                                                                 d_ispec_selected,
                                                                 mp->d_ibool,
                                                                 mp->d_station_seismo_field,
                                                                 d_field,
-                                                                mp->nrec_local //,d_debug_index
-                                                                );
+                                                                mp->nrec_local);
 
-  //cudaMemcpy(h_debug_index,d_debug_index,125*sizeof(int),cudaMemcpyDeviceToHost);
-
-  // pause_for_debug(1);
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("transfer_stations_fields_from_device_kernel");
-#endif
-
   cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
-       (3*125)*(mp->nrec_local)*sizeof(float),cudaMemcpyDeviceToHost);
+       (3*NGLL3)*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
 
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("transfer_stations_fields_from_device_kernel_memcpy");
-#endif
-
-  // pause_for_debug(1);
   int irec_local;
-
   for(irec_local=0;irec_local<mp->nrec_local;irec_local++) {
     int irec = number_receiver_global[irec_local] - 1;
     int ispec = h_ispec_selected[irec] - 1;
 
-    for(int i=0;i<125;i++) {
-      int iglob = ibool[i+125*ispec] - 1;
-      h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*125*3];
-      h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*125*3];
-      h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*125*3];
+    for(int i=0;i<NGLL3;i++) {
+      int iglob = ibool[i+NGLL3*ispec] - 1;
+      h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
+      h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
+      h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
     }
 
   }
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("transfer_field_from_device");
+#endif
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
 void FC_FUNC_(transfer_station_el_from_device,
-              TRANSFER_STATION_EL_FROM_DEVICE)(float* displ,float* veloc,float* accel,
-                                                   float* b_displ, float* b_veloc, float* b_accel,
+              TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
+                                                   realw* b_displ, realw* b_veloc, realw* b_accel,
                                                    long* Mesh_pointer_f,int* number_receiver_global,
                                                    int* ispec_selected_rec,int* ispec_selected_source,
                                                    int* ibool,int* SIMULATION_TYPEf) {
@@ -199,15 +168,15 @@
 __global__ void transfer_stations_fields_acoustic_from_device_kernel(int* number_receiver_global,
                                                                      int* ispec_selected_rec,
                                                                      int* ibool,
-                                                                     float* station_seismo_potential,
-                                                                     float* desired_potential) {
+                                                                     realw* station_seismo_potential,
+                                                                     realw* desired_potential) {
 
   int blockID = blockIdx.x + blockIdx.y*gridDim.x;
   int nodeID = threadIdx.x + blockID*blockDim.x;
 
   int irec = number_receiver_global[blockID]-1;
   int ispec = ispec_selected_rec[irec]-1;
-  int iglob = ibool[threadIdx.x + 125*ispec]-1;
+  int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
 
   //if(threadIdx.x == 0 ) printf("node acoustic: %i %i %i %i %i %e \n",blockID,nodeID,irec,ispec,iglob,desired_potential[iglob]);
 
@@ -217,8 +186,8 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 void transfer_field_acoustic_from_device(Mesh* mp,
-                                         float* d_potential,
-                                         float* h_potential,
+                                         realw* d_potential,
+                                         realw* h_potential,
                                          int* number_receiver_global,
                                          int* d_ispec_selected,
                                          int* h_ispec_selected,
@@ -232,7 +201,7 @@
   if( mp->nrec_local == 0 ) return;
 
   // sets up kernel dimensions
-  int blocksize = 125;
+  int blocksize = NGLL3;
   int num_blocks_x = mp->nrec_local;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
@@ -252,7 +221,7 @@
 
 
   print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_potential,mp->d_station_seismo_potential,
-                                     mp->nrec_local*125*sizeof(float),cudaMemcpyDeviceToHost),500);
+                                     mp->nrec_local*NGLL3*sizeof(realw),cudaMemcpyDeviceToHost),500);
 
   //printf("copy local receivers: %i \n",mp->nrec_local);
 
@@ -262,14 +231,14 @@
 
     // copy element values
     // note: iglob may vary and can be irregularly accessing the h_potential array
-    for(j=0; j < 125; j++){
-      iglob = ibool[j+125*ispec]-1;
-      h_potential[iglob] = mp->h_station_seismo_potential[j+irec_local*125];
+    for(j=0; j < NGLL3; j++){
+      iglob = ibool[j+NGLL3*ispec]-1;
+      h_potential[iglob] = mp->h_station_seismo_potential[j+irec_local*NGLL3];
     }
 
     // copy each station element's points to working array
     // note: this works if iglob values would be all aligned...
-    //memcpy(&(h_potential[iglob]),&(mp->h_station_seismo_potential[irec_local*125]),125*sizeof(float));
+    //memcpy(&(h_potential[iglob]),&(mp->h_station_seismo_potential[irec_local*NGLL3]),NGLL3*sizeof(realw));
 
   }
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -282,12 +251,12 @@
 extern "C"
 void FC_FUNC_(transfer_station_ac_from_device,
               TRANSFER_STATION_AC_FROM_DEVICE)(
-                                                float* potential_acoustic,
-                                                float* potential_dot_acoustic,
-                                                float* potential_dot_dot_acoustic,
-                                                float* b_potential_acoustic,
-                                                float* b_potential_dot_acoustic,
-                                                float* b_potential_dot_dot_acoustic,
+                                                realw* potential_acoustic,
+                                                realw* potential_dot_acoustic,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_acoustic,
+                                                realw* b_potential_dot_dot_acoustic,
                                                 long* Mesh_pointer_f,
                                                 int* number_receiver_global,
                                                 int* ispec_selected_rec,

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90	2011-11-06 02:02:36 UTC (rev 19152)
@@ -559,7 +559,7 @@
   integer :: num_xmin,num_xmax,num_ymin,num_ymax,num_top,num_bottom,num
   integer :: num_moho
   integer :: j
-  character(len=128) :: line
+  !character(len=128) :: line
 
 ! read databases about external mesh simulation
 ! global node coordinates
@@ -823,10 +823,10 @@
       ! no moho informations given
       nspec2D_moho_ext = 0
       boundary_number = 7
-    else
-      ! tries to read in number of moho elements
-      read(line,*,iostat=ier) boundary_number ,nspec2D_moho_ext
-      if( ier /= 0 ) call exit_mpi(myrank,'error reading moho mesh in database')
+    !else
+    !  ! tries to read in number of moho elements
+    !  read(line,*,iostat=ier) boundary_number ,nspec2D_moho_ext
+    !  if( ier /= 0 ) call exit_mpi(myrank,'error reading moho mesh in database')
     endif
     if(boundary_number /= 7) stop "Error : invalid database file"
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_aniso.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_aniso.f90	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_aniso.f90	2011-11-06 02:02:36 UTC (rev 19152)
@@ -309,5 +309,28 @@
   c56 = - d46
   c66 = d66
 
+! unused: fills values with the isotropic model
+!  c11 = rho*vpv*vpv
+!  c12 = rho*(vpv*vpv-2.*vsv*vsv)
+!  c13 = c12
+!  c14 = 0.d0
+!  c15 = 0.d0
+!  c16 = 0.d0
+!  c22 = c11
+!  c23 = c12
+!  c24 = 0.d0
+!  c25 = 0.d0
+!  c26 = 0.d0
+!  c33 = c11
+!  c34 = 0.d0
+!  c35 = 0.d0
+!  c36 = 0.d0
+!  c44 = rho*vsv*vsv
+!  c45 = 0.d0
+!  c46 = 0.d0
+!  c55 = c44
+!  c56 = 0.d0
+!  c66 = c44
+
   end subroutine model_aniso
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90	2011-11-06 02:02:36 UTC (rev 19152)
@@ -72,7 +72,7 @@
 ! second dimension : #rho  #vp  #vs  #Q_flag  #anisotropy_flag #domain_id
   double precision , dimension(NMATERIALS,6) ::  material_properties
   double precision , dimension(6) :: matpropl
-  integer i,ispec,iglob
+  integer :: i,ispec,iglob,ier
 
 ! name of the database files
   character(len=256) prname
@@ -82,67 +82,69 @@
   logical, dimension(8) ::  interfaces
   integer, dimension(8) ::  nspec_interface
 
+  integer, parameter :: IIN_database = 15
 
-  !open(unit=15,file=prname(1:len_trim(prname))//'Database',status='unknown',action='write',form='formatted')
-  open(unit=15,file=prname(1:len_trim(prname))//'Database', &
-        status='unknown',action='write',form='unformatted')
+  open(unit=IIN_database,file=prname(1:len_trim(prname))//'Database', &
+        status='unknown',action='write',form='unformatted',iostat=ier)
+  if( ier /= 0 ) stop 'error opening Database file'
 
-  write(15) nglob
+  write(IIN_database) nglob
   do iglob=1,nglob
-     write(15) iglob,nodes_coords(iglob,1),nodes_coords(iglob,2),nodes_coords(iglob,3)
+     write(IIN_database) iglob,nodes_coords(iglob,1),nodes_coords(iglob,2),nodes_coords(iglob,3)
   end do
 
 
 ! Materials properties
-   write(15) NMATERIALS, 0
+   write(IIN_database) NMATERIALS, 0
    do idoubl = 1,NMATERIALS
-      !write(15,*) material_properties(idoubl,:)
-      matpropl(:) = material_properties(idoubl,:)
-      write(15) matpropl
+      !write(IIN_database,*) material_properties(idoubl,:)
+      matpropl(:) = 0.d0
+      matpropl(1:6) = material_properties(idoubl,1:6)
+      ! pad dummy zeros to fill up 16 entries (poroelastic medium not allowed)
+      write(IIN_database) matpropl
    end do
 
 
-  write(15) nspec
+  write(IIN_database) nspec
   do ispec=1,nspec
-      !write(15,'(11i14)') ispec,true_material_num(ispec),1,ibool(1,1,1,ispec),ibool(2,1,1,ispec),&
+      !write(IIN_database,'(11i14)') ispec,true_material_num(ispec),1,ibool(1,1,1,ispec),ibool(2,1,1,ispec),&
       !     ibool(2,2,1,ispec),ibool(1,2,1,ispec),ibool(1,1,2,ispec),&
       !     ibool(2,1,2,ispec),ibool(2,2,2,ispec),ibool(1,2,2,ispec)
-      write(15) ispec,true_material_num(ispec),1,ibool(1,1,1,ispec),ibool(2,1,1,ispec),&
+      write(IIN_database) ispec,true_material_num(ispec),1,ibool(1,1,1,ispec),ibool(2,1,1,ispec),&
            ibool(2,2,1,ispec),ibool(1,2,1,ispec),ibool(1,1,2,ispec),&
            ibool(2,1,2,ispec),ibool(2,2,2,ispec),ibool(1,2,2,ispec)
-
   end do
 
   ! Boundaries
-  write(15) 1,nspec2D_xmin
-  write(15) 2,nspec2D_xmax
-  write(15) 3,nspec2D_ymin
-  write(15) 4,nspec2D_ymax
-  write(15) 5,NSPEC2D_BOTTOM
-  write(15) 6,NSPEC2D_TOP
+  write(IIN_database) 1,nspec2D_xmin
+  write(IIN_database) 2,nspec2D_xmax
+  write(IIN_database) 3,nspec2D_ymin
+  write(IIN_database) 4,nspec2D_ymax
+  write(IIN_database) 5,NSPEC2D_BOTTOM
+  write(IIN_database) 6,NSPEC2D_TOP
 
   do i=1,nspec2D_xmin
-     write(15) ibelm_xmin(i),ibool(1,1,1,ibelm_xmin(i)),ibool(1,NGLLY,1,ibelm_xmin(i)),&
+     write(IIN_database) ibelm_xmin(i),ibool(1,1,1,ibelm_xmin(i)),ibool(1,NGLLY,1,ibelm_xmin(i)),&
           ibool(1,1,NGLLZ,ibelm_xmin(i)),ibool(1,NGLLY,NGLLZ,ibelm_xmin(i))
   end do
   do i=1,nspec2D_xmax
-     write(15) ibelm_xmax(i),ibool(NGLLX,1,1,ibelm_xmax(i)),ibool(NGLLX,NGLLY,1,ibelm_xmax(i)), &
+     write(IIN_database) ibelm_xmax(i),ibool(NGLLX,1,1,ibelm_xmax(i)),ibool(NGLLX,NGLLY,1,ibelm_xmax(i)), &
           ibool(NGLLX,1,NGLLZ,ibelm_xmax(i)),ibool(NGLLX,NGLLY,NGLLZ,ibelm_xmax(i))
   end do
   do i=1,nspec2D_ymin
-     write(15) ibelm_ymin(i),ibool(1,1,1,ibelm_ymin(i)),ibool(NGLLX,1,1,ibelm_ymin(i)),&
+     write(IIN_database) ibelm_ymin(i),ibool(1,1,1,ibelm_ymin(i)),ibool(NGLLX,1,1,ibelm_ymin(i)),&
           ibool(1,1,NGLLZ,ibelm_ymin(i)),ibool(NGLLX,1,NGLLZ,ibelm_ymin(i))
   end do
   do i=1,nspec2D_ymax
-     write(15) ibelm_ymax(i),ibool(NGLLX,NGLLY,1,ibelm_ymax(i)),ibool(1,NGLLY,1,ibelm_ymax(i)), &
+     write(IIN_database) ibelm_ymax(i),ibool(NGLLX,NGLLY,1,ibelm_ymax(i)),ibool(1,NGLLY,1,ibelm_ymax(i)), &
           ibool(NGLLX,NGLLY,NGLLZ,ibelm_ymax(i)),ibool(1,NGLLY,NGLLZ,ibelm_ymax(i))
   end do
   do i=1,NSPEC2D_BOTTOM
-     write(15) ibelm_bottom(i),ibool(1,1,1,ibelm_bottom(i)),ibool(NGLLX,1,1,ibelm_bottom(i)), &
+     write(IIN_database) ibelm_bottom(i),ibool(1,1,1,ibelm_bottom(i)),ibool(NGLLX,1,1,ibelm_bottom(i)), &
           ibool(NGLLX,NGLLY,1,ibelm_bottom(i)),ibool(1,NGLLY,1,ibelm_bottom(i))
   end do
   do i=1,NSPEC2D_TOP
-     write(15) ibelm_top(i),ibool(1,1,NGLLZ,ibelm_top(i)),ibool(NGLLX,1,NGLLZ,ibelm_top(i)), &
+     write(IIN_database) ibelm_top(i),ibool(1,1,NGLLZ,ibelm_top(i)),ibool(NGLLX,1,NGLLZ,ibelm_top(i)), &
           ibool(NGLLX,NGLLY,NGLLZ,ibelm_top(i)),ibool(1,NGLLY,NGLLZ,ibelm_top(i))
   end do
 
@@ -200,86 +202,82 @@
 
   nspec_interfaces_max = maxval(nspec_interface)
 
-  write(15) nb_interfaces,nspec_interfaces_max
+  write(IIN_database) nb_interfaces,nspec_interfaces_max
 
   if(interfaces(W)) then
-     write(15) addressing(iproc_xi-1,iproc_eta),nspec_interface(W)
+     write(IIN_database) addressing(iproc_xi-1,iproc_eta),nspec_interface(W)
      do ispec = 1,nspec
-        if(iMPIcut_xi(1,ispec))  write(15) ispec,4,ibool(1,1,1,ispec),ibool(1,2,1,ispec), &
+        if(iMPIcut_xi(1,ispec))  write(IIN_database) ispec,4,ibool(1,1,1,ispec),ibool(1,2,1,ispec), &
              ibool(1,1,2,ispec),ibool(1,2,2,ispec)
      end do
   end if
 
   if(interfaces(E)) then
-     write(15) addressing(iproc_xi+1,iproc_eta),nspec_interface(E)
+     write(IIN_database) addressing(iproc_xi+1,iproc_eta),nspec_interface(E)
      do ispec = 1,nspec
-        if(iMPIcut_xi(2,ispec))  write(15) ispec,4,ibool(2,1,1,ispec),ibool(2,2,1,ispec), &
+        if(iMPIcut_xi(2,ispec))  write(IIN_database) ispec,4,ibool(2,1,1,ispec),ibool(2,2,1,ispec), &
              ibool(2,1,2,ispec),ibool(2,2,2,ispec)
      end do
   end if
 
    if(interfaces(S)) then
-     write(15) addressing(iproc_xi,iproc_eta-1),nspec_interface(S)
+     write(IIN_database) addressing(iproc_xi,iproc_eta-1),nspec_interface(S)
      do ispec = 1,nspec
-        if(iMPIcut_eta(1,ispec))  write(15) ispec,4,ibool(1,1,1,ispec),ibool(2,1,1,ispec), &
+        if(iMPIcut_eta(1,ispec))  write(IIN_database) ispec,4,ibool(1,1,1,ispec),ibool(2,1,1,ispec), &
              ibool(1,1,2,ispec),ibool(2,1,2,ispec)
      end do
   end if
 
   if(interfaces(N)) then
-     write(15) addressing(iproc_xi,iproc_eta+1),nspec_interface(N)
+     write(IIN_database) addressing(iproc_xi,iproc_eta+1),nspec_interface(N)
      do ispec = 1,nspec
-        if(iMPIcut_eta(2,ispec))  write(15) ispec,4,ibool(2,2,1,ispec),ibool(1,2,1,ispec), &
+        if(iMPIcut_eta(2,ispec))  write(IIN_database) ispec,4,ibool(2,2,1,ispec),ibool(1,2,1,ispec), &
              ibool(2,2,2,ispec),ibool(1,2,2,ispec)
      end do
   end if
 
   if(interfaces(NW)) then
-     write(15) addressing(iproc_xi-1,iproc_eta+1),nspec_interface(NW)
+     write(IIN_database) addressing(iproc_xi-1,iproc_eta+1),nspec_interface(NW)
      do ispec = 1,nspec
         if((iMPIcut_xi(1,ispec) .eqv. .true.) .and. (iMPIcut_eta(2,ispec) .eqv. .true.))  then
-           write(15) ispec,2,ibool(1,2,1,ispec),ibool(1,2,2,ispec),-1,-1
+           write(IIN_database) ispec,2,ibool(1,2,1,ispec),ibool(1,2,2,ispec),-1,-1
         end if
      end do
   end if
 
   if(interfaces(NE)) then
-     write(15) addressing(iproc_xi+1,iproc_eta+1),nspec_interface(NE)
+     write(IIN_database) addressing(iproc_xi+1,iproc_eta+1),nspec_interface(NE)
      do ispec = 1,nspec
         if((iMPIcut_xi(2,ispec) .eqv. .true.) .and. (iMPIcut_eta(2,ispec) .eqv. .true.))  then
-           write(15) ispec,2,ibool(2,2,1,ispec),ibool(2,2,2,ispec),-1,-1
+           write(IIN_database) ispec,2,ibool(2,2,1,ispec),ibool(2,2,2,ispec),-1,-1
         end if
      end do
   end if
 
   if(interfaces(SE)) then
-     write(15) addressing(iproc_xi+1,iproc_eta-1),nspec_interface(SE)
+     write(IIN_database) addressing(iproc_xi+1,iproc_eta-1),nspec_interface(SE)
      do ispec = 1,nspec
         if((iMPIcut_xi(2,ispec) .eqv. .true.) .and. (iMPIcut_eta(1,ispec) .eqv. .true.))  then
-           write(15) ispec,2,ibool(2,1,1,ispec),ibool(2,1,2,ispec),-1,-1
+           write(IIN_database) ispec,2,ibool(2,1,1,ispec),ibool(2,1,2,ispec),-1,-1
         end if
      end do
   end if
 
   if(interfaces(SW)) then
-     write(15) addressing(iproc_xi-1,iproc_eta-1),nspec_interface(SW)
+     write(IIN_database) addressing(iproc_xi-1,iproc_eta-1),nspec_interface(SW)
      do ispec = 1,nspec
         if((iMPIcut_xi(1,ispec) .eqv. .true.) .and. (iMPIcut_eta(1,ispec) .eqv. .true.))  then
-           write(15) ispec,2,ibool(1,1,1,ispec),ibool(1,1,2,ispec),-1,-1
+           write(IIN_database) ispec,2,ibool(1,1,1,ispec),ibool(1,1,2,ispec),-1,-1
         end if
      end do
   end if
 
   else
 
-     write(15) 0,0
+     write(IIN_database) 0,0
 
   end if
 
-  close(15)
+  close(IIN_database)
 
-
   end subroutine save_databases
-
-
-

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in	2011-11-06 02:02:36 UTC (rev 19152)
@@ -133,7 +133,8 @@
 
 CUDA_OBJECTS = \
 	$O/check_fields_cuda.cuda.o \
-	$O/compute_add_sources_cuda.cuda.o \
+	$O/compute_add_sources_acoustic_cuda.cuda.o \
+	$O/compute_add_sources_elastic_cuda.cuda.o \
 	$O/compute_coupling_cuda.cuda.o \
 	$O/compute_forces_acoustic_cuda.cuda.o \
 	$O/compute_forces_elastic_cuda.cuda.o \

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90	2011-11-06 02:02:36 UTC (rev 19152)
@@ -133,8 +133,8 @@
         ! write(*,*) "fortran dt = ", dt
         ! change dt -> DT
         call compute_add_sources_ac_cuda(Mesh_pointer, phase_is_inner, &
-                                              NSOURCES, SIMULATION_TYPE, &
-                                              USE_FORCE_POINT_SOURCE, stf_pre_compute, myrank)
+                                        NSOURCES, SIMULATION_TYPE, &
+                                        stf_pre_compute, myrank)
       endif
 
     else ! .NOT. GPU_MODE
@@ -411,8 +411,8 @@
 
         ! only implements SIMTYPE=3
         call compute_add_sources_ac_s3_cuda(Mesh_pointer, phase_is_inner, &
-                                    NSOURCES, SIMULATION_TYPE, &
-                                    USE_FORCE_POINT_SOURCE, stf_pre_compute, myrank)
+                                           NSOURCES, SIMULATION_TYPE, &
+                                           stf_pre_compute, myrank)
       endif
 
     else ! .NOT. GPU_MODE
@@ -512,5 +512,4 @@
     if( myrank == 0 ) write(IOSTF,*) time_source,stf_used_total_all
   endif
 
-
-end subroutine compute_add_sources_acoustic
+  end subroutine compute_add_sources_acoustic

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90	2011-11-06 02:02:36 UTC (rev 19152)
@@ -133,11 +133,7 @@
       ! write(*,*) "fortran dt = ", dt
       ! change dt -> DT
       call compute_add_sources_el_cuda(Mesh_pointer, &
-                                      !NSPEC_AB, NGLOB_AB,
                                       phase_is_inner,NSOURCES, &
-                                      !it, DT, t0, &
-                                      !SIMULATION_TYPE, NSTEP, NOISE_TOMOGRAPHY,&
-                                      !USE_FORCE_POINT_SOURCE, &
                                       stf_pre_compute, myrank)
 
     else ! .NOT. GPU_MODE
@@ -363,7 +359,7 @@
             endif
           enddo ! nrec
         else ! GPU_MODE == .true.
-           call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
+          call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
                                             ispec_is_inner,ispec_is_elastic, &
                                             ispec_selected_rec,myrank,nrec, &
                                             NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
@@ -387,8 +383,8 @@
                                           dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
       enddo
 
-      call compute_add_sources_el_s3_cuda(Mesh_pointer, USE_FORCE_POINT_SOURCE,&
-                                          stf_pre_compute, NSOURCES,phase_is_inner,myrank)
+      call compute_add_sources_el_s3_cuda(Mesh_pointer,stf_pre_compute, &
+                                         NSOURCES,phase_is_inner,myrank)
 
     else ! .NOT. GPU_MODE
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90	2011-11-06 02:02:36 UTC (rev 19152)
@@ -124,7 +124,8 @@
       call compute_forces_elastic_cuda(Mesh_pointer, iphase, &
                                       nspec_outer_elastic, &
                                       nspec_inner_elastic, &
-                                      SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,ATTENUATION)
+                                      SIMULATION_TYPE, &
+                                      COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
     endif ! GPU_MODE
 
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90	2011-11-06 02:02:36 UTC (rev 19152)
@@ -217,8 +217,6 @@
       if( N_SLS /= 3 ) &
         stop 'GPU mode does not support N_SLS /= 3 yet'
     endif
-    if( ANISOTROPY ) &
-      stop 'GPU mode does not support ANISOTROPY yet'
   endif
 
   ! absorbing surfaces

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90	2011-11-06 02:02:36 UTC (rev 19152)
@@ -184,6 +184,11 @@
       endif
     endif
   endif
+  ! check stability of the code, exit if unstable
+  ! negative values can occur with some compilers when the unstable value is greater
+  ! than the greatest possible floating-point number of the machine
+  if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0) &
+    call exit_MPI(myrank,'forward simulation became unstable and blew up')
 
 ! compute the maximum of the maxima for all the slices using an MPI reduction
   call max_all_cr(Usolidnorm,Usolidnorm_all)
@@ -207,6 +212,13 @@
         endif
       endif
     endif
+    ! check stability of the code, exit if unstable
+    ! negative values can occur with some compilers when the unstable value is greater
+    ! than the greatest possible floating-point number of the machine
+    if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0) &
+      call exit_MPI(myrank,'backward simulation became unstable and blew up')
+
+    ! compute max of all slices
     call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
   endif
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-11-06 02:02:36 UTC (rev 19152)
@@ -780,7 +780,7 @@
                                   nrec, nrec_local, &
                                   SIMULATION_TYPE, &
                                   USE_MESH_COLORING_GPU,nspec_acoustic,nspec_elastic, &
-                                  ncuda_devices)
+                                  myrank,ncuda_devices)
 
   call min_all_i(ncuda_devices,ncuda_devices_min)
   call max_all_i(ncuda_devices,ncuda_devices_max)
@@ -828,7 +828,12 @@
                                   num_free_surface_faces, &
                                   ACOUSTIC_SIMULATION, &
                                   num_colors_outer_elastic,num_colors_inner_elastic, &
-                                  num_elem_colors_elastic)
+                                  num_elem_colors_elastic, &
+                                  ANISOTROPY, &
+                                  c11store,c12store,c13store,c14store,c15store,c16store, &
+                                  c22store,c23store,c24store,c25store,c26store, &
+                                  c33store,c34store,c35store,c36store, &
+                                  c44store,c45store,c46store,c55store,c56store,c66store)
 
     if( SIMULATION_TYPE == 3 ) &
       call prepare_fields_elastic_adj_dev(Mesh_pointer, NDIM*NGLOB_AB, &

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2011-11-06 02:02:36 UTC (rev 19152)
@@ -407,17 +407,17 @@
                               kappastore,mustore,rho_vp,rho_vs, &
                               DT,model_speed_max,min_resolved_period )
   else if( ACOUSTIC_SIMULATION ) then
-      allocate(rho_vp(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array rho_vp'
-      allocate(rho_vs(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array rho_vs'
-      rho_vp = sqrt( kappastore / rhostore ) * rhostore
-      rho_vs = 0.0_CUSTOM_REAL
-      call check_mesh_resolution(myrank,NSPEC_AB,NGLOB_AB, &
+    allocate(rho_vp(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array rho_vp'
+    allocate(rho_vs(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array rho_vs'
+    rho_vp = sqrt( kappastore / rhostore ) * rhostore
+    rho_vs = 0.0_CUSTOM_REAL
+    call check_mesh_resolution(myrank,NSPEC_AB,NGLOB_AB, &
                                 ibool,xstore,ystore,zstore, &
                                 kappastore,mustore,rho_vp,rho_vs, &
                                 DT,model_speed_max,min_resolved_period )
-      deallocate(rho_vp,rho_vs)
+    deallocate(rho_vp,rho_vs)
   endif
 
 ! reads adjoint parameters