[cig-commits] r22718 - in seismo/3D/SPECFEM3D/trunk: . src/cuda src/generate_databases src/shared src/specfem3D

danielpeter at geodynamics.org
Tue Aug 20 07:13:27 PDT 2013


Author: danielpeter
Date: 2013-08-20 07:13:26 -0700 (Tue, 20 Aug 2013)
New Revision: 22718

Added:
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/check_stability.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_total_energy.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/update_displacement_scheme.f90
Modified:
   seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_viscoelastic_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_coupling_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_viscoelastic_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_viscoelastic_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D/trunk/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_constants_cuda.h
   seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D/trunk/src/cuda/transfer_fields_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_MPI.f90
   seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_coupling_surfaces.f90
   seismo/3D/SPECFEM3D/trunk/src/generate_databases/memory_eval.f90
   seismo/3D/SPECFEM3D/trunk/src/shared/assemble_MPI_scalar.f90
   seismo/3D/SPECFEM3D/trunk/src/shared/constants.h.in
   seismo/3D/SPECFEM3D/trunk/src/shared/detect_surface.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_acoustic.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_poroelastic.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_arrays_source.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic_calling_routine.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_acoustic.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_poroelastic.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/fault_solver_common.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/finalize_simulation.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.F90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/specfem3D_par.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/write_seismograms.f90
   seismo/3D/SPECFEM3D/trunk/todo_list_please_dont_remove.txt
Log:
adds new files update_displacement_scheme.f90, check_stability.f90 and compute_total_energy.f90 for the corresponding routines; renames some routines and variables to drop the obsolete _ext_mesh suffix; fixes bugs in the CUDA routines and in kernel simulations
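
For reference, the CUDA-side cleanup includes folding the repeated "while (num_blocks_x > 65535)" loops into a new get_blocks_xy() helper (see the check_fields_cuda.cu diff below). A minimal sketch of that logic and of the guard it requires in kernels; MAXIMUM_GRID_DIM and demo_scale_kernel are placeholder names for this sketch only:

  #include <math.h>

  #define MAXIMUM_GRID_DIM 65535  // assumed value for this sketch; the real constant is defined in the CUDA sources

  // folds a 1-D block count into an (x,y) grid: x is halved (rounded up) and
  // y doubled until x fits below the grid-dimension limit
  static void get_blocks_xy_sketch(int num_blocks, int* num_blocks_x, int* num_blocks_y) {
    *num_blocks_x = num_blocks;
    *num_blocks_y = 1;
    while (*num_blocks_x > MAXIMUM_GRID_DIM) {
      *num_blocks_x = (int) ceil(*num_blocks_x * 0.5f);
      *num_blocks_y = *num_blocks_y * 2;
    }
  }

  // kernels launched on such a grid flatten the block index and skip the
  // extra block that the rounding above may create
  __global__ void demo_scale_kernel(float* array, int size) {
    int bx = blockIdx.x + gridDim.x*blockIdx.y;
    int i  = threadIdx.x + bx*blockDim.x;
    if (i < size) array[i] *= 2.0f;
  }

Because the halving rounds up, an odd block count leaves one surplus block, which is why every kernel launched on such a grid keeps the "if (i < size)" bounds check.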

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -39,7 +39,6 @@
 
 #include "config.h"
 #include "mesh_constants_cuda.h"
-//#include "prepare_constants_cuda.h"
 
 /* ----------------------------------------------------------------------------------------------- */
 
@@ -64,6 +63,7 @@
   pause_for_debugger(1);
 }
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
 void pause_for_debugger(int pause) {
@@ -94,7 +94,7 @@
 
 void exit_on_cuda_error(char* kernel_name) {
   // sync and check to catch errors from previous async operations
-  cudaThreadSynchronize();
+  synchronize_cuda();
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess){
     fprintf(stderr,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
@@ -193,8 +193,50 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+void synchronize_cuda(){
+#if CUDA_VERSION >= 4000
+    cudaDeviceSynchronize();
+#else
+    cudaThreadSynchronize();
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void synchronize_mpi(){
+#ifdef WITH_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) {
+
+// Initially sets num_blocks_x to num_blocks and adds rows as needed (grid dimension limit of 65535).
+// Each time a row is added, the row length is cut in half.
+// If the block count is odd, this creates one extra block,
+// which the kernel must skip at runtime with an if statement.
+
+  *num_blocks_x = num_blocks;
+  *num_blocks_y = 1;
+
+  while(*num_blocks_x > MAXIMUM_GRID_DIM) {
+    *num_blocks_x = (int) ceil(*num_blocks_x * 0.5f);
+    *num_blocks_y = *num_blocks_y * 2;
+  }
+
+  return;
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
 void get_free_memory(double* free_db, double* used_db, double* total_db) {
 
+  TRACE("get_free_memory");
+
   // gets memory usage in byte
   size_t free_byte ;
   size_t total_byte ;
@@ -215,13 +257,15 @@
 // Saves GPU memory usage to file
 void output_free_memory(int myrank,char* info_str) {
 
+  TRACE("output_free_memory");
+
   FILE* fp;
   char filename[BUFSIZ];
   double free_db,used_db,total_db;
 
   get_free_memory(&free_db,&used_db,&total_db);
 
-  sprintf(filename,"../OUTPUT_FILES/gpu_device_mem_usage_proc_%06d.txt",myrank);
+  sprintf(filename,"../OUTPUT_FILES/gpu_memory_usage_proc_%06d.txt",myrank);
   fp = fopen(filename,"a+");
   if (fp != NULL){
     fprintf(fp,"%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
@@ -235,12 +279,14 @@
 // Fortran-callable version of above method
 extern "C"
 void FC_FUNC_(output_free_device_memory,
-              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank_f) {
   TRACE("output_free_device_memory");
 
-  char info[6];
-  sprintf(info,"f %d:",*myrank);
-  output_free_memory(*myrank,info);
+  char info[64];
+  int myrank = *myrank_f;
+
+  sprintf(info,"f %d:",myrank);
+  output_free_memory(myrank,info);
 }
 
 
@@ -264,7 +310,411 @@
 }
 
 
+
 /* ----------------------------------------------------------------------------------------------- */
+
+// Auxiliary functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+__global__ void memset_to_realw_kernel(realw* array, int size, realw value){
+
+  unsigned int tid = threadIdx.x;
+  unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  unsigned int i = tid + bx*blockDim.x;
+
+  if( i < size ){
+    array[i] = *value;
+  }
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+realw get_device_array_maximum_value(realw* array, int size){
+
+// get maximum of array on GPU by copying over to CPU and handle it there
+
+  realw max = 0.0f;
+
+  // checks if anything to do
+  if( size > 0 ){
+    realw* h_array;
+
+    // explicitly wait for cuda kernels to finish
+    // (cudaMemcpy implicitly synchronizes all other cuda operations)
+    synchronize_cuda();
+
+    h_array = (realw*)calloc(size,sizeof(realw));
+    print_CUDA_error_if_any(cudaMemcpy(h_array,array,sizeof(realw)*size,cudaMemcpyDeviceToHost),33001);
+
+    // finds maximum value in array
+    max = h_array[0];
+    for( int i=1; i < size; i++){
+      if( abs(h_array[i]) > max ) max = abs(h_array[i]);
+    }
+    free(h_array);
+  }
+  return max;
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void get_maximum_kernel(realw* array, int size, realw* d_max){
+
+  /* simplest version: uses only 1 thread
+   realw max;
+   max = 0;
+   // finds maximum value in array
+   if( size > 0 ){
+   max = abs(array[0]);
+   for( int i=1; i < size; i++){
+   if( abs(array[i]) > max ) max = abs(array[i]);
+   }
+   }
+   *d_max = max;
+   */
+
+  // reduction example:
+  __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
+
+  // load shared mem
+  unsigned int tid = threadIdx.x;
+  unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  unsigned int i = tid + bx*blockDim.x;
+
+  // loads absolute values into shared memory
+  sdata[tid] = (i < size) ? fabs(array[i]) : 0.0 ;
+
+  __syncthreads();
+
+  // do reduction in shared mem
+  for(unsigned int s=blockDim.x/2; s>0; s>>=1)
+  {
+    if (tid < s){
+      // summation:
+      //sdata[tid] += sdata[tid + s];
+      // maximum:
+      if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
+    }
+    __syncthreads();
+  }
+
+  // write result for this block to global mem
+  if (tid == 0) d_max[bx] = sdata[0];
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(get_norm_acoustic_from_device,
+              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,long* Mesh_pointer,int* sim_type) {
+
+  TRACE("get_norm_acoustic_from_device");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  realw max = 0.0;
+  realw *d_max;
+
+  //initializes
+  *norm = 0.0f;
+
+  /* way 1 : timing Elapsed time: 8.464813e-03
+   realw* h_array;
+   h_array = (realw*)calloc(mp->NGLOB_AB,sizeof(realw));
+
+   print_CUDA_error_if_any(cudaMemcpy(h_array,mp->d_potential_dot_dot_acoustic,
+   sizeof(realw)*(mp->NGLOB_AB),cudaMemcpyDeviceToHost),131);
+
+   // finds maximum value in array
+   max = h_array[0];
+   for( int i=1; i < mp->NGLOB_AB; i++){
+   if( abs(h_array[i]) > max ) max = abs(h_array[i]);
+   }
+   free(h_array);
+   */
+
+  /* way 2: timing Elapsed time: 8.818102e-02
+   // launch simple kernel
+   cudaMalloc((void**)&d_max,sizeof(realw));
+
+   dim3 grid(1,1);
+   dim3 threads(1,1,1);
+
+   get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+   mp->NGLOB_AB,
+   d_max);
+   print_CUDA_error_if_any(cudaMemcpy(&max,d_max, sizeof(realw), cudaMemcpyDeviceToHost),222);
+
+   cudaFree(d_max);
+   */
+
+  // way 2 b: timing Elapsed time: 1.236916e-03
+  // launch simple reduction kernel
+  realw* h_max;
+  int blocksize = BLOCKSIZE_TRANSFER;
+
+  int size = mp->NGLOB_AB;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  //printf("num_blocks_x %i \n",num_blocks_x);
+
+  // on host (allocates & initializes to zero)
+  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+
+  // allocates memory on device
+  print_CUDA_error_if_any(cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw)),78001);
+  // initializes values to zero
+  print_CUDA_error_if_any(cudaMemset(d_max,0,num_blocks_x*num_blocks_y*sizeof(realw)),77002);
+
+
+  if(*sim_type == 1 ){
+    get_maximum_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,size,d_max);
+  }else if(*sim_type == 3 ){
+    get_maximum_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_dot_dot_acoustic,size,d_max);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("kernel get_maximum_kernel");
+#endif
+
+  // synchronizes
+  //synchronize_cuda();
+  // explicitly waits for stream to finish
+  // (cudaMemcpy implicitly synchronizes all other cuda operations)
+  cudaStreamSynchronize(mp->compute_stream);
+
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),222);
+
+  // determines max for all blocks
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+
+  cudaFree(d_max);
+  free(h_max);
+
+  /* way 3: doesn't work properly...
+   cublasStatus status;
+
+   // Initialize CUBLAS
+   status = cublasInit();
+   if (status != CUBLAS_STATUS_SUCCESS) {
+   fprintf (stderr, "!!!! CUBLAS initialization error\n");
+   exit(1);
+   }
+
+   // cublas function: cublasIsamax
+   //       finds the smallest index of the maximum magnitude element of single 
+   //      precision vector x
+   int incr = 1;
+   int imax = 0;
+   imax = cublasIsamax(mp->NGLOB_AB,(realw*)mp->d_potential_dot_dot_acoustic, incr);
+   status= cublasGetError();
+   if (status != CUBLAS_STATUS_SUCCESS) {
+   fprintf (stderr, "!!!! CUBLAS error in cublasIsamax\n");
+   exit(1);
+   }
+
+   print_CUDA_error_if_any(cudaMemcpy(&max,&(mp->d_potential_dot_dot_acoustic[imax]),
+                      sizeof(realw), cudaMemcpyDeviceToHost),222);
+
+   printf("maximum %i %i %f \n",mp->NGLOB_AB,imax,max);
+
+   // Shutdown
+   status = cublasShutdown();
+   if (status != CUBLAS_STATUS_SUCCESS) {
+   fprintf (stderr, "!!!! shutdown error (A)\n");
+   exit(1);
+   }
+
+   */
+
+  // return result
+  *norm = max;
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("get_norm_acoustic_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
+
+  // reduction example:
+  __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
+
+  // load shared mem
+  unsigned int tid = threadIdx.x;
+  unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  unsigned int i = tid + bx*blockDim.x;
+
+  // loads values into shared memory: assume array is a vector array
+  sdata[tid] = (i < size) ? (array[i*3]*array[i*3] + array[i*3+1]*array[i*3+1] + array[i*3+2]*array[i*3+2]) : 0.0 ;
+
+  __syncthreads();
+
+  // do reduction in shared mem
+  for(unsigned int s=blockDim.x/2; s>0; s>>=1)
+  {
+    if (tid < s){
+      // summation:
+      //sdata[tid] += sdata[tid + s];
+      // maximum:
+      if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
+    }
+    __syncthreads();
+  }
+
+  // write result for this block to global mem
+  if (tid == 0) d_max[bx] = sdata[0];
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(get_norm_elastic_from_device,
+              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
+                                            long* Mesh_pointer,
+                                            int* type) {
+
+  TRACE("\tget_norm_elastic_from_device");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  realw max,res;
+  realw *d_max;
+
+  //initializes
+  *norm = 0.0f;
+
+  // launch simple reduction kernel
+  realw* h_max;
+  int blocksize = BLOCKSIZE_TRANSFER;
+
+  int size = mp->NGLOB_AB;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // on host (allocates & initializes to zero)
+  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+
+  // allocates memory on device
+  print_CUDA_error_if_any(cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw)),77001);
+  // initializes values to zero
+  print_CUDA_error_if_any(cudaMemset(d_max,0,num_blocks_x*num_blocks_y*sizeof(realw)),77002);
+
+  if(*type == 1 ){
+    get_maximum_vector_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ,size,d_max);
+  }else if(*type == 3 ){
+    get_maximum_vector_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ,size,d_max);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("kernel get_norm_elastic_from_device");
+#endif
+
+  // synchronizes
+  //synchronize_cuda();
+  // explicitly waits for stream to finish
+  // (cudaMemcpy implicitly synchronizes all other cuda operations)
+  cudaStreamSynchronize(mp->compute_stream);
+
+  // copies reduction array back to CPU
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),222);
+
+  // determines max for all blocks
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+  res = sqrt(max);
+
+  // return result
+  *norm = res;
+
+  // debug
+  //printf("rank % d - type: %d norm: %f \n",mp->myrank,*type,res);
+
+  cudaFree(d_max);
+  free(h_max);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("get_norm_elastic_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// unused ...
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+extern "C"
+void FC_FUNC_(get_max_accel,
+              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {
+
+TRACE("get_max_accel");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+#else
+  procid = 0;
+#endif
+  int size = *sizef;
+  int it = *itf;
+  realw* accel_cpy = (realw*)malloc(size*sizeof(realw));
+  cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  realw maxval=0;
+  for(int i=0;i<size;++i) {
+    maxval = MAX(maxval,accel_cpy[i]);
+  }
+  printf("%d/%d: max=%e\n",it,procid,maxval);
+  free(accel_cpy);
+}
+*/
+
+
+/* ----------------------------------------------------------------------------------------------- */
 //daniel: helper function
 /*
  __global__ void check_phase_ispec_kernel(int num_phase_ispec,
@@ -295,9 +745,9 @@
  }
  }
 
- void check_phase_ispec(long* Mesh_pointer_f,int type){
+ void check_phase_ispec(long* Mesh_pointer,int type){
 
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
  printf("check phase_ispec for type=%d\n",type);
 
@@ -331,7 +781,7 @@
  #endif
 
  }
- */
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 //daniel: helper function
@@ -358,9 +808,9 @@
  printf("check_ispec_is done: count = %d %d\n",count0,count1);
  }
 
- void check_ispec_is(long* Mesh_pointer_f,int type){
+ void check_ispec_is(long* Mesh_pointer,int type){
 
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
  printf("check ispec_is for type=%d\n",type);
 
@@ -395,7 +845,7 @@
  exit_on_cuda_error("check_ispec_is");
  #endif
  }
- */
+*/
 /* ----------------------------------------------------------------------------------------------- */
 //daniel: helper function
 /*
@@ -424,9 +874,9 @@
  printf("check_array_ispec done: count = %d %d \n",count0,count1);
  }
 
- void check_array_ispec(long* Mesh_pointer_f,int type){
+ void check_array_ispec(long* Mesh_pointer,int type){
 
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
  printf("check array_ispec for type=%d\n",type);
 
@@ -455,7 +905,7 @@
  #endif
 
  }
- */
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 
@@ -463,13 +913,16 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+//max: helper functions
+
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_displ_gpu,
-              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {
+              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer,int* announceID) {
 
-TRACE("check_max_norm_displ_gpu");
+  TRACE("check_max_norm_displ_gpu");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   cudaMemcpy(displ, mp->d_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
   realw maxnorm=0;
@@ -479,9 +932,10 @@
   }
   printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
 }
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_vector,
               CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {
@@ -504,9 +958,11 @@
   }
   printf("%d:maxnorm of vector %d [%d] = %e\n",procid,*announceID,maxloc,maxnorm);
 }
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_displ,
               CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {
@@ -520,16 +976,18 @@
   }
   printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
 }
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_b_displ_gpu,
-              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {
+              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer,int* announceID) {
 
-TRACE("check_max_norm_b_displ_gpu");
+  TRACE("check_max_norm_b_displ_gpu");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   realw* b_accel = (realw*)malloc(*size*sizeof(realw));
 
@@ -547,16 +1005,18 @@
   printf("%d: maxnorm of backward displ = %e\n",*announceID,maxnorm);
   printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm_accel);
 }
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_b_accel_gpu,
-              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {
+              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer,int* announceID) {
 
-TRACE("check_max_norm_b_accel_gpu");
+  TRACE("check_max_norm_b_accel_gpu");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
 
@@ -567,16 +1027,18 @@
   }
   printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm);
 }
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_b_veloc_gpu,
-              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {
+              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer,int* announceID) {
 
-TRACE("check_max_norm_b_veloc_gpu");
+  TRACE("check_max_norm_b_veloc_gpu");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   cudaMemcpy(b_veloc, mp->d_b_veloc,*size*sizeof(realw),cudaMemcpyDeviceToHost);
 
@@ -587,9 +1049,11 @@
   }
   printf("%d: maxnorm of backward veloc = %e\n",*announceID,maxnorm);
 }
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_b_displ,
               CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {
@@ -603,9 +1067,11 @@
   }
   printf("%d:maxnorm of backward displ = %e\n",*announceID,maxnorm);
 }
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_b_accel,
               CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {
@@ -619,9 +1085,11 @@
   }
   printf("%d:maxnorm of backward accel = %e\n",*announceID,maxnorm);
 }
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 
+/*
 extern "C"
 void FC_FUNC_(check_error_vectors,
               CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {
@@ -660,331 +1128,5 @@
   }
 
 }
+*/
 
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// Auxiliary functions
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(get_max_accel,
-              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {
-
-TRACE("get_max_accel");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer);
-  int procid;
-#ifdef WITH_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
-#else
-  procid = 0;
-#endif
-  int size = *sizef;
-  int it = *itf;
-  realw* accel_cpy = (realw*)malloc(size*sizeof(realw));
-  cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(realw),cudaMemcpyDeviceToHost);
-  realw maxval=0;
-  for(int i=0;i<size;++i) {
-    maxval = MAX(maxval,accel_cpy[i]);
-  }
-  printf("%d/%d: max=%e\n",it,procid,maxval);
-  free(accel_cpy);
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// ACOUSTIC simulations
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void get_maximum_kernel(realw* array, int size, realw* d_max){
-
-  /* simplest version: uses only 1 thread
-   realw max;
-   max = 0;
-   // finds maximum value in array
-   if( size > 0 ){
-   max = abs(array[0]);
-   for( int i=1; i < size; i++){
-   if( abs(array[i]) > max ) max = abs(array[i]);
-   }
-   }
-   *d_max = max;
-   */
-
-  // reduction example:
-  __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
-
-  // load shared mem
-  unsigned int tid = threadIdx.x;
-  unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  unsigned int i = tid + bx*blockDim.x;
-
-  // loads absolute values into shared memory
-  sdata[tid] = (i < size) ? fabs(array[i]) : 0.0 ;
-
-  __syncthreads();
-
-  // do reduction in shared mem
-  for(unsigned int s=blockDim.x/2; s>0; s>>=1)
-  {
-    if (tid < s){
-      // summation:
-      //sdata[tid] += sdata[tid + s];
-      // maximum:
-      if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
-    }
-    __syncthreads();
-  }
-
-  // write result for this block to global mem
-  if (tid == 0) d_max[bx] = sdata[0];
-
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(get_norm_acoustic_from_device,
-              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
-                                             long* Mesh_pointer_f) {
-
-TRACE("get_norm_acoustic_from_device");
-  //double start_time = get_time();
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  realw max;
-  realw *d_max;
-
-  max = 0.0;
-
-  /* way 1 : timing Elapsed time: 8.464813e-03
-   realw* h_array;
-   h_array = (realw*)calloc(mp->NGLOB_AB,sizeof(realw));
-
-   print_CUDA_error_if_any(cudaMemcpy(h_array,mp->d_potential_dot_dot_acoustic,
-   sizeof(realw)*(mp->NGLOB_AB),cudaMemcpyDeviceToHost),131);
-
-   // finds maximum value in array
-   max = h_array[0];
-   for( int i=1; i < mp->NGLOB_AB; i++){
-   if( abs(h_array[i]) > max ) max = abs(h_array[i]);
-   }
-   free(h_array);
-   */
-
-  /* way 2: timing Elapsed time: 8.818102e-02
-   // launch simple kernel
-   cudaMalloc((void**)&d_max,sizeof(realw));
-
-   dim3 grid(1,1);
-   dim3 threads(1,1,1);
-
-   get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-   mp->NGLOB_AB,
-   d_max);
-   print_CUDA_error_if_any(cudaMemcpy(&max,d_max, sizeof(realw), cudaMemcpyDeviceToHost),222);
-
-   cudaFree(d_max);
-   */
-
-  // way 2 b: timing Elapsed time: 1.236916e-03
-  // launch simple reduction kernel
-  realw* h_max;
-  int blocksize = BLOCKSIZE_TRANSFER;
-
-  int size = mp->NGLOB_AB;
-  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  //printf("num_blocks_x %i \n",num_blocks_x);
-
-  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
-  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-
-  if(mp->simulation_type == 1 ){
-    get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,size,d_max);
-  }else if(mp->simulation_type == 3 ){
-    get_maximum_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,size,d_max);
-  }
-
-  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
-                                     cudaMemcpyDeviceToHost),222);
-
-  // determines max for all blocks
-  max = h_max[0];
-  for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
-    if( max < h_max[i]) max = h_max[i];
-  }
-
-  cudaFree(d_max);
-  free(h_max);
-
-  /* way 3: doesn't work properly...
-   cublasStatus status;
-
-   // Initialize CUBLAS
-   status = cublasInit();
-   if (status != CUBLAS_STATUS_SUCCESS) {
-   fprintf (stderr, "!!!! CUBLAS initialization error\n");
-   exit(1);
-   }
-
-   // cublas function: cublasIsamax
-   //       finds the smallest index of the maximum magnitude element of single 
-   //      precision vector x
-   int incr = 1;
-   int imax = 0;
-   imax = cublasIsamax(mp->NGLOB_AB,(realw*)mp->d_potential_dot_dot_acoustic, incr);
-   status= cublasGetError();
-   if (status != CUBLAS_STATUS_SUCCESS) {
-   fprintf (stderr, "!!!! CUBLAS error in cublasIsamax\n");
-   exit(1);
-   }
-
-   print_CUDA_error_if_any(cudaMemcpy(&max,&(mp->d_potential_dot_dot_acoustic[imax]),
-                      sizeof(realw), cudaMemcpyDeviceToHost),222);
-
-   printf("maximum %i %i %f \n",mp->NGLOB_AB,imax,max);
-
-   // Shutdown
-   status = cublasShutdown();
-   if (status != CUBLAS_STATUS_SUCCESS) {
-   fprintf (stderr, "!!!! shutdown error (A)\n");
-   exit(1);
-   }
-
-   */
-
-  // return result
-  *norm = max;
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //double end_time = get_time();
-  //printf("Elapsed time: %e\n",end_time-start_time);
-  exit_on_cuda_error("get_norm_acoustic_from_device");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// ELASTIC simulations
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
-
-  // reduction example:
-  __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
-
-  // load shared mem
-  unsigned int tid = threadIdx.x;
-  unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  unsigned int i = tid + bx*blockDim.x;
-
-  // loads values into shared memory: assume array is a vector array
-  sdata[tid] = (i < size) ? sqrt(array[i*3]*array[i*3]
-                                 + array[i*3+1]*array[i*3+1]
-                                 + array[i*3+2]*array[i*3+2]) : 0.0 ;
-
-  __syncthreads();
-
-  // do reduction in shared mem
-  for(unsigned int s=blockDim.x/2; s>0; s>>=1)
-  {
-    if (tid < s){
-      // summation:
-      //sdata[tid] += sdata[tid + s];
-      // maximum:
-      if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
-    }
-    __syncthreads();
-  }
-
-  // write result for this block to global mem
-  if (tid == 0) d_max[bx] = sdata[0];
-
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(get_norm_elastic_from_device,
-              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
-                                            long* Mesh_pointer_f) {
-
-  TRACE("get_norm_elastic_from_device");
-  //double start_time = get_time();
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  realw max;
-  realw *d_max;
-
-  max = 0.0;
-
-  // launch simple reduction kernel
-  realw* h_max;
-  int blocksize = BLOCKSIZE_TRANSFER;
-
-  int size = mp->NGLOB_AB;
-  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  //printf("num_blocks_x %i \n",num_blocks_x);
-  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
-  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-
-  if(mp->simulation_type == 1 ){
-    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ,size,d_max);
-  }else if(mp->simulation_type == 3 ){
-    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ,size,d_max);
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //double end_time = get_time();
-  //printf("Elapsed time: %e\n",end_time-start_time);
-  exit_on_cuda_error("kernel get_norm_elastic_from_device");
-#endif
-
-  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
-                                     cudaMemcpyDeviceToHost),222);
-
-  // determines max for all blocks
-  max = h_max[0];
-  for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
-    if( max < h_max[i]) max = h_max[i];
-  }
-
-  cudaFree(d_max);
-  free(h_max);
-
-  // return result
-  *norm = max;
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //double end_time = get_time();
-  //printf("Elapsed time: %e\n",end_time-start_time);
-  exit_on_cuda_error("get_norm_elastic_from_device");
-#endif
-}

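The new norm routines above use a two-stage reduction: get_maximum_kernel and get_maximum_vector_kernel each write one partial maximum per block into d_max, and the host finishes the reduction after the cudaMemcpy. A minimal sketch of that host-side second stage, assuming realw is single precision and the per-block values are squared vector amplitudes as produced by get_maximum_vector_kernel:

  #include <math.h>

  typedef float realw;  // assumes the single-precision default

  // combines the per-block partial maxima copied back from the device and
  // takes the square root, since the kernel stores squared amplitudes
  static realw finish_vector_norm(const realw* h_max, int num_blocks) {
    realw max = h_max[0];
    for (int i = 1; i < num_blocks; i++) {
      if (h_max[i] > max) max = h_max[i];
    }
    return sqrtf(max);
  }

Note that the old get_maximum_vector_kernel (removed at the bottom of this diff) took sqrt per grid point; the new version stores the squared amplitude and defers a single sqrt to the host, which is equivalent because sqrt is monotonic and saves one transcendental per point.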
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_acoustic_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_acoustic_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -37,7 +37,6 @@
 
 #include "config.h"
 #include "mesh_constants_cuda.h"
-// #include "epik_user.h"
 
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -64,10 +63,8 @@
 
   int isource  = blockIdx.x + gridDim.x*blockIdx.y; // bx
 
-  int ispec;
-  int iglob;
-  realw stf;
-  realw kappal;
+  int ispec,iglob;
+  realw stf,kappal;
 
   if( isource < NSOURCES ){
 
@@ -77,13 +74,15 @@
 
       if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) {
 
+        iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+
         stf = (realw) stf_pre_compute[isource];
-        iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
         kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
 
         atomicAdd(&potential_dot_dot_acoustic[iglob],
                   -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal);
 
+        // debug: without atomic operation
         //      potential_dot_dot_acoustic[iglob] +=
         //                -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal;
       }
@@ -96,49 +95,43 @@
 
 extern "C"
 void FC_FUNC_(compute_add_sources_ac_cuda,
-              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
-                                                 int* phase_is_innerf,
-                                                 int* NSOURCESf,
-                                                 double* h_stf_pre_compute,
-                                                 int* myrankf) {
+              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer,
+                                           int* phase_is_innerf,
+                                           int* NSOURCESf,
+                                           double* h_stf_pre_compute) {
 
-TRACE("compute_add_sources_ac_cuda");
+  TRACE("compute_add_sources_ac_cuda");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   // check if anything to do
   if( mp->nsources_local == 0 ) return;
 
-  int phase_is_inner = *phase_is_innerf;
   int NSOURCES = *NSOURCESf;
-  int myrank = *myrankf;
+  int phase_is_inner = *phase_is_innerf;
 
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
   // copies pre-computed source time factors onto GPU
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
                                      NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
 
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(5,5,5);
 
-  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-                                                        mp->d_ibool,
-                                                        mp->d_ispec_is_inner,
-                                                        phase_is_inner,
-                                                        mp->d_sourcearrays,
-                                                        mp->d_stf_pre_compute,
-                                                        myrank,
-                                                        mp->d_islice_selected_source,
-                                                        mp->d_ispec_selected_source,
-                                                        mp->d_ispec_is_acoustic,
-                                                        mp->d_kappastore,
-                                                        NSOURCES);
+  compute_add_sources_acoustic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
+                                                                              mp->d_ibool,
+                                                                              mp->d_ispec_is_inner,
+                                                                              phase_is_inner,
+                                                                              mp->d_sourcearrays,
+                                                                              mp->d_stf_pre_compute,
+                                                                              mp->myrank,
+                                                                              mp->d_islice_selected_source,
+                                                                              mp->d_ispec_selected_source,
+                                                                              mp->d_ispec_is_acoustic,
+                                                                              mp->d_kappastore,
+                                                                              NSOURCES);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_add_sources_ac_cuda");
@@ -149,49 +142,43 @@
 
 extern "C"
 void FC_FUNC_(compute_add_sources_ac_s3_cuda,
-              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
-                                                      int* phase_is_innerf,
-                                                      int* NSOURCESf,
-                                                      double* h_stf_pre_compute,
-                                                      int* myrankf) {
+              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer,
+                                              int* phase_is_innerf,
+                                              int* NSOURCESf,
+                                              double* h_stf_pre_compute) {
 
-TRACE("compute_add_sources_ac_s3_cuda");
+  TRACE("compute_add_sources_ac_s3_cuda");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   // check if anything to do
   if( mp->nsources_local == 0 ) return;
 
-  int phase_is_inner = *phase_is_innerf;
   int NSOURCES = *NSOURCESf;
-  int myrank = *myrankf;
+  int phase_is_inner = *phase_is_innerf;
 
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
   // copies source time factors onto GPU
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
                                      NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
 
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(5,5,5);
 
-  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
-                                                        mp->d_ibool,
-                                                        mp->d_ispec_is_inner,
-                                                        phase_is_inner,
-                                                        mp->d_sourcearrays,
-                                                        mp->d_stf_pre_compute,
-                                                        myrank,
-                                                        mp->d_islice_selected_source,
-                                                        mp->d_ispec_selected_source,
-                                                        mp->d_ispec_is_acoustic,
-                                                        mp->d_kappastore,
-                                                        NSOURCES);
+  compute_add_sources_acoustic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                                              mp->d_ibool,
+                                                                              mp->d_ispec_is_inner,
+                                                                              phase_is_inner,
+                                                                              mp->d_sourcearrays,
+                                                                              mp->d_stf_pre_compute,
+                                                                              mp->myrank,
+                                                                              mp->d_islice_selected_source,
+                                                                              mp->d_ispec_selected_source,
+                                                                              mp->d_ispec_is_acoustic,
+                                                                              mp->d_kappastore,
+                                                                              NSOURCES);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_add_sources_ac_s3_cuda");
@@ -273,27 +260,21 @@
                                                int* h_ispec_is_inner,
                                                int* h_ispec_is_acoustic,
                                                int* h_ispec_selected_rec,
-                                               int* myrank,
                                                int* nrec,
                                                int* time_index,
                                                int* h_islice_selected_rec,
                                                int* nadj_rec_local,
                                                int* NTSTEP_BETWEEN_READ_ADJSRC) {
 
-TRACE("add_sources_ac_sim_2_or_3_cuda");
+  TRACE("add_sources_ac_sim_2_or_3_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   // checks
   if( *nadj_rec_local != mp->nadj_rec_local) exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
 
-  // make sure grid dimension is less than 65535 in x dimension
-  int num_blocks_x = mp->nadj_rec_local;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nadj_rec_local,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(5,5,5);
@@ -304,17 +285,18 @@
   int ispec,i,j,k;
   int irec_local = 0;
   for(int irec = 0; irec < *nrec; irec++) {
-    if(*myrank == h_islice_selected_rec[irec]) {
+    if(mp->myrank == h_islice_selected_rec[irec]) {
       irec_local++;
 
       // takes only acoustic sources
-      ispec = h_ispec_selected_rec[irec]-1;
-      if( h_ispec_is_acoustic[ispec] ){
+      ispec = h_ispec_selected_rec[irec] - 1;
 
+      if( h_ispec_is_acoustic[ispec] ){
         if( h_ispec_is_inner[ispec] == *phase_is_inner) {
           for(k=0;k<5;k++) {
             for(j=0;j<5;j++) {
               for(i=0;i<5;i++) {
+
                 mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,irec_local-1)]
                   = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
                                             *NTSTEP_BETWEEN_READ_ADJSRC,
@@ -347,20 +329,20 @@
 
   // copies extracted array values onto GPU
   print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
-                              (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
+                                    (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
 
   // launches cuda kernel for acoustic adjoint sources
-  add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-                                                          *nrec,
-                                                          mp->d_adj_sourcearrays,
-                                                          mp->d_ibool,
-                                                          mp->d_ispec_is_inner,
-                                                          mp->d_ispec_is_acoustic,
-                                                          mp->d_ispec_selected_rec,
-                                                          *phase_is_inner,
-                                                          mp->d_pre_computed_irec,
-                                                          mp->nadj_rec_local,
-                                                          mp->d_kappastore);
+  add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
+                                                                                *nrec,
+                                                                                mp->d_adj_sourcearrays,
+                                                                                mp->d_ibool,
+                                                                                mp->d_ispec_is_inner,
+                                                                                mp->d_ispec_is_acoustic,
+                                                                                mp->d_ispec_selected_rec,
+                                                                                *phase_is_inner,
+                                                                                mp->d_pre_computed_irec,
+                                                                                mp->nadj_rec_local,
+                                                                                mp->d_kappastore);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");

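The wrappers in this file now share one launch pattern: copy the precomputed source-time factors to the device, size the grid with get_blocks_xy(), and launch on mp->compute_stream with the rank read from the Mesh container (mp->myrank) rather than passed in from Fortran. A condensed sketch of that pattern, reusing the declarations from the sources above (no new API is introduced here):

  // sketch only: Mesh, get_blocks_xy, print_CUDA_error_if_any and the kernel
  // are declared in the CUDA sources shown in this revision
  void launch_acoustic_sources_sketch(Mesh* mp, double* h_stf_pre_compute,
                                      int NSOURCES, int phase_is_inner) {
    // nothing to do if this rank holds no sources
    if (mp->nsources_local == 0) return;

    // copies pre-computed source time factors onto the GPU
    print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute, h_stf_pre_compute,
                                       NSOURCES*sizeof(double), cudaMemcpyHostToDevice), 18);

    // one 5x5x5 thread block per source; grid split if NSOURCES > 65535
    int num_blocks_x, num_blocks_y;
    get_blocks_xy(NSOURCES, &num_blocks_x, &num_blocks_y);
    dim3 grid(num_blocks_x, num_blocks_y);
    dim3 threads(5,5,5);

    // asynchronous launch on the dedicated compute stream
    compute_add_sources_acoustic_kernel<<<grid,threads,0,mp->compute_stream>>>(
        mp->d_potential_dot_dot_acoustic, mp->d_ibool, mp->d_ispec_is_inner,
        phase_is_inner, mp->d_sourcearrays, mp->d_stf_pre_compute, mp->myrank,
        mp->d_islice_selected_source, mp->d_ispec_selected_source,
        mp->d_ispec_is_acoustic, mp->d_kappastore, NSOURCES);
  }

Launching on mp->compute_stream lets these kernels overlap with asynchronous transfers issued elsewhere, while the blocking cudaMemcpy above still guarantees the source-time factors are on the device before the kernel runs.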
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_viscoelastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_viscoelastic_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_viscoelastic_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -62,8 +62,8 @@
   int k = threadIdx.z;
 
   int isource  = blockIdx.x + gridDim.x*blockIdx.y; // bx
-  int ispec;
-  int iglob;
+
+  int ispec,iglob;
   realw stf;
 
   if(isource < NSOURCES) { // when NSOURCES > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
@@ -94,51 +94,45 @@
 
 extern "C"
 void FC_FUNC_(compute_add_sources_el_cuda,
-              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
-                                            int* phase_is_innerf,
-                                            int* NSOURCESf,
-                                            double* h_stf_pre_compute,
-                                            int* myrankf) {
+              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer,
+                                           double* h_stf_pre_compute,
+                                           int* h_NSOURCES,
+                                           int* h_phase_is_inner) {
 
-TRACE("compute_add_sources_el_cuda");
+  TRACE("\tcompute_add_sources_el_cuda");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   // check if anything to do
   if( mp->nsources_local == 0 ) return;
 
-  int phase_is_inner = *phase_is_innerf;
-  int NSOURCES = *NSOURCESf;
-  int myrank = *myrankf;
+  int NSOURCES = *h_NSOURCES;
+  int phase_is_inner = *h_phase_is_inner;
 
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  //double* d_stf_pre_compute;
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
                                      NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
 
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_el_cuda copy");
+#endif
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(5,5,5);
 
-  compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
-                                               mp->d_ibool,
-                                               mp->d_ispec_is_inner,
-                                               phase_is_inner,
-                                               mp->d_sourcearrays,
-                                               mp->d_stf_pre_compute,
-                                               myrank,
-                                               mp->d_islice_selected_source,
-                                               mp->d_ispec_selected_source,
-                                               mp->d_ispec_is_elastic,
-                                               NSOURCES);
+  compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_ibool,
+                                                                    mp->d_ispec_is_inner,phase_is_inner,
+                                                                    mp->d_sourcearrays,
+                                                                    mp->d_stf_pre_compute,
+                                                                    mp->myrank,
+                                                                    mp->d_islice_selected_source,mp->d_ispec_selected_source,
+                                                                    mp->d_ispec_is_elastic,
+                                                                    NSOURCES);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_kernel");
+  exit_on_cuda_error("compute_add_sources_el_cuda");
 #endif
 }
 
@@ -148,41 +142,38 @@
 void FC_FUNC_(compute_add_sources_el_s3_cuda,
               COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
                                               double* h_stf_pre_compute,
-                                              int* NSOURCESf,
-                                              int* phase_is_inner,
-                                              int* myrank) {
-  TRACE("compute_add_sources_el_s3_cuda");
+                                              int* h_NSOURCES,
+                                              int* h_phase_is_inner) {
+
+  TRACE("\tcompute_add_sources_el_s3_cuda");
   // EPIK_TRACER("compute_add_sources_el_s3_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  int NSOURCES = *NSOURCESf;
+  int NSOURCES = *h_NSOURCES;
+  int phase_is_inner = *h_phase_is_inner;
 
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
                                      NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+  exit_on_cuda_error("compute_add_sources_el_s3_cuda copy");
 #endif
 
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(5,5,5);
 
   compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_ibool,
-                                               mp->d_ispec_is_inner, *phase_is_inner,
-                                               mp->d_sourcearrays,
-                                               mp->d_stf_pre_compute,
-                                               *myrank,
-                                               mp->d_islice_selected_source,mp->d_ispec_selected_source,
-                                               mp->d_ispec_is_elastic,
-                                               NSOURCES);
+                                                                    mp->d_ispec_is_inner, phase_is_inner,
+                                                                    mp->d_sourcearrays,
+                                                                    mp->d_stf_pre_compute,
+                                                                    mp->myrank,
+                                                                    mp->d_islice_selected_source,mp->d_ispec_selected_source,
+                                                                    mp->d_ispec_is_elastic,
+                                                                    NSOURCES);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_add_sources_el_s3_cuda");
@@ -219,30 +210,28 @@
 
 extern "C"
 void FC_FUNC_(add_source_master_rec_noise_cu,
-              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
-                                                int* myrank_f,
-                                                int* it_f,
-                                                int* irec_master_noise_f,
-                                                int* islice_selected_rec) {
+              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer,
+                                              int* it_f,
+                                              int* irec_master_noise_f,
+                                              int* islice_selected_rec) {
 
-TRACE("add_source_master_rec_noise_cu");
+TRACE("\tadd_source_master_rec_noise_cu");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   int it = *it_f-1; // -1 for Fortran -> C indexing differences
   int irec_master_noise = *irec_master_noise_f;
-  int myrank = *myrank_f;
 
   dim3 grid(1,1,1);
   dim3 threads(NGLL3,1,1);
 
-  if(myrank == islice_selected_rec[irec_master_noise-1]) {
+  if(mp->myrank == islice_selected_rec[irec_master_noise-1]) {
     add_source_master_rec_noise_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool,
-                                                              mp->d_ispec_selected_rec,
-                                                              irec_master_noise,
-                                                              mp->d_accel,
-                                                              mp->d_noise_sourcearray,
-                                                              it);
+                                                                                    mp->d_ispec_selected_rec,
+                                                                                    irec_master_noise,
+                                                                                    mp->d_accel,
+                                                                                    mp->d_noise_sourcearray,
+                                                                                    it);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("add_source_master_rec_noise_cuda_kernel");
@@ -313,27 +302,21 @@
                                                int* h_ispec_is_inner,
                                                int* h_ispec_is_elastic,
                                                int* h_ispec_selected_rec,
-                                               int* myrank,
                                                int* nrec,
                                                int* time_index,
                                                int* h_islice_selected_rec,
                                                int* nadj_rec_local,
                                                int* NTSTEP_BETWEEN_READ_ADJSRC) {
 
-TRACE("add_sources_el_sim_type_2_or_3");
+  TRACE("\tadd_sources_el_sim_type_2_or_3");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   // checks
   if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n");
 
-  // make sure grid dimension is less than 65535 in x dimension
-  int num_blocks_x = mp->nadj_rec_local;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nadj_rec_local,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(5,5,5);
@@ -344,11 +327,12 @@
   int ispec,i,j,k;
   int irec_local = 0;
   for(int irec = 0; irec < *nrec; irec++) {
-    if(*myrank == h_islice_selected_rec[irec]) {
+    if(mp->myrank == h_islice_selected_rec[irec]) {
       irec_local++;
 
       // takes only elastic sources
-      ispec = h_ispec_selected_rec[irec]-1;
+      ispec = h_ispec_selected_rec[irec] - 1;
+
       if( h_ispec_is_elastic[ispec] ){
 
         if( h_ispec_is_inner[ispec] == *phase_is_inner) {
@@ -356,34 +340,25 @@
             for(j=0;j<5;j++) {
               for(i=0;i<5;i++) {
 
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                i,j,k,0,
-                                                irec_local-1)]
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,irec_local-1)]
                         = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
                                                     *NTSTEP_BETWEEN_READ_ADJSRC,
                                                     3,5,5,
-                                                    irec_local-1,
-                                                    *time_index-1,
+                                                    irec_local-1,(*time_index)-1,
                                                     0,i,j,k)];
 
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                i,j,k,1,
-                                                irec_local-1)]
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,irec_local-1)]
                         = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
                                                     *NTSTEP_BETWEEN_READ_ADJSRC,
                                                     3,5,5,
-                                                    irec_local-1,
-                                                    *time_index-1,
+                                                    irec_local-1,(*time_index)-1,
                                                     1,i,j,k)];
 
-                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
-                                                i,j,k,2,
-                                                irec_local-1)]
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,irec_local-1)]
                         = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
                                                     *NTSTEP_BETWEEN_READ_ADJSRC,
                                                     3,5,5,
-                                                    irec_local-1,
-                                                    *time_index-1,
+                                                    irec_local-1,(*time_index)-1,
                                                     2,i,j,k)];
               }
             }
@@ -396,8 +371,8 @@
   if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
 
   // copies extracted array values onto GPU
-  cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
-             (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+                                    (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),98001);
 
 
   // the irec_local variable needs to be precomputed (as
@@ -405,15 +380,15 @@
   // and due to how it's incremented, it cannot be parallelized
 
   add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
-                                                         *nrec,
-                                                         mp->d_adj_sourcearrays,
-                                                         mp->d_ibool,
-                                                         mp->d_ispec_is_inner,
-                                                         mp->d_ispec_is_elastic,
-                                                         mp->d_ispec_selected_rec,
-                                                         *phase_is_inner,
-                                                         mp->d_pre_computed_irec,
-                                                         mp->nadj_rec_local);
+                                                                               *nrec,
+                                                                               mp->d_adj_sourcearrays,
+                                                                               mp->d_ibool,
+                                                                               mp->d_ispec_is_inner,
+                                                                               mp->d_ispec_is_elastic,
+                                                                               mp->d_ispec_selected_rec,
+                                                                               *phase_is_inner,
+                                                                               mp->d_pre_computed_irec,
+                                                                               mp->nadj_rec_local);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");

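Note on a pattern recurring throughout this revision: the inlined loops that split a one-dimensional block count into an (x,y) grid, so that the x dimension stays below the 65535 hardware limit, are replaced everywhere by a call to get_blocks_xy(). The sketch below simply mirrors the removed loops and is shown for reference only; the actual helper is defined in mesh_constants_cuda.h (also modified by this revision) and may differ in detail.

    #include <math.h>

    // sketch only: splits num_blocks into a 2D grid (x,y), keeping the x
    // dimension below the 65535 grid-size limit, equivalent to the loops
    // removed in this commit; the real definition lives in mesh_constants_cuda.h
    void get_blocks_xy(int num_blocks, int* num_blocks_x, int* num_blocks_y) {
      int x = num_blocks;
      int y = 1;
      while (x > 65535) {
        x = (int) ceil(x * 0.5f);
        y = y * 2;
      }
      *num_blocks_x = x;
      *num_blocks_y = y;
    }

Kernels launched with such a grid recover the linear block index as blockIdx.x + gridDim.x*blockIdx.y, as the coupling and acoustic kernels below already do.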
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_coupling_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_coupling_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -115,25 +115,22 @@
 
 extern "C"
 void FC_FUNC_(compute_coupling_ac_el_cuda,
-              COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f,
+              COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer,
                                            int* phase_is_innerf,
                                            int* num_coupling_ac_el_facesf) {
   TRACE("compute_coupling_ac_el_cuda");
   //double start_time = get_time();
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
   int phase_is_inner            = *phase_is_innerf;
   int num_coupling_ac_el_faces  = *num_coupling_ac_el_facesf;
 
   // way 1: exact blocksize to match NGLLSQUARE
   int blocksize = NGLL2;
-  int num_blocks_x = num_coupling_ac_el_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
 
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(num_coupling_ac_el_faces,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -275,25 +272,21 @@
 
 extern "C"
 void FC_FUNC_(compute_coupling_el_ac_cuda,
-              COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f,
+              COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer,
                                            int* phase_is_innerf,
                                            int* num_coupling_ac_el_facesf) {
   TRACE("compute_coupling_el_ac_cuda");
   //double start_time = get_time();
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
   int phase_is_inner            = *phase_is_innerf;
   int num_coupling_ac_el_faces  = *num_coupling_ac_el_facesf;
 
   // way 1: exact blocksize to match NGLLSQUARE
   int blocksize = 25;
 
-  int num_blocks_x = num_coupling_ac_el_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(num_coupling_ac_el_faces,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
@@ -359,6 +352,7 @@
   // gets spectral element face id
   int igll = threadIdx.x;  // threadIdx.y*blockDim.x will always be 0 for thread block (25,1,1)
   int iface = blockIdx.x + gridDim.x*blockIdx.y;
+
   realw nx,ny,nz;
   realw force_normal_comp;
 
@@ -408,11 +402,11 @@
 
 extern "C"
 void FC_FUNC_(compute_coupling_ocean_cuda,
-              COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f) {
+              COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer) {
 
-  TRACE("compute_coupling_ocean_cuda");
+  TRACE("\tcompute_coupling_ocean_cuda");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   // checks if anything to do
   if( mp->num_free_surface_faces == 0 ) return;
@@ -420,12 +414,8 @@
   // block sizes: exact blocksize to match NGLLSQUARE
   int blocksize = NGLL2;
 
-  int num_blocks_x = mp->num_free_surface_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->num_free_surface_faces,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
@@ -440,14 +430,14 @@
 #endif
 
   compute_coupling_ocean_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
-                                                   mp->d_rmassx,mp->d_rmassy,mp->d_rmassz,
-                                                   mp->d_rmass_ocean_load,
-                                                   mp->num_free_surface_faces,
-                                                   mp->d_free_surface_ispec,
-                                                   mp->d_free_surface_ijk,
-                                                   mp->d_free_surface_normal,
-                                                   mp->d_ibool,
-                                                   mp->d_updated_dof_ocean_load);
+                                                                           mp->d_rmassx,mp->d_rmassy,mp->d_rmassz,
+                                                                           mp->d_rmass_ocean_load,
+                                                                           mp->num_free_surface_faces,
+                                                                           mp->d_free_surface_ispec,
+                                                                           mp->d_free_surface_ijk,
+                                                                           mp->d_free_surface_normal,
+                                                                           mp->d_ibool,
+                                                                           mp->d_updated_dof_ocean_load);
   // for backward/reconstructed potentials
   if(mp->simulation_type == 3) {
     // re-initializes array
@@ -455,14 +445,14 @@
                                        sizeof(int)*mp->NGLOB_AB),88502);
 
     compute_coupling_ocean_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,
-                                                     mp->d_rmassx,mp->d_rmassy,mp->d_rmassz,
-                                                     mp->d_rmass_ocean_load,
-                                                     mp->num_free_surface_faces,
-                                                     mp->d_free_surface_ispec,
-                                                     mp->d_free_surface_ijk,
-                                                     mp->d_free_surface_normal,
-                                                     mp->d_ibool,
-                                                     mp->d_updated_dof_ocean_load);
+                                                                             mp->d_rmassx,mp->d_rmassy,mp->d_rmassz,
+                                                                             mp->d_rmass_ocean_load,
+                                                                             mp->num_free_surface_faces,
+                                                                             mp->d_free_surface_ispec,
+                                                                             mp->d_free_surface_ijk,
+                                                                             mp->d_free_surface_normal,
+                                                                             mp->d_ibool,
+                                                                             mp->d_updated_dof_ocean_load);
 
   }
 

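A second recurring change: kernel launches now pass mp->compute_stream as the fourth launch-configuration argument, and device-to-host copies of MPI buffers are preceded by cudaStreamSynchronize() on that stream (see transfer_boun_pot_from_device below). The following standalone sketch of this standard CUDA pattern uses illustrative names only; the project's stream itself is presumably created during GPU setup in prepare_mesh_constants_cuda.cu, which is not shown here.

    #include <stdio.h>
    #include <cuda_runtime.h>

    // trivial kernel, used only to illustrate the launch syntax
    __global__ void scale_kernel(float* d_field, int n, float fac) {
      int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
      if (i < n) d_field[i] *= fac;
    }

    int main() {
      const int n = 1024;
      float h_field[n];
      for (int i = 0; i < n; i++) h_field[i] = 1.0f;

      float* d_field;
      cudaMalloc(&d_field, n * sizeof(float));
      cudaMemcpy(d_field, h_field, n * sizeof(float), cudaMemcpyHostToDevice);

      // dedicated compute stream, analogous to mp->compute_stream
      cudaStream_t compute_stream;
      cudaStreamCreate(&compute_stream);

      dim3 grid(n / 256, 1);
      dim3 threads(256, 1, 1);
      // fourth launch parameter selects the stream; the third is the
      // dynamic shared-memory size (none needed here)
      scale_kernel<<<grid, threads, 0, compute_stream>>>(d_field, n, 2.0f);

      // wait for the asynchronous kernel before copying results back,
      // mirroring the cudaStreamSynchronize() calls added before cudaMemcpy
      cudaStreamSynchronize(compute_stream);
      cudaMemcpy(h_field, d_field, n * sizeof(float), cudaMemcpyDeviceToHost);

      printf("field[0] = %f\n", h_field[0]);

      cudaStreamDestroy(compute_stream);
      cudaFree(d_field);
      return 0;
    }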
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -36,7 +36,37 @@
 #include "config.h"
 #include "mesh_constants_cuda.h"
 
+#ifdef USE_TEXTURES_FIELDS
+realw_texture d_potential_tex;
+realw_texture d_potential_dot_dot_tex;
+//backward/reconstructed
+realw_texture d_b_potential_tex;
+realw_texture d_b_potential_dot_dot_tex;
 
+//note: texture variables are implicitly static and cannot be passed as arguments to cuda kernels;
+//      thus, either 1) we use if-statements (on FORWARD_OR_ADJOINT) to determine which texture to fetch from,
+//            or     2) we use templates
+//      since if-statements are a bit slower (the flag is only known at runtime), we use option 2)
+
+// templates definitions
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_potential(int x);
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_potential_dot_dot(int x);
+
+// templates for texture fetching
+// FORWARD_OR_ADJOINT == 1 <- forward arrays
+template<> __device__ float texfetch_potential<1>(int x) { return tex1Dfetch(d_potential_tex, x); }
+template<> __device__ float texfetch_potential_dot_dot<1>(int x) { return tex1Dfetch(d_potential_dot_dot_tex, x); }
+// FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays
+template<> __device__ float texfetch_potential<3>(int x) { return tex1Dfetch(d_b_potential_tex, x); }
+template<> __device__ float texfetch_potential_dot_dot<3>(int x) { return tex1Dfetch(d_b_potential_dot_dot_tex, x); }
+
+#endif
+
+#ifdef USE_TEXTURES_CONSTANTS
+extern realw_texture d_hprime_xx_tex;
+#endif
+
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // prepares a device array with all inter-element edge-nodes -- this
@@ -49,12 +79,17 @@
                                                      int* d_ibool_interfaces_ext_mesh) {
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-  int iinterface=0;
+  int ientry,iglob;
 
-  for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+  for(int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
     if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
-      d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces_ext_mesh*iinterface)] =
-        d_potential_dot_dot_acoustic[(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)];
+
+      // entry in interface array
+      ientry = id + max_nibool_interfaces_ext_mesh*iinterface;
+      // global index in wavefield
+      iglob = d_ibool_interfaces_ext_mesh[ientry] - 1;
+
+      d_send_potential_dot_dot_buffer[ientry] = d_potential_dot_dot_acoustic[iglob];
     }
   }
 
@@ -66,58 +101,68 @@
 // prepares and transfers the inter-element edge-nodes to the host to be MPI'd
 extern "C"
 void FC_FUNC_(transfer_boun_pot_from_device,
-              TRANSFER_BOUN_POT_FROM_DEVICE)(
-                                              int* size,
-                                              long* Mesh_pointer_f,
-                                              realw* potential_dot_dot_acoustic,
-                                              realw* send_potential_dot_dot_buffer,
-                                              int* num_interfaces_ext_mesh,
-                                              int* max_nibool_interfaces_ext_mesh,
-                                              int* nibool_interfaces_ext_mesh,
-                                              int* ibool_interfaces_ext_mesh,
-                                              int* FORWARD_OR_ADJOINT){
+              TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer,
+                                             realw* potential_dot_dot_acoustic,
+                                             realw* send_potential_dot_dot_buffer,
+                                             int* FORWARD_OR_ADJOINT){
 
 TRACE("transfer_boun_pot_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  if( *num_interfaces_ext_mesh == 0 ) return;
+  // checks if anything to do
+  if( mp->size_mpi_buffer_potential > 0 ){
 
-  int blocksize = BLOCKSIZE_TRANSFER;
-  int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_ext_mesh))/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+    int blocksize = BLOCKSIZE_TRANSFER;
+    int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_ext_mesh))/((double)blocksize)))*blocksize;
 
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
+    int num_blocks_x, num_blocks_y;
+    get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
-  if(*FORWARD_OR_ADJOINT == 1) {
-    prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-                                                           mp->d_send_potential_dot_dot_buffer,
-                                                           mp->num_interfaces_ext_mesh,
-                                                           mp->max_nibool_interfaces_ext_mesh,
-                                                           mp->d_nibool_interfaces_ext_mesh,
-                                                           mp->d_ibool_interfaces_ext_mesh);
+    dim3 grid(num_blocks_x,num_blocks_y);
+    dim3 threads(blocksize,1,1);
+
+    if(*FORWARD_OR_ADJOINT == 1) {
+      prepare_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
+                                                                                   mp->d_send_potential_dot_dot_buffer,
+                                                                                   mp->num_interfaces_ext_mesh,
+                                                                                   mp->max_nibool_interfaces_ext_mesh,
+                                                                                   mp->d_nibool_interfaces_ext_mesh,
+                                                                                   mp->d_ibool_interfaces_ext_mesh);
+
+      // synchronizes
+      //synchronize_cuda();
+      // explicitly waits until previous compute stream finishes
+      // (cudaMemcpy implicitly synchronizes all other cuda operations)
+      cudaStreamSynchronize(mp->compute_stream);
+
+      print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_potential_dot_dot_buffer,
+                                         mp->size_mpi_buffer_potential*sizeof(realw),cudaMemcpyDeviceToHost),98000);
+    }
+    else if(*FORWARD_OR_ADJOINT == 3) {
+      // backward/reconstructed wavefield buffer
+      prepare_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                                                   mp->d_b_send_potential_dot_dot_buffer,
+                                                                                   mp->num_interfaces_ext_mesh,
+                                                                                   mp->max_nibool_interfaces_ext_mesh,
+                                                                                   mp->d_nibool_interfaces_ext_mesh,
+                                                                                   mp->d_ibool_interfaces_ext_mesh);
+
+      // synchronizes
+      //synchronize_cuda();
+      // explicitly waits until previous compute stream finishes
+      // (cudaMemcpy implicitly synchronizes all other cuda operations)
+      cudaStreamSynchronize(mp->compute_stream);
+
+      print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_b_send_potential_dot_dot_buffer,
+                                         mp->size_mpi_buffer_potential*sizeof(realw),cudaMemcpyDeviceToHost),98000);
+    }
   }
-  else if(*FORWARD_OR_ADJOINT == 3) {
-    prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
-                                                           mp->d_send_potential_dot_dot_buffer,
-                                                           mp->num_interfaces_ext_mesh,
-                                                           mp->max_nibool_interfaces_ext_mesh,
-                                                           mp->d_nibool_interfaces_ext_mesh,
-                                                           mp->d_ibool_interfaces_ext_mesh);
-  }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after prepare_boundary_potential_on_device");
 #endif
 
-  print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_potential_dot_dot_buffer,
-      (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),cudaMemcpyDeviceToHost),98000);
 
   // finish timing of kernel+memcpy
   // cudaEventRecord( stop, 0 );
@@ -143,17 +188,21 @@
                                                       int* d_ibool_interfaces_ext_mesh) {
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-  int iinterface=0;
+  int ientry,iglob;
 
-  for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+  for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
     if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
 
+      // entry in interface array
+      ientry = id + max_nibool_interfaces_ext_mesh*iinterface;
+      // global index in wavefield
+      iglob = d_ibool_interfaces_ext_mesh[ientry] - 1;
+
      // for testing atomic operations against non-atomic operations (0.1 ms vs. 0.04 ms)
       // d_potential_dot_dot_acoustic[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)] +=
       // d_send_potential_dot_dot_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)];
 
-      atomicAdd(&d_potential_dot_dot_acoustic[(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)],
-                d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces_ext_mesh*iinterface)]);
+      atomicAdd(&d_potential_dot_dot_acoustic[iglob],d_send_potential_dot_dot_buffer[ientry]);
     }
   }
   // ! This step is done via previous function transfer_and_assemble...
@@ -170,19 +219,15 @@
 
 extern "C"
 void FC_FUNC_(transfer_asmbl_pot_to_device,
-              TRANSFER_ASMBL_POT_TO_DEVICE)(
-                                                long* Mesh_pointer,
-                                                realw* potential_dot_dot_acoustic,
-                                                realw* buffer_recv_scalar_ext_mesh,
-                                                int* num_interfaces_ext_mesh,
-                                                int* max_nibool_interfaces_ext_mesh,
-                                                int* nibool_interfaces_ext_mesh,
-                                                int* ibool_interfaces_ext_mesh,
-                                                int* FORWARD_OR_ADJOINT) {
+              TRANSFER_ASMBL_POT_TO_DEVICE)(long* Mesh_pointer,
+                                            realw* potential_dot_dot_acoustic,
+                                            realw* buffer_recv_scalar_ext_mesh,
+                                            int* FORWARD_OR_ADJOINT) {
 
 TRACE("transfer_asmbl_pot_to_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
   //double start_time = get_time();
   // cudaEvent_t start, stop;
   // realw time;
@@ -190,43 +235,49 @@
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
 
-  if( *num_interfaces_ext_mesh == 0 ) return;
+  // checks if anything to do
+  if( mp->size_mpi_buffer_potential > 0 ){
 
-  // copies buffer onto GPU
-  cudaMemcpy(mp->d_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,
-             (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw), cudaMemcpyHostToDevice);
+    // assembles on GPU
+    int blocksize = BLOCKSIZE_TRANSFER;
+    int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
 
-  // assembles on GPU
-  int blocksize = BLOCKSIZE_TRANSFER;
-  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+    int num_blocks_x, num_blocks_y;
+    get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
+    dim3 grid(num_blocks_x,num_blocks_y);
+    dim3 threads(blocksize,1,1);
 
-  if(*FORWARD_OR_ADJOINT == 1) {
-    //assemble forward field
-    assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-                                                          mp->d_send_potential_dot_dot_buffer,
-                                                          mp->num_interfaces_ext_mesh,
-                                                          mp->max_nibool_interfaces_ext_mesh,
-                                                          mp->d_nibool_interfaces_ext_mesh,
-                                                          mp->d_ibool_interfaces_ext_mesh);
+    // synchronizes
+    synchronize_cuda();
+
+    if(*FORWARD_OR_ADJOINT == 1) {
+      // copies buffer onto GPU
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,
+                                         mp->size_mpi_buffer_potential*sizeof(realw), cudaMemcpyHostToDevice),98010);
+
+      //assemble forward field
+      assemble_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
+                                                                                    mp->d_send_potential_dot_dot_buffer,
+                                                                                    mp->num_interfaces_ext_mesh,
+                                                                                    mp->max_nibool_interfaces_ext_mesh,
+                                                                                    mp->d_nibool_interfaces_ext_mesh,
+                                                                                    mp->d_ibool_interfaces_ext_mesh);
+    }
+    else if(*FORWARD_OR_ADJOINT == 3) {
+      // copies buffer onto GPU
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,
+                                         mp->size_mpi_buffer_potential*sizeof(realw), cudaMemcpyHostToDevice),98011);
+
+      //assemble reconstructed/backward field
+      assemble_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                                                    mp->d_b_send_potential_dot_dot_buffer,
+                                                                                    mp->num_interfaces_ext_mesh,
+                                                                                    mp->max_nibool_interfaces_ext_mesh,
+                                                                                    mp->d_nibool_interfaces_ext_mesh,
+                                                                                    mp->d_ibool_interfaces_ext_mesh);
+    }
   }
-  else if(*FORWARD_OR_ADJOINT == 3) {
-    //assemble reconstructed/backward field
-    assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
-                                                            mp->d_send_potential_dot_dot_buffer,
-                                                            mp->num_interfaces_ext_mesh,
-                                                            mp->max_nibool_interfaces_ext_mesh,
-                                                            mp->d_nibool_interfaces_ext_mesh,
-                                                            mp->d_ibool_interfaces_ext_mesh);
-  }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   // cudaEventRecord( stop, 0 );
@@ -244,34 +295,32 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-/* KERNEL 2 */
+// KERNEL 2 - acoustic compute forces kernel
 
 /* ----------------------------------------------------------------------------------------------- */
 
+template<int FORWARD_OR_ADJOINT> __global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,
+                                                                         int NGLOB, int* d_ibool,
+                                                                         int* d_phase_ispec_inner_acoustic,
+                                                                         int num_phase_ispec_acoustic,
+                                                                         int d_iphase,
+                                                                         int use_mesh_coloring_gpu,
+                                                                         realw* d_potential_acoustic, realw* d_potential_dot_dot_acoustic,
+                                                                         realw* d_xix, realw* d_xiy, realw* d_xiz,
+                                                                         realw* d_etax, realw* d_etay, realw* d_etaz,
+                                                                         realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                                                                         realw* d_hprime_xx,
+                                                                         realw* hprimewgll_xx,
+                                                                         realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
+                                                                         realw* d_rhostore,
+                                                                         int gravity,
+                                                                         realw* minus_g,
+                                                                         realw* d_kappastore,
+                                                                         realw* wgll_cube){
 
-__global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,
-                                       int NGLOB, int* d_ibool,
-                                       int* d_phase_ispec_inner_acoustic,
-                                       int num_phase_ispec_acoustic,
-                                       int d_iphase,
-                                       int use_mesh_coloring_gpu,
-                                       realw* d_potential_acoustic, realw* d_potential_dot_dot_acoustic,
-                                       realw* d_xix, realw* d_xiy, realw* d_xiz,
-                                       realw* d_etax, realw* d_etay, realw* d_etaz,
-                                       realw* d_gammax, realw* d_gammay, realw* d_gammaz,
-                                       realw* d_hprime_xx,
-                                       realw* hprimewgll_xx,
-                                       realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
-                                       realw* d_rhostore,
-                                       int gravity,
-                                       realw* minus_g,
-                                       realw* d_kappastore,
-                                       realw* wgll_cube){
-
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
   int tx = threadIdx.x;
 
-  //const int NGLL3 = NGLL3;
   const int NGLL3_ALIGN = NGLL3_PADDED;
 
   int K = (tx/NGLL2);
@@ -281,6 +330,7 @@
   int active,offset;
   int iglob = 0;
   int working_element;
+
   realw temp1l,temp2l,temp3l;
   realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
   realw dpotentialdxl,dpotentialdyl,dpotentialdzl;
@@ -301,6 +351,8 @@
   __shared__ realw s_temp2[NGLL3];
   __shared__ realw s_temp3[NGLL3];
 
+  __shared__ realw sh_hprime_xx[NGLL2];
+
 // use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
// because we used memory padding from NGLL^3 = 125 to 128 to get coalesced memory accesses
   active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
@@ -321,24 +373,29 @@
     }
 #endif
 
-    // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
     iglob = d_ibool[working_element*NGLL3 + tx]-1;
 
-#ifdef USE_TEXTURES
-    s_dummy_loc[tx] = tex1Dfetch(tex_potential_acoustic, iglob);
+#ifdef USE_TEXTURES_FIELDS
+    s_dummy_loc[tx] = texfetch_potential<FORWARD_OR_ADJOINT>(iglob);
 #else
     // changing iglob indexing to match fortran row changes fast style
     s_dummy_loc[tx] = d_potential_acoustic[iglob];
 #endif
   }
 
+  if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+    sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx);
+#else
+    sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+  }
+
 // synchronize all the threads (one thread for each of the NGLL grid points of the
 // current spectral element) because we need the whole element to be ready in order
 // to be able to compute the matrix products along cut planes of the 3D element below
   __syncthreads();
 
-#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-
   if (active) {
 
 #ifndef MANUALLY_UNROLLED_LOOPS
@@ -348,16 +405,16 @@
     temp3l = 0.f;
 
     for (l=0;l<NGLLX;l++) {
-        hp1 = d_hprime_xx[l*NGLLX+I];
+        hp1 = sh_hprime_xx[l*NGLLX+I];
         offset1 = K*NGLL2+J*NGLLX+l;
         temp1l += s_dummy_loc[offset1]*hp1;
 
         //assumes that hprime_xx = hprime_yy = hprime_zz
-        hp2 = d_hprime_xx[l*NGLLX+J];
+        hp2 = sh_hprime_xx[l*NGLLX+J];
         offset2 = K*NGLL2+l*NGLLX+I;
         temp2l += s_dummy_loc[offset2]*hp2;
 
-        hp3 = d_hprime_xx[l*NGLLX+K];
+        hp3 = sh_hprime_xx[l*NGLLX+K];
         offset3 = l*NGLL2+J*NGLLX+I;
         temp3l += s_dummy_loc[offset3]*hp3;
     }
@@ -504,34 +561,38 @@
 
     iglob = d_ibool[working_element*NGLL3 + tx]-1;
 
-#ifdef USE_TEXTURES
-    d_potential_dot_dot_acoustic[iglob] = tex1Dfetch(tex_potential_dot_dot_acoustic, iglob)
-                                            + sum_terms;
-#else
-
 #ifdef USE_MESH_COLORING_GPU
     // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+    d_potential_dot_dot_acoustic[iglob] = texfetch_potential_dot_dot<FORWARD_OR_ADJOINT>(iglob) + sum_terms;
+#else
     d_potential_dot_dot_acoustic[iglob] += sum_terms;
-#else
+#endif // USE_TEXTURES_FIELDS
+
+
+#else  // MESH_COLORING
+
     //mesh coloring
     if( use_mesh_coloring_gpu ){
-
       // no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+      d_potential_dot_dot_acoustic[iglob] = texfetch_potential_dot_dot<FORWARD_OR_ADJOINT>(iglob) + sum_terms;
+#else
       d_potential_dot_dot_acoustic[iglob] += sum_terms;
+#endif // USE_TEXTURES_FIELDS
 
     }else{
 
+      // for testing purposes only: without atomic updates
+      //d_potential_dot_dot_acoustic[iglob] += sum_terms;
+
       atomicAdd(&d_potential_dot_dot_acoustic[iglob],sum_terms);
 
     }
-#endif
+#endif // MESH_COLORING
 
-#endif
   }
-
-#else  // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-  d_potential_dot_dot_acoustic[iglob] = 123.123f;
-#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
 }
 
 
@@ -539,18 +600,11 @@
 
 void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
                        int* d_ibool,
-                       realw* d_xix,
-                       realw* d_xiy,
-                       realw* d_xiz,
-                       realw* d_etax,
-                       realw* d_etay,
-                       realw* d_etaz,
-                       realw* d_gammax,
-                       realw* d_gammay,
-                       realw* d_gammaz,
+                       realw* d_xix,realw* d_xiy,realw* d_xiz,
+                       realw* d_etax,realw* d_etay,realw* d_etaz,
+                       realw* d_gammax,realw* d_gammay,realw* d_gammaz,
                        realw* d_rhostore,
-                       realw* d_kappastore)
-{
+                       realw* d_kappastore){
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("before acoustic kernel Kernel 2");
@@ -560,17 +614,15 @@
   /* grid_2_x = nb_elem_color; */
   /* nb_elem_color is just how many blocks we are computing now */
 
-  int num_blocks_x = nb_blocks_to_compute;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
 
-  int threads_2 = NGLL3_PADDED;//BLOCK_SIZE_K2;
-  dim3 grid_2(num_blocks_x,num_blocks_y);
+  int blocksize = NGLL3_PADDED;
 
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(nb_blocks_to_compute,&num_blocks_x,&num_blocks_y);
 
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
   // Cuda timing
   // cudaEvent_t start, stop;
   // realw time;
@@ -578,28 +630,30 @@
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
 
-  Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
-                                                        mp->NGLOB_AB,
-                                                        d_ibool,
-                                                        mp->d_phase_ispec_inner_acoustic,
-                                                        mp->num_phase_ispec_acoustic,
-                                                        d_iphase,
-                                                        mp->use_mesh_coloring_gpu,
-                                                        mp->d_potential_acoustic, mp->d_potential_dot_dot_acoustic,
-                                                        d_xix, d_xiy, d_xiz,
-                                                        d_etax, d_etay, d_etaz,
-                                                        d_gammax, d_gammay, d_gammaz,
-                                                        mp->d_hprime_xx,
-                                                        mp->d_hprimewgll_xx,
-                                                        mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
-                                                        d_rhostore,
-                                                        mp->gravity,
-                                                        mp->d_minus_g,
-                                                        d_kappastore,
-                                                        mp->d_wgll_cube);
+  // forward wavefields -> FORWARD_OR_ADJOINT == 1
+  Kernel_2_acoustic_impl<1><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+                                                            mp->NGLOB_AB,
+                                                            d_ibool,
+                                                            mp->d_phase_ispec_inner_acoustic,
+                                                            mp->num_phase_ispec_acoustic,
+                                                            d_iphase,
+                                                            mp->use_mesh_coloring_gpu,
+                                                            mp->d_potential_acoustic, mp->d_potential_dot_dot_acoustic,
+                                                            d_xix, d_xiy, d_xiz,
+                                                            d_etax, d_etay, d_etaz,
+                                                            d_gammax, d_gammay, d_gammaz,
+                                                            mp->d_hprime_xx,
+                                                            mp->d_hprimewgll_xx,
+                                                            mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                            d_rhostore,
+                                                            mp->gravity,
+                                                            mp->d_minus_g,
+                                                            d_kappastore,
+                                                            mp->d_wgll_cube);
 
   if(mp->simulation_type == 3) {
-    Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+    // backward/reconstructed wavefields -> FORWARD_OR_ADJOINT == 3
+    Kernel_2_acoustic_impl<3><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
                                                           mp->NGLOB_AB,
                                                           d_ibool,
                                                           mp->d_phase_ispec_inner_acoustic,
@@ -627,8 +681,8 @@
   // cudaEventDestroy( stop );
   // printf("Kernel2 Execution Time: %f ms\n",time);
 
-  /* cudaThreadSynchronize(); */
-  /* TRACE("Kernel 2 finished"); */
+  // cudaThreadSynchronize(); //
+  // TRACE("Kernel 2 finished"); //
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //printf("Tried to start with %dx1 blocks\n",nb_blocks_to_compute);
   exit_on_cuda_error("kernel Kernel_2");
@@ -643,7 +697,7 @@
 
 extern "C"
 void FC_FUNC_(compute_forces_acoustic_cuda,
-              COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+              COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer,
                                             int* iphase,
                                             int* nspec_outer_acoustic,
                                             int* nspec_inner_acoustic) {
@@ -651,7 +705,7 @@
   TRACE("compute_forces_acoustic_cuda");
   //double start_time = get_time();
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
   int num_elements;
 
@@ -671,7 +725,7 @@
 
     int nb_colors,nb_blocks_to_compute;
     int istart;
-    int color_offset,color_offset_nonpadded;
+    int offset,offset_nonpadded;
 
     // sets up color loop
     if( *iphase == 1 ){
@@ -680,16 +734,16 @@
       istart = 0;
 
       // array offsets (acoustic elements start after elastic ones)
-      color_offset = mp->nspec_elastic * NGLL3_PADDED;
-      color_offset_nonpadded = mp->nspec_elastic * NGLL3;
+      offset = mp->nspec_elastic * NGLL3_PADDED;
+      offset_nonpadded = mp->nspec_elastic * NGLL3;
     }else{
       // inner element colors (start after outer elements)
       nb_colors = mp->num_colors_outer_acoustic + mp->num_colors_inner_acoustic;
       istart = mp->num_colors_outer_acoustic;
 
       // array offsets (inner elements start after outer ones)
-      color_offset = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_PADDED;
-      color_offset_nonpadded = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3;
+      offset = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_PADDED;
+      offset_nonpadded = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3;
     }
 
     // loops over colors
@@ -698,23 +752,17 @@
       nb_blocks_to_compute = mp->h_num_elem_colors_acoustic[icolor];
 
       Kernel_2_acoustic(nb_blocks_to_compute,mp,*iphase,
-                         mp->d_ibool + color_offset_nonpadded,
-                         mp->d_xix + color_offset,
-                         mp->d_xiy + color_offset,
-                         mp->d_xiz + color_offset,
-                         mp->d_etax + color_offset,
-                         mp->d_etay + color_offset,
-                         mp->d_etaz + color_offset,
-                         mp->d_gammax + color_offset,
-                         mp->d_gammay + color_offset,
-                         mp->d_gammaz + color_offset,
-                         mp->d_rhostore + color_offset,
-                         mp->d_kappastore + color_offset_nonpadded);
+                         mp->d_ibool + offset_nonpadded,
+                         mp->d_xix + offset,mp->d_xiy + offset,mp->d_xiz + offset,
+                         mp->d_etax + offset,mp->d_etay + offset,mp->d_etaz + offset,
+                         mp->d_gammax + offset,mp->d_gammay + offset,mp->d_gammaz + offset,
+                         mp->d_rhostore + offset,
+                         mp->d_kappastore + offset_nonpadded);
 
       // for padded and aligned arrays
-      color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+      offset += nb_blocks_to_compute * NGLL3_PADDED;
       // for no-aligned arrays
-      color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+      offset_nonpadded += nb_blocks_to_compute * NGLL3;
     }
 
   }else{
@@ -722,15 +770,9 @@
     // no mesh coloring: uses atomic updates
     Kernel_2_acoustic(num_elements, mp, *iphase,
                       mp->d_ibool,
-                      mp->d_xix,
-                      mp->d_xiy,
-                      mp->d_xiz,
-                      mp->d_etax,
-                      mp->d_etay,
-                      mp->d_etaz,
-                      mp->d_gammax,
-                      mp->d_gammay,
-                      mp->d_gammaz,
+                      mp->d_xix,mp->d_xiy,mp->d_xiz,
+                      mp->d_etax,mp->d_etay,mp->d_etaz,
+                      mp->d_gammax,mp->d_gammay,mp->d_gammaz,
                       mp->d_rhostore,
                       mp->d_kappastore);
 
@@ -787,12 +829,12 @@
 
 extern "C"
 void FC_FUNC_(acoustic_enforce_free_surf_cuda,
-              ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
+              ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer,
                                                int* ABSORB_INSTEAD_OF_FREE_SURFACE) {
 
 TRACE("acoustic_enforce_free_surf_cuda");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   // checks if anything to do
   if( *ABSORB_INSTEAD_OF_FREE_SURFACE == 0 ){
@@ -800,35 +842,31 @@
    // does not absorb the free surface, thus we enforce the potential to be zero at the surface
 
     // block sizes
-    int num_blocks_x = mp->num_free_surface_faces;
-    int num_blocks_y = 1;
-    while(num_blocks_x > 65535) {
-      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-      num_blocks_y = num_blocks_y*2;
-    }
+    int num_blocks_x, num_blocks_y;
+    get_blocks_xy(mp->num_free_surface_faces,&num_blocks_x,&num_blocks_y);
+
     dim3 grid(num_blocks_x,num_blocks_y,1);
     dim3 threads(NGLL2,1,1);
 
     // sets potentials to zero at free surface
-    enforce_free_surface_cuda_kernel<<<grid,threads>>>(mp->d_potential_acoustic,
-                                                       mp->d_potential_dot_acoustic,
-                                                       mp->d_potential_dot_dot_acoustic,
-                                                       mp->num_free_surface_faces,
-                                                       mp->d_free_surface_ispec,
-                                                       mp->d_free_surface_ijk,
-                                                       mp->d_ibool,
-                                                       mp->d_ispec_is_acoustic);
+    enforce_free_surface_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_acoustic,
+                                                                             mp->d_potential_dot_acoustic,
+                                                                             mp->d_potential_dot_dot_acoustic,
+                                                                             mp->num_free_surface_faces,
+                                                                             mp->d_free_surface_ispec,
+                                                                             mp->d_free_surface_ijk,
+                                                                             mp->d_ibool,
+                                                                             mp->d_ispec_is_acoustic);
     // for backward/reconstructed potentials
     if(mp->simulation_type == 3) {
-      enforce_free_surface_cuda_kernel<<<grid,threads>>>(mp->d_b_potential_acoustic,
-                                                         mp->d_b_potential_dot_acoustic,
-                                                         mp->d_b_potential_dot_dot_acoustic,
-                                                         mp->num_free_surface_faces,
-                                                         mp->d_free_surface_ispec,
-                                                         mp->d_free_surface_ijk,
-                                                         mp->d_ibool,
-                                                         mp->d_ispec_is_acoustic);
-
+      enforce_free_surface_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_acoustic,
+                                                                               mp->d_b_potential_dot_acoustic,
+                                                                               mp->d_b_potential_dot_dot_acoustic,
+                                                                               mp->num_free_surface_faces,
+                                                                               mp->d_free_surface_ispec,
+                                                                               mp->d_free_surface_ijk,
+                                                                               mp->d_ibool,
+                                                                               mp->d_ispec_is_acoustic);
     }
   }
 

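The texture handling introduced above, and repeated for the elastic wavefields in the file below, relies on template specialization: texture references are file-scope statics that cannot be passed as kernel arguments, so the FORWARD_OR_ADJOINT template parameter selects the texture at compile time instead of branching at runtime. A minimal, self-contained illustration of the technique, using the legacy texture-reference API of CUDA toolkits of that era (all names here are illustrative, not the project's own):

    // two file-scope texture references, e.g. forward and backward wavefields
    texture<float, cudaTextureType1D, cudaReadModeElementType> fwd_tex;
    texture<float, cudaTextureType1D, cudaReadModeElementType> adj_tex;

    // primary template plus one specialization per texture; the integer
    // parameter plays the role of FORWARD_OR_ADJOINT (1 = forward, 3 = backward)
    template<int FORWARD_OR_ADJOINT> __device__ float texfetch(int i);
    template<> __device__ float texfetch<1>(int i) { return tex1Dfetch(fwd_tex, i); }
    template<> __device__ float texfetch<3>(int i) { return tex1Dfetch(adj_tex, i); }

    // the kernel is templated on the same flag, so the texture choice is
    // resolved at compile time
    template<int FORWARD_OR_ADJOINT>
    __global__ void copy_from_texture(float* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = texfetch<FORWARD_OR_ADJOINT>(i);
    }

    // host side, after binding each texture once with cudaBindTexture():
    //   copy_from_texture<1><<<grid, threads>>>(d_out, n);  // reads fwd_tex
    //   copy_from_texture<3><<<grid, threads>>>(d_out, n);  // reads adj_tex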
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_viscoelastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_viscoelastic_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_viscoelastic_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -38,13 +38,38 @@
 
 
 #ifdef USE_TEXTURES_FIELDS
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_tex;
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_veloc_tex;
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_tex;
+realw_texture d_displ_tex;
+realw_texture d_veloc_tex;
+realw_texture d_accel_tex;
+//backward/reconstructed
+realw_texture d_b_displ_tex;
+realw_texture d_b_veloc_tex;
+realw_texture d_b_accel_tex;
+
+//note: texture variables are implicitly static and cannot be passed as arguments to cuda kernels;
+//      thus, we either 1) use if-statements (on FORWARD_OR_ADJOINT) to determine which texture to fetch from, or
+//                      2) use templates
+//      since if-statements are a bit slower (FORWARD_OR_ADJOINT is only known at runtime), we use option 2)
+
+// templates definitions
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_displ(int x);
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_veloc(int x);
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_accel(int x);
+
+// templates for texture fetching
+// FORWARD_OR_ADJOINT == 1 <- forward arrays
+template<> __device__ float texfetch_displ<1>(int x) { return tex1Dfetch(d_displ_tex, x); }
+template<> __device__ float texfetch_veloc<1>(int x) { return tex1Dfetch(d_veloc_tex, x); }
+template<> __device__ float texfetch_accel<1>(int x) { return tex1Dfetch(d_accel_tex, x); }
+// FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays
+template<> __device__ float texfetch_displ<3>(int x) { return tex1Dfetch(d_b_displ_tex, x); }
+template<> __device__ float texfetch_veloc<3>(int x) { return tex1Dfetch(d_b_veloc_tex, x); }
+template<> __device__ float texfetch_accel<3>(int x) { return tex1Dfetch(d_b_accel_tex, x); }
+
 #endif
 
 #ifdef USE_TEXTURES_CONSTANTS
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_tex;
+realw_texture d_hprime_xx_tex;
 #endif
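
A self-contained illustration of the template-dispatch technique described in the note above; all names here (fwd_tex, bwd_tex, texfetch, read_kernel) are placeholders rather than the actual SPECFEM3D symbols. The texture reference is selected at compile time through the template argument, while the host picks which instantiation to launch from the runtime FORWARD_OR_ADJOINT flag:

#include <cuda_runtime.h>
#include <stdio.h>

// texture references live at file scope (implicitly static) and cannot be kernel arguments
texture<float, cudaTextureType1D, cudaReadModeElementType> fwd_tex;
texture<float, cudaTextureType1D, cudaReadModeElementType> bwd_tex;

// generic declaration plus one specialization per wavefield set
template<int FORWARD_OR_ADJOINT> __device__ float texfetch(int i);
template<> __device__ float texfetch<1>(int i) { return tex1Dfetch(fwd_tex, i); }
template<> __device__ float texfetch<3>(int i) { return tex1Dfetch(bwd_tex, i); }

template<int FORWARD_OR_ADJOINT>
__global__ void read_kernel(float* out, int n) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if( i < n ) out[i] = texfetch<FORWARD_OR_ADJOINT>(i);  // resolved at compile time, no runtime branch
}

int main() {
  const int N = 8;
  float h_fwd[N], h_bwd[N], h_out[N];
  for(int i = 0; i < N; i++) { h_fwd[i] = (float) i; h_bwd[i] = (float) -i; }

  float *d_fwd, *d_bwd, *d_out;
  cudaMalloc((void**)&d_fwd, N*sizeof(float));
  cudaMalloc((void**)&d_bwd, N*sizeof(float));
  cudaMalloc((void**)&d_out, N*sizeof(float));
  cudaMemcpy(d_fwd, h_fwd, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_bwd, h_bwd, N*sizeof(float), cudaMemcpyHostToDevice);

  // binds linear device memory to the texture references
  cudaBindTexture(NULL, fwd_tex, d_fwd, N*sizeof(float));
  cudaBindTexture(NULL, bwd_tex, d_bwd, N*sizeof(float));

  // host side: the runtime flag only selects which compiled instantiation to launch
  int FORWARD_OR_ADJOINT = 3;
  if( FORWARD_OR_ADJOINT == 1 ) read_kernel<1><<<1,N>>>(d_out, N);
  else                          read_kernel<3><<<1,N>>>(d_out, N);

  cudaMemcpy(h_out, d_out, N*sizeof(float), cudaMemcpyDeviceToHost);
  for(int i = 0; i < N; i++) printf("%f\n", h_out[i]);
  return 0;
}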
 
 
@@ -60,16 +85,19 @@
                                                  int* d_ibool_interfaces_ext_mesh) {
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-  //int iinterface=0;
+  int ientry,iglob;
 
   for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
-    if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
-      d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)] =
-        d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)];
-      d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1] =
-        d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1];
-      d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2] =
-        d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2];
+    if( id < d_nibool_interfaces_ext_mesh[iinterface] ) {
+
+      // entry in interface array
+      ientry = id + max_nibool_interfaces_ext_mesh*iinterface;
+      // global index in wavefield
+      iglob = d_ibool_interfaces_ext_mesh[ientry] - 1;
+
+      d_send_accel_buffer[3*ientry] = d_accel[3*iglob];
+      d_send_accel_buffer[3*ientry + 1 ] = d_accel[3*iglob + 1];
+      d_send_accel_buffer[3*ientry + 2 ] = d_accel[3*iglob + 2];
     }
   }
 
@@ -81,29 +109,23 @@
 // (elements on boundary)
 extern "C"
 void FC_FUNC_(transfer_boun_accel_from_device,
-              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
-                                                    realw* send_accel_buffer,
-                                                    int* num_interfaces_ext_mesh,
-                                                    int* max_nibool_interfaces_ext_mesh,
-                                                    int* nibool_interfaces_ext_mesh,
-                                                    int* ibool_interfaces_ext_mesh,
-                                                    int* FORWARD_OR_ADJOINT){
-TRACE("transfer_boun_accel_from_device");
+              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer,
+                                               realw* accel,
+                                               realw* send_accel_buffer,
+                                               int* FORWARD_OR_ADJOINT){
+TRACE("\ttransfer_boun_accel_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  if( *num_interfaces_ext_mesh == 0 ) return;
+  // checks if anything to do
+  if( mp->size_mpi_buffer > 0 ){
 
-  if( mp->size_mpi_buffer > 0 ){
     int blocksize = BLOCKSIZE_TRANSFER;
     int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
-    int num_blocks_x = size_padded/blocksize;
-    int num_blocks_y = 1;
-    while(num_blocks_x > 65535) {
-      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-      num_blocks_y = num_blocks_y*2;
-    }
 
+    int num_blocks_x, num_blocks_y;
+    get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
     dim3 grid(num_blocks_x,num_blocks_y);
     dim3 threads(blocksize,1,1);
 
@@ -113,25 +135,41 @@
     // cudaEventCreate(&start);
     // cudaEventCreate(&stop);
     // cudaEventRecord( start, 0 );
+
     if(*FORWARD_OR_ADJOINT == 1) {
       prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
                                                                               mp->num_interfaces_ext_mesh,
                                                                               mp->max_nibool_interfaces_ext_mesh,
                                                                               mp->d_nibool_interfaces_ext_mesh,
                                                                               mp->d_ibool_interfaces_ext_mesh);
+      // synchronizes
+      //synchronize_cuda();
+      // explicitly waits until previous compute stream finishes
+      // (cudaMemcpy implicitly synchronizes all other cuda operations)
+      cudaStreamSynchronize(mp->compute_stream);
+
+      // copies buffer from GPU to CPU host
+      print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer,
+                              mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost),97001);
+
     }
     else if(*FORWARD_OR_ADJOINT == 3) {
-      prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_send_accel_buffer,
+      prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_b_send_accel_buffer,
                                                                               mp->num_interfaces_ext_mesh,
                                                                               mp->max_nibool_interfaces_ext_mesh,
                                                                               mp->d_nibool_interfaces_ext_mesh,
                                                                               mp->d_ibool_interfaces_ext_mesh);
+      // synchronizes
+      //synchronize_cuda();
+      // explicitly waits until previous compute stream finishes
+      // (cudaMemcpy implicitly synchronizes all other cuda operations)
+      cudaStreamSynchronize(mp->compute_stream);
+
+      // copies buffer from GPU to CPU host
+      print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer,
+                              mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost),97002);
     }
 
-    // copies buffer from GPU to CPU host
-    cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer,
-               mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost);
-
     // finish timing of kernel+memcpy
     // cudaEventRecord( stop, 0 );
     // cudaEventSynchronize( stop );
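
The manual 65535-block workaround removed above is now factored into the helper get_blocks_xy(); its definition is not shown in this hunk, but a sketch consistent with the inline loop it replaces looks like this (illustrative only, not the actual definition):

#include <math.h>
#include <stdio.h>

/* splits a 1D block count into an (x,y) grid so that the x dimension stays
   below the 65535 per-dimension grid limit of devices with compute capability < 3.0
   (sketch mirroring the removed inline loop; the real helper is defined elsewhere) */
static void get_blocks_xy(int num_blocks, int* num_blocks_x, int* num_blocks_y) {
  int x = num_blocks;
  int y = 1;
  while( x > 65535 ) {
    x = (int) ceil(x*0.5f);
    y = y*2;
  }
  *num_blocks_x = x;
  *num_blocks_y = y;
}

int main() {
  int nx, ny;
  get_blocks_xy(200000, &nx, &ny);
  printf("grid = (%d,%d)\n", nx, ny);  /* -> (50000,4) */
  return 0;
}

Inside the kernels, the linear block index is then recovered as bx = blockIdx.y*gridDim.x + blockIdx.x, as in Kernel_2_noatt_impl() further below.
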
@@ -155,7 +193,7 @@
 
 // asynchronous transfer from device to host
 
-  TRACE("transfer_boundary_from_device_a");
+  TRACE("\ttransfer_boundary_from_device_a");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
@@ -163,63 +201,88 @@
 
     int blocksize = BLOCKSIZE_TRANSFER;
     int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
-    int num_blocks_x = size_padded/blocksize;
-    int num_blocks_y = 1;
-    while(num_blocks_x > 65535) {
-      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-      num_blocks_y = num_blocks_y*2;
-    }
+
+    int num_blocks_x, num_blocks_y;
+    get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
     dim3 grid(num_blocks_x,num_blocks_y);
     dim3 threads(blocksize,1,1);
 
     prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
-                                                                          mp->num_interfaces_ext_mesh,
-                                                                          mp->max_nibool_interfaces_ext_mesh,
-                                                                          mp->d_nibool_interfaces_ext_mesh,
-                                                                          mp->d_ibool_interfaces_ext_mesh);
-  // wait until kernel is finished before starting async memcpy
-#if CUDA_VERSION >= 4000
-    cudaDeviceSynchronize();
-#else
-    cudaThreadSynchronize();
-#endif
+                                                                            mp->num_interfaces_ext_mesh,
+                                                                            mp->max_nibool_interfaces_ext_mesh,
+                                                                            mp->d_nibool_interfaces_ext_mesh,
+                                                                            mp->d_ibool_interfaces_ext_mesh);
+    // waits until kernel is finished before starting async memcpy
+    //synchronize_cuda();
+    // waits until previous compute stream finishes
+    cudaStreamSynchronize(mp->compute_stream);
 
     cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
-                  mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost,mp->copy_stream);
+                    mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost,mp->copy_stream);
   }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
+extern "C"
+void FC_FUNC_(transfer_boundary_to_device_a,
+              TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
+                                             realw* buffer_recv_vector_ext_mesh,
+                                             int* num_interfaces_ext_mesh,
+                                             int* max_nibool_interfaces_ext_mesh) {
+
+// asynchronous transfer from host to device
+
+  TRACE("transfer_boundary_to_device_a");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  if( mp->size_mpi_buffer > 0 ){
+    // copies received MPI buffer into host memory buffer
+    memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_buffer*sizeof(realw));
+
+    // asynchronous copy to GPU using copy_stream
+    cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+                    mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice,mp->copy_stream);
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Assembly
+
+/* ----------------------------------------------------------------------------------------------- */
+
 __global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
                                                   int num_interfaces_ext_mesh,
                                                   int max_nibool_interfaces_ext_mesh,
                                                   int* d_nibool_interfaces_ext_mesh,
                                                   int* d_ibool_interfaces_ext_mesh) {
 
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
   //int bx = blockIdx.y*gridDim.x+blockIdx.x;
   //int tx = threadIdx.x;
-  //int iinterface=0;
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
+  int ientry,iglob;
+
   for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
-    if(id < d_nibool_interfaces_ext_mesh[iinterface]) {
+    if( id < d_nibool_interfaces_ext_mesh[iinterface] ) {
 
+      // entry in interface array
+      ientry = id + max_nibool_interfaces_ext_mesh*iinterface;
+      // global index in wavefield
+      iglob = d_ibool_interfaces_ext_mesh[ientry] - 1;
+
       // for testing atomic operations against non-atomic operations (0.1ms vs. 0.04 ms)
-      // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)] +=
-      // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)];
-      // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1] +=
-      // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1];
-      // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2] +=
-      // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2];
+      // d_accel[3*(iglob)] += d_send_accel_buffer[3*(ientry)];
+      // d_accel[3*(iglob)+1] += d_send_accel_buffer[3*(ientry)+1];
+      // d_accel[3*(iglob)+2] += d_send_accel_buffer[3*(ientry)+2];
 
-
-      atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)],
-                d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)]);
-      atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1],
-                d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1]);
-      atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2],
-                d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2]);
+      atomicAdd(&d_accel[3*iglob],d_send_accel_buffer[3*ientry]);
+      atomicAdd(&d_accel[3*iglob + 1],d_send_accel_buffer[3*ientry + 1]);
+      atomicAdd(&d_accel[3*iglob + 2],d_send_accel_buffer[3*ientry + 2]);
     }
   }
   // ! This step is done via previous function transfer_and_assemble...
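
The two transfer routines above rely on separate compute and copy streams: the packing kernel runs on mp->compute_stream, only that stream is synchronized, and the host transfer is then issued asynchronously on mp->copy_stream so it can overlap with further compute work. A minimal standalone sketch of this pattern (kernel and buffer names are placeholders):

#include <cuda_runtime.h>
#include <stdio.h>

__global__ void pack_kernel(float* buf, int n) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if( i < n ) buf[i] = (float) i;   // stands in for prepare_boundary_accel_on_device
}

__global__ void interior_kernel(float* field, int n) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if( i < n ) field[i] += 1.0f;     // stands in for the interior-element compute kernel
}

int main() {
  const int N = 1 << 16;
  float *d_buf, *d_field, *h_buf;
  cudaMalloc((void**)&d_buf, N*sizeof(float));
  cudaMalloc((void**)&d_field, N*sizeof(float));
  cudaMemset(d_field, 0, N*sizeof(float));
  // pinned host memory so the device-to-host copy can be truly asynchronous
  cudaMallocHost((void**)&h_buf, N*sizeof(float));

  cudaStream_t compute_stream, copy_stream;
  cudaStreamCreate(&compute_stream);
  cudaStreamCreate(&copy_stream);

  dim3 grid((N + 127)/128, 1);
  dim3 threads(128, 1, 1);

  // 1) pack the boundary buffer on the compute stream
  pack_kernel<<<grid, threads, 0, compute_stream>>>(d_buf, N);
  // 2) wait only for the compute stream, not for the whole device
  cudaStreamSynchronize(compute_stream);
  // 3) start the device-to-host copy on the copy stream ...
  cudaMemcpyAsync(h_buf, d_buf, N*sizeof(float), cudaMemcpyDeviceToHost, copy_stream);
  // 4) ... while more work keeps the GPU busy on the compute stream
  interior_kernel<<<grid, threads, 0, compute_stream>>>(d_field, N);

  // before handing the host buffer to MPI, wait for the copy stream to finish
  cudaStreamSynchronize(copy_stream);
  printf("h_buf[10] = %f\n", h_buf[10]);

  cudaStreamDestroy(compute_stream);
  cudaStreamDestroy(copy_stream);
  cudaFreeHost(h_buf);
  cudaFree(d_buf);
  cudaFree(d_field);
  return 0;
}
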
@@ -231,31 +294,85 @@
   // ! enddo
 }
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
+// FORWARD_OR_ADJOINT == 1 for accel, and == 3 for b_accel
 extern "C"
-void FC_FUNC_(transfer_boundary_to_device_a,
-              TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
-                                             realw* buffer_recv_vector_ext_mesh,
-                                             int* num_interfaces_ext_mesh,
-                                             int* max_nibool_interfaces_ext_mesh) {
+void FC_FUNC_(transfer_asmbl_accel_to_device,
+              TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel,
+                                                    realw* buffer_recv_vector_ext_mesh,
+                                                    int* num_interfaces_ext_mesh,
+                                                    int* max_nibool_interfaces_ext_mesh,
+                                                    int* nibool_interfaces_ext_mesh,
+                                                    int* ibool_interfaces_ext_mesh,
+                                                    int* FORWARD_OR_ADJOINT) {
+TRACE("\ttransfer_asmbl_accel_to_device");
 
-// asynchronous transfer from host to device
-
-  TRACE("transfer_boundary_to_device_a");
-
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   if( mp->size_mpi_buffer > 0 ){
-    // copy on host memory
-    memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_buffer*sizeof(realw));
 
-    // cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
-    //        mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice,mp->compute_stream);
-    //printf("xfer to device\n");
-    cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
-                    mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice,mp->copy_stream);
+    //daniel: todo - check if this copy is only needed for adjoint simulation, otherwise it is called asynchronously?
+    if(*FORWARD_OR_ADJOINT == 1 ){
+      // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+      cudaStreamSynchronize(mp->copy_stream);
+    }
+    else if(*FORWARD_OR_ADJOINT == 3 ){
+      // explicitly synchronizes
+      // (cudaMemcpy implicitly synchronizes all other cuda operations)
+      synchronize_cuda();
+
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer, buffer_recv_vector_ext_mesh,
+                              mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice),97001);
+    }
+
+    int blocksize = BLOCKSIZE_TRANSFER;
+    int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+
+    int num_blocks_x, num_blocks_y;
+    get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+    dim3 grid(num_blocks_x,num_blocks_y);
+    dim3 threads(blocksize,1,1);
+
+    //double start_time = get_time();
+    // cudaEvent_t start, stop;
+    // realw time;
+    // cudaEventCreate(&start);
+    // cudaEventCreate(&stop);
+    // cudaEventRecord( start, 0 );
+
+    if(*FORWARD_OR_ADJOINT == 1) {
+      //assemble forward accel
+      assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
+                                                                               mp->num_interfaces_ext_mesh,
+                                                                               mp->max_nibool_interfaces_ext_mesh,
+                                                                               mp->d_nibool_interfaces_ext_mesh,
+                                                                               mp->d_ibool_interfaces_ext_mesh);
+    }
+    else if(*FORWARD_OR_ADJOINT == 3) {
+      //assemble adjoint accel
+      assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, mp->d_b_send_accel_buffer,
+                                                                               mp->num_interfaces_ext_mesh,
+                                                                               mp->max_nibool_interfaces_ext_mesh,
+                                                                               mp->d_nibool_interfaces_ext_mesh,
+                                                                               mp->d_ibool_interfaces_ext_mesh);
+    }
+
+    // cudaEventRecord( stop, 0 );
+    // cudaEventSynchronize( stop );
+    // cudaEventElapsedTime( &time, start, stop );
+    // cudaEventDestroy( start );
+    // cudaEventDestroy( stop );
+    // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
   }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("transfer_asmbl_accel_to_device");
+#endif
 }
 
 /* ----------------------------------------------------------------------------------------------- */
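
Assembly uses atomicAdd() because several interface entries (and, in the compute kernels further below, several elements) can contribute to the same global degree of freedom, so a plain += would race. A tiny standalone illustration of the scatter-with-atomicAdd idea (names and data are placeholders; requires -arch=sm_20 or newer for float atomicAdd):

#include <cuda_runtime.h>
#include <stdio.h>

// several buffer entries may map to the same global index,
// so their contributions must be accumulated atomically
__global__ void assemble(float* accel, const float* buf, const int* iglob, int n) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if( i < n ) atomicAdd(&accel[iglob[i]], buf[i]);
}

int main() {
  const int N = 6;
  // two contributions each for global points 0, 1 and 2
  int h_iglob[N]   = {0, 0, 1, 1, 2, 2};
  float h_buf[N]   = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  float h_accel[3] = {0.f, 0.f, 0.f};

  int* d_iglob; float *d_buf, *d_accel;
  cudaMalloc((void**)&d_iglob, N*sizeof(int));
  cudaMalloc((void**)&d_buf, N*sizeof(float));
  cudaMalloc((void**)&d_accel, 3*sizeof(float));
  cudaMemcpy(d_iglob, h_iglob, N*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_buf, h_buf, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_accel, h_accel, 3*sizeof(float), cudaMemcpyHostToDevice);

  assemble<<<1,N>>>(d_accel, d_buf, d_iglob, N);

  cudaMemcpy(h_accel, d_accel, 3*sizeof(float), cudaMemcpyDeviceToHost);
  printf("%f %f %f\n", h_accel[0], h_accel[1], h_accel[2]);  // expect 3 7 11
  return 0;
}
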
@@ -277,13 +394,10 @@
 //
 //  int blocksize = BLOCKSIZE_TRANSFER;
 //  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
-//  int num_blocks_x = size_padded/blocksize;
-//  int num_blocks_y = 1;
-//  while(num_blocks_x > 65535) {
-//    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-//    num_blocks_y = num_blocks_y*2;
-//  }
 //
+//  int num_blocks_x, num_blocks_y;
+//  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+//
 //  //double start_time = get_time();
 //  dim3 grid(num_blocks_x,num_blocks_y);
 //  dim3 threads(blocksize,1,1);
@@ -323,86 +437,12 @@
 //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
 //  //double end_time = get_time();
 //  //printf("Elapsed time: %e\n",end_time-start_time);
-//  exit_on_cuda_error("transfer_asmbl_accel_to_device");
+//  exit_on_cuda_error("assemble_accel_on_device");
 //#endif
 //}
 
-/* ----------------------------------------------------------------------------------------------- */
 
-// FORWARD_OR_ADJOINT == 1 for accel, and == 3 for b_accel
-extern "C"
-void FC_FUNC_(transfer_asmbl_accel_to_device,
-              TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel,
-                                                    realw* buffer_recv_vector_ext_mesh,
-                                                    int* num_interfaces_ext_mesh,
-                                                    int* max_nibool_interfaces_ext_mesh,
-                                                    int* nibool_interfaces_ext_mesh,
-                                                    int* ibool_interfaces_ext_mesh,
-                                                    int* FORWARD_OR_ADJOINT) {
-TRACE("transfer_asmbl_accel_to_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-
-  if( mp->size_mpi_buffer > 0 ){
-
-    //daniel: todo - check if this copy is only needed for adjoint simulation, otherwise it is called asynchronously?
-    if(*FORWARD_OR_ADJOINT == 1 ){
-      // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
-      cudaStreamSynchronize(mp->copy_stream);
-    }
-    else if(*FORWARD_OR_ADJOINT == 3 ){
-      cudaMemcpy(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
-                 mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice);
-    }
-
-    int blocksize = BLOCKSIZE_TRANSFER;
-    int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
-    int num_blocks_x = size_padded/blocksize;
-    int num_blocks_y = 1;
-    while(num_blocks_x > 65535) {
-      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-      num_blocks_y = num_blocks_y*2;
-    }
-
-    //double start_time = get_time();
-    dim3 grid(num_blocks_x,num_blocks_y);
-    dim3 threads(blocksize,1,1);
-    // cudaEvent_t start, stop;
-    // realw time;
-    // cudaEventCreate(&start);
-    // cudaEventCreate(&stop);
-    // cudaEventRecord( start, 0 );
-    if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
-      assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
-                                                                               mp->num_interfaces_ext_mesh,
-                                                                               mp->max_nibool_interfaces_ext_mesh,
-                                                                               mp->d_nibool_interfaces_ext_mesh,
-                                                                               mp->d_ibool_interfaces_ext_mesh);
-    }
-    else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
-      assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
-                                                                               mp->num_interfaces_ext_mesh,
-                                                                               mp->max_nibool_interfaces_ext_mesh,
-                                                                               mp->d_nibool_interfaces_ext_mesh,
-                                                                               mp->d_ibool_interfaces_ext_mesh);
-    }
-
-    // cudaEventRecord( stop, 0 );
-    // cudaEventSynchronize( stop );
-    // cudaEventElapsedTime( &time, start, stop );
-    // cudaEventDestroy( start );
-    // cudaEventDestroy( stop );
-    // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //double end_time = get_time();
-  //printf("Elapsed time: %e\n",end_time-start_time);
-  exit_on_cuda_error("transfer_asmbl_accel_to_device");
-#endif
-}
-
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // KERNEL 2
@@ -438,17 +478,10 @@
 // updates stress
 
 __device__ void compute_element_att_stress(int tx,int working_element,int NSPEC,
-                                           realw* R_xx,
-                                           realw* R_yy,
-                                           realw* R_xy,
-                                           realw* R_xz,
-                                           realw* R_yz,
-                                           realw* sigma_xx,
-                                           realw* sigma_yy,
-                                           realw* sigma_zz,
-                                           realw* sigma_xy,
-                                           realw* sigma_xz,
-                                           realw* sigma_yz) {
+                                           realw* R_xx,realw* R_yy,realw* R_xy,
+                                           realw* R_xz,realw* R_yz,
+                                           realw* sigma_xx,realw* sigma_yy,realw* sigma_zz,
+                                           realw* sigma_xy,realw* sigma_xz,realw* sigma_yz) {
 
   int i_sls,offset_sls;
   realw R_xx_val,R_yy_val;
@@ -650,13 +683,18 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+/*
+
+// unused
+// original elastic kernel, please leave this code here for reference...
+
 __global__ void Kernel_2_impl(int nb_blocks_to_compute,
                               int NGLOB,
                               int* d_ibool,
                               int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
                               int d_iphase,
                               int use_mesh_coloring_gpu,
-            realw d_deltat,
+                              realw d_deltat,
                               realw* d_displ,realw* d_veloc,realw* d_accel,
                               realw* d_xix, realw* d_xiy, realw* d_xiz,
                               realw* d_etax, realw* d_etay, realw* d_etaz,
@@ -676,36 +714,20 @@
                               realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
                               realw* alphaval,realw* betaval,realw* gammaval,
                               int ANISOTROPY,
-                              realw* d_c11store,
-                              realw* d_c12store,
-                              realw* d_c13store,
-                              realw* d_c14store,
-                              realw* d_c15store,
-                              realw* d_c16store,
-                              realw* d_c22store,
-                              realw* d_c23store,
-                              realw* d_c24store,
-                              realw* d_c25store,
-                              realw* d_c26store,
-                              realw* d_c33store,
-                              realw* d_c34store,
-                              realw* d_c35store,
-                              realw* d_c36store,
-                              realw* d_c44store,
-                              realw* d_c45store,
-                              realw* d_c46store,
-                              realw* d_c55store,
-                              realw* d_c56store,
-                              realw* d_c66store,
+                              realw* d_c11store,realw* d_c12store,realw* d_c13store,
+                              realw* d_c14store,realw* d_c15store,realw* d_c16store,
+                              realw* d_c22store,realw* d_c23store,realw* d_c24store,
+                              realw* d_c25store,realw* d_c26store,realw* d_c33store,
+                              realw* d_c34store,realw* d_c35store,realw* d_c36store,
+                              realw* d_c44store,realw* d_c45store,realw* d_c46store,
+                              realw* d_c55store,realw* d_c56store,realw* d_c66store,
                               int gravity,
                               realw* d_minus_g,
                               realw* d_minus_deriv_gravity,
                               realw* d_rhostore,
                               realw* wgll_cube){
 
-  /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
-  int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  /* int bx = blockIdx.x; */
+  int bx = blockIdx.y*gridDim.x + blockIdx.x;
   int tx = threadIdx.x;
 
   const int NGLL3_ALIGN = NGLL3_PADDED;
@@ -784,7 +806,6 @@
     }
 #endif
 
-    // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
     iglob = d_ibool[working_element*NGLL3 + tx]-1;
 
 #ifdef USE_TEXTURES_FIELDS
@@ -1097,13 +1118,13 @@
       // computes deviatoric strain attenuation and/or for kernel calculations
       if(COMPUTE_AND_STORE_STRAIN) {
   realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
-  /*
-    epsilondev_xx[offset] = duxdxl - templ;
-    epsilondev_yy[offset] = duydyl - templ;
-    epsilondev_xy[offset] = 0.5f * duxdyl_plus_duydxl;
-    epsilondev_xz[offset] = 0.5f * duzdxl_plus_duxdzl;
-    epsilondev_yz[offset] = 0.5f * duzdyl_plus_duydzl;
-  */
+
+  //  epsilondev_xx[offset] = duxdxl - templ;
+  //  epsilondev_yy[offset] = duydyl - templ;
+  //  epsilondev_xy[offset] = 0.5f * duxdyl_plus_duydxl;
+  //  epsilondev_xz[offset] = 0.5f * duzdxl_plus_duxdzl;
+  //  epsilondev_yz[offset] = 0.5f * duzdyl_plus_duydzl;
+
   // local storage: stresses at this current time step
   epsilondev_xx_loc = duxdxl - templ;
   epsilondev_yy_loc = duydyl - templ;
@@ -1415,70 +1436,1352 @@
 
 } // kernel_2_impl()
 
+*/
+
 /* ----------------------------------------------------------------------------------------------- */
 
+// note: kernel_2 is split into two kernels:
+//       - a kernel without attenuation Kernel_2_noatt_impl() and
+//       - a kernel including attenuation Kernel_2_att_impl()
+//       this separation should help with performance
+
+
+// kernel without attenuation
+//
+// we use templates to distinguish between calls with forward or adjoint texture fields
+
+template<int FORWARD_OR_ADJOINT> __global__ void Kernel_2_noatt_impl(int nb_blocks_to_compute,
+                                                                      int NGLOB,
+                                                                      int* d_ibool,
+                                                                      int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
+                                                                      int d_iphase,
+                                                                      int use_mesh_coloring_gpu,
+                                                                      realw* d_displ,realw* d_veloc,realw* d_accel,
+                                                                      realw* d_xix, realw* d_xiy, realw* d_xiz,
+                                                                      realw* d_etax, realw* d_etay, realw* d_etaz,
+                                                                      realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                                                                      realw* d_hprime_xx,
+                                                                      realw* d_hprimewgll_xx,
+                                                                      realw* d_wgllwgll_xy,realw* d_wgllwgll_xz,realw* d_wgllwgll_yz,
+                                                                      realw* d_kappav, realw* d_muv,
+                                                                      int COMPUTE_AND_STORE_STRAIN,
+                                                                      realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                                                      realw* epsilondev_xz,realw* epsilondev_yz,
+                                                                      realw* epsilon_trace_over_3,
+                                                                      int SIMULATION_TYPE,
+                                                                      int NSPEC,
+                                                                      realw* one_minus_sum_beta,realw* factor_common,
+                                                                      realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
+                                                                      realw* alphaval,realw* betaval,realw* gammaval,
+                                                                      int ANISOTROPY,
+                                                                      realw* d_c11store,realw* d_c12store,realw* d_c13store,
+                                                                      realw* d_c14store,realw* d_c15store,realw* d_c16store,
+                                                                      realw* d_c22store,realw* d_c23store,realw* d_c24store,
+                                                                      realw* d_c25store,realw* d_c26store,realw* d_c33store,
+                                                                      realw* d_c34store,realw* d_c35store,realw* d_c36store,
+                                                                      realw* d_c44store,realw* d_c45store,realw* d_c46store,
+                                                                      realw* d_c55store,realw* d_c56store,realw* d_c66store,
+                                                                      int gravity,
+                                                                      realw* d_minus_g,
+                                                                      realw* d_minus_deriv_gravity,
+                                                                      realw* d_rhostore,
+                                                                      realw* wgll_cube ){
+
+// elastic compute kernel without attenuation
+// holds for: ATTENUATION = .false.
+//            COMPUTE_AND_STORE_STRAIN = .true. or .false. (true for kernel simulations)
+
+  int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  int tx = threadIdx.x;
+
+  const int NGLL3_ALIGN = NGLL3_PADDED;
+
+  int K = (tx/NGLL2);
+  int J = ((tx-K*NGLL2)/NGLLX);
+  int I = (tx-K*NGLL2-J*NGLLX);
+
+  int active,offset;
+  int iglob = 0;
+  int working_element;
+
+  realw tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
+  realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+  realw duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
+  realw duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
+  realw duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+
+  realw fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal;
+  realw sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+  realw epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+
+  realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+  realw sum_terms1,sum_terms2,sum_terms3;
+
+  // gravity variables
+  realw sigma_yx,sigma_zx,sigma_zy;
+  realw rho_s_H1,rho_s_H2,rho_s_H3;
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+  int l;
+  realw hp1,hp2,hp3;
+#endif
+
+  __shared__ realw s_dummyx_loc[NGLL3];
+  __shared__ realw s_dummyy_loc[NGLL3];
+  __shared__ realw s_dummyz_loc[NGLL3];
+
+  __shared__ realw s_tempx1[NGLL3];
+  __shared__ realw s_tempx2[NGLL3];
+  __shared__ realw s_tempx3[NGLL3];
+
+  __shared__ realw s_tempy1[NGLL3];
+  __shared__ realw s_tempy2[NGLL3];
+  __shared__ realw s_tempy3[NGLL3];
+
+  __shared__ realw s_tempz1[NGLL3];
+  __shared__ realw s_tempz2[NGLL3];
+  __shared__ realw s_tempz3[NGLL3];
+
+  __shared__ realw sh_hprime_xx[NGLL2];
+
+// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
+// because we used memory padding from NGLL^3 = 125 to 128 to get coalesced memory accesses
+  active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+
+// copy from global memory to shared memory
+// each thread writes one of the NGLL^3 = 125 data points
+  if (active) {
+
+#ifdef USE_MESH_COLORING_GPU
+    working_element = bx;
+#else
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
+      working_element = bx;
+    }else{
+      // iphase-1 and working_element-1 for Fortran->C array conventions
+      working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
+    }
+#endif
+
+    iglob = d_ibool[working_element*NGLL3 + tx]-1;
+    // debug
+    //if( iglob < 0 || iglob >= NGLOB ){ printf("wrong iglob %d\n",iglob);  }
+
+#ifdef USE_TEXTURES_FIELDS
+    s_dummyx_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3);
+    s_dummyy_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3 + 1);
+    s_dummyz_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3 + 2);
+#else
+    // changing iglob indexing to match fortran row changes fast style
+    s_dummyx_loc[tx] = d_displ[iglob*3];
+    s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+    s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+#endif
+  }
+
+  // JC JC here we will need to add GPU support for the new C-PML routines
+
+  if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+    sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx);
+#else
+    sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+  }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+  __syncthreads();
+
+  if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+    tempx1l = 0.f;
+    tempx2l = 0.f;
+    tempx3l = 0.f;
+
+    tempy1l = 0.f;
+    tempy2l = 0.f;
+    tempy3l = 0.f;
+
+    tempz1l = 0.f;
+    tempz2l = 0.f;
+    tempz3l = 0.f;
+
+    for (l=0;l<NGLLX;l++) {
+      hp1 = sh_hprime_xx[l*NGLLX+I];
+      offset = K*NGLL2+J*NGLLX+l;
+      tempx1l += s_dummyx_loc[offset]*hp1;
+      tempy1l += s_dummyy_loc[offset]*hp1;
+      tempz1l += s_dummyz_loc[offset]*hp1;
+
+      //assumes that hprime_xx = hprime_yy = hprime_zz
+      hp2 = sh_hprime_xx[l*NGLLX+J];
+      offset = K*NGLL2+l*NGLLX+I;
+      tempx2l += s_dummyx_loc[offset]*hp2;
+      tempy2l += s_dummyy_loc[offset]*hp2;
+      tempz2l += s_dummyz_loc[offset]*hp2;
+
+      hp3 = sh_hprime_xx[l*NGLLX+K];
+      offset = l*NGLL2+J*NGLLX+I;
+      tempx3l += s_dummyx_loc[offset]*hp3;
+      tempy3l += s_dummyy_loc[offset]*hp3;
+      tempz3l += s_dummyz_loc[offset]*hp3;
+    }
+
+    // JC JC here we will need to add GPU support for the new C-PML routines
+
+#else
+
+    tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+    tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+    tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+    tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+    tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+    tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+    tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+    tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+    tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+    // JC JC here we will need to add GPU support for the new C-PML routines
+
+#endif
+
+// compute derivatives of ux, uy and uz with respect to x, y and z
+    offset = working_element*NGLL3_ALIGN + tx;
+
+    xixl = d_xix[offset];
+    xiyl = d_xiy[offset];
+    xizl = d_xiz[offset];
+    etaxl = d_etax[offset];
+    etayl = d_etay[offset];
+    etazl = d_etaz[offset];
+    gammaxl = d_gammax[offset];
+    gammayl = d_gammay[offset];
+    gammazl = d_gammaz[offset];
+
+    duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+    duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+    duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+
+    duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+    duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+    duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+
+    duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+    duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+    duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+
+    // JC JC here we will need to add GPU support for the new C-PML routines
+
+    // precompute some sums to save CPU time
+    duxdxl_plus_duydyl = duxdxl + duydyl;
+    duxdxl_plus_duzdzl = duxdxl + duzdzl;
+    duydyl_plus_duzdzl = duydyl + duzdzl;
+    duxdyl_plus_duydxl = duxdyl + duydxl;
+    duzdxl_plus_duxdzl = duzdxl + duxdzl;
+    duzdyl_plus_duydzl = duzdyl + duydzl;
+
+    // JC JC here we will need to add GPU support for the new C-PML routines
+
+    // computes deviatoric strain for kernel calculations
+    if(COMPUTE_AND_STORE_STRAIN) {
+      realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+      // local storage: stresses at this current time step
+      epsilondev_xx_loc = duxdxl - templ;
+      epsilondev_yy_loc = duydyl - templ;
+      epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+      epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+      epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+
+      if(SIMULATION_TYPE == 3) {
+        epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+      }
+    }
+
+    // compute elements with an elastic isotropic rheology
+    kappal = d_kappav[offset];
+    mul = d_muv[offset];
+
+    // full anisotropic case, stress calculations
+    if(ANISOTROPY){
+
+      c11 = d_c11store[offset];
+      c12 = d_c12store[offset];
+      c13 = d_c13store[offset];
+      c14 = d_c14store[offset];
+      c15 = d_c15store[offset];
+      c16 = d_c16store[offset];
+      c22 = d_c22store[offset];
+      c23 = d_c23store[offset];
+      c24 = d_c24store[offset];
+      c25 = d_c25store[offset];
+      c26 = d_c26store[offset];
+      c33 = d_c33store[offset];
+      c34 = d_c34store[offset];
+      c35 = d_c35store[offset];
+      c36 = d_c36store[offset];
+      c44 = d_c44store[offset];
+      c45 = d_c45store[offset];
+      c46 = d_c46store[offset];
+      c55 = d_c55store[offset];
+      c56 = d_c56store[offset];
+      c66 = d_c66store[offset];
+
+      sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl +
+                 c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl;
+      sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl +
+                 c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl;
+      sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl +
+                 c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl;
+      sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl +
+                 c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl;
+      sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl +
+                 c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl;
+      sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl +
+                 c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
+
+    }else{
+
+      // isotropic case
+
+      lambdalplus2mul = kappal + 1.33333333333333333333f * mul;  // 4./3. = 1.3333333
+      lambdal = lambdalplus2mul - 2.0f * mul;
+
+      // compute the six components of the stress tensor sigma
+      sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+      sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+      sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+
+      sigma_xy = mul*duxdyl_plus_duydxl;
+      sigma_xz = mul*duzdxl_plus_duxdzl;
+      sigma_yz = mul*duzdyl_plus_duydzl;
+    }
+
+    jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)-xiyl*(etaxl*gammazl-etazl*gammaxl)+xizl*(etaxl*gammayl-etayl*gammaxl));
+
+    // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
+    sigma_yx = sigma_xy;
+    sigma_zx = sigma_xz;
+    sigma_zy = sigma_yz;
+
+    if( gravity ){
+      //  computes non-symmetric terms for gravity
+      compute_element_gravity(tx,working_element,d_ibool,d_minus_g,d_minus_deriv_gravity,
+                              d_rhostore,wgll_cube,jacobianl,
+                              s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
+                              &sigma_xx,&sigma_yy,&sigma_xz,&sigma_yz,
+                              &rho_s_H1,&rho_s_H2,&rho_s_H3);
+    }
+
+    // form dot product with test vector, non-symmetric form
+    s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
+    s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
+    s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+
+    s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
+    s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
+    s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+
+    s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
+    s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
+    s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+
+  }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+  __syncthreads();
+
+  // JC JC here we will need to add GPU support for the new C-PML routines
+
+  if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+    tempx1l = 0.f;
+    tempy1l = 0.f;
+    tempz1l = 0.f;
+
+    tempx2l = 0.f;
+    tempy2l = 0.f;
+    tempz2l = 0.f;
+
+    tempx3l = 0.f;
+    tempy3l = 0.f;
+    tempz3l = 0.f;
+
+    for (l=0;l<NGLLX;l++) {
+      fac1 = d_hprimewgll_xx[I*NGLLX+l];
+      offset = K*NGLL2+J*NGLLX+l;
+      tempx1l += s_tempx1[offset]*fac1;
+      tempy1l += s_tempy1[offset]*fac1;
+      tempz1l += s_tempz1[offset]*fac1;
+
+      // assumes hprimewgll_xx == hprimewgll_yy == hprimewgll_zz
+      fac2 = d_hprimewgll_xx[J*NGLLX+l];
+      offset = K*NGLL2+l*NGLLX+I;
+      tempx2l += s_tempx2[offset]*fac2;
+      tempy2l += s_tempy2[offset]*fac2;
+      tempz2l += s_tempz2[offset]*fac2;
+
+      fac3 = d_hprimewgll_xx[K*NGLLX+l];
+      offset = l*NGLL2+J*NGLLX+I;
+      tempx3l += s_tempx3[offset]*fac3;
+      tempy3l += s_tempy3[offset]*fac3;
+      tempz3l += s_tempz3[offset]*fac3;
+    }
+
+#else
+
+    tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+    tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+    tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+    tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+            + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+            + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+            + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+            + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+    tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+            + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+            + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+            + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+            + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+    tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+            + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+            + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+            + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+            + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+    tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+            + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+            + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+            + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+            + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+    tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+            + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+            + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+            + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+            + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+    tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+            + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+            + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+            + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+            + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+#endif
+
+    fac1 = d_wgllwgll_yz[K*NGLLX+J];
+    fac2 = d_wgllwgll_xz[K*NGLLX+I];
+    fac3 = d_wgllwgll_xy[J*NGLLX+I];
+
+    sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+    sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+    sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+
+    // adds gravity term
+    if( gravity ){
+      sum_terms1 += rho_s_H1;
+      sum_terms2 += rho_s_H2;
+      sum_terms3 += rho_s_H3;
+    }
+
+#ifdef USE_MESH_COLORING_GPU
+    // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+    d_accel[iglob*3]     = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+    d_accel[iglob*3 + 1] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+    d_accel[iglob*3 + 2] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
+#else
+    d_accel[iglob*3]     += sum_terms1;
+    d_accel[iglob*3 + 1] += sum_terms2;
+    d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+
+    // JC JC here we will need to add GPU support for the new C-PML routines
+
+#else // MESH_COLORING
+
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
+
+      // no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+      d_accel[iglob*3]     = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+      d_accel[iglob*3 + 1] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+      d_accel[iglob*3 + 2] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
+#else
+      d_accel[iglob*3]     += sum_terms1;
+      d_accel[iglob*3 + 1] += sum_terms2;
+      d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+
+    }else {
+
+      // for testing purposes only: w/out atomic updates
+      //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
+      //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
+      //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
+      // w/out atomic update
+      //d_accel[iglob*3]     += sum_terms1;
+      //d_accel[iglob*3 + 1] += sum_terms2;
+      //d_accel[iglob*3 + 2] += sum_terms3;
+
+      atomicAdd(&d_accel[iglob*3], sum_terms1);
+      atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+      atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+
+    } // if(use_mesh_coloring_gpu)
+
+#endif // MESH_COLORING
+
+    // save deviatoric strain for Runge-Kutta scheme
+    if( COMPUTE_AND_STORE_STRAIN ){
+      int ijk_ispec = tx + working_element*NGLL3;
+
+      // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+      epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
+      epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
+      epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
+      epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
+      epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
+    }
+
+  } // if(active)
+
+  // JC JC here we will need to add GPU support for the new C-PML routines
+
+} // kernel_2_noatt_impl()
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// kernel with attenuation
+//
+// we use templates to distinguish between calls with forward or adjoint texture fields
+
+template<int FORWARD_OR_ADJOINT> __global__ void Kernel_2_att_impl(int nb_blocks_to_compute,
+                                                                  int NGLOB,
+                                                                  int* d_ibool,
+                                                                  int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
+                                                                  int d_iphase,
+                                                                  int use_mesh_coloring_gpu,
+                                                                  realw d_deltat,
+                                                                  realw* d_displ,realw* d_veloc,realw* d_accel,
+                                                                  realw* d_xix, realw* d_xiy, realw* d_xiz,
+                                                                  realw* d_etax, realw* d_etay, realw* d_etaz,
+                                                                  realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                                                                  realw* d_hprime_xx,
+                                                                  realw* d_hprimewgll_xx,
+                                                                  realw* d_wgllwgll_xy,realw* d_wgllwgll_xz,realw* d_wgllwgll_yz,
+                                                                  realw* d_kappav, realw* d_muv,
+                                                                  realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                                                  realw* epsilondev_xz,realw* epsilondev_yz,
+                                                                  realw* epsilon_trace_over_3,
+                                                                  int SIMULATION_TYPE,
+                                                                  int NSPEC,
+                                                                  realw* one_minus_sum_beta,realw* factor_common,
+                                                                  realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
+                                                                  realw* alphaval,realw* betaval,realw* gammaval,
+                                                                  int ANISOTROPY,
+                                                                  realw* d_c11store,realw* d_c12store,realw* d_c13store,
+                                                                  realw* d_c14store,realw* d_c15store,realw* d_c16store,
+                                                                  realw* d_c22store,realw* d_c23store,realw* d_c24store,
+                                                                  realw* d_c25store,realw* d_c26store,realw* d_c33store,
+                                                                  realw* d_c34store,realw* d_c35store,realw* d_c36store,
+                                                                  realw* d_c44store,realw* d_c45store,realw* d_c46store,
+                                                                  realw* d_c55store,realw* d_c56store,realw* d_c66store,
+                                                                  int gravity,
+                                                                  realw* d_minus_g,
+                                                                  realw* d_minus_deriv_gravity,
+                                                                  realw* d_rhostore,
+                                                                  realw* wgll_cube){
+
+
+// elastic compute kernel with attenuation
+// holds for: ATTENUATION = .true.
+//            COMPUTE_AND_STORE_STRAIN = .true. (always true for attenuation)
+
+  int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  int tx = threadIdx.x;
+
+  const int NGLL3_ALIGN = NGLL3_PADDED;
+
+  int K = (tx/NGLL2);
+  int J = ((tx-K*NGLL2)/NGLLX);
+  int I = (tx-K*NGLL2-J*NGLLX);
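+  // with NGLLX = 5 (NGLL2 = 25, NGLL3 = 125) this inverts tx = K*NGLL2 + J*NGLLX + I,
+  // e.g. tx = 77 gives K = 3, J = 0, I = 2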
+
+  int active,offset;
+  int iglob = 0;
+  int working_element;
+
+  realw tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
+  realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+  realw duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
+  realw duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
+  realw duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+
+  realw tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
+  realw duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
+  realw duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
+
+  realw fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal;
+  realw sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+  realw epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+
+  realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+  realw sum_terms1,sum_terms2,sum_terms3;
+
+  // gravity variables
+  realw sigma_yx,sigma_zx,sigma_zy;
+  realw rho_s_H1,rho_s_H2,rho_s_H3;
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+  int l;
+  realw hp1,hp2,hp3;
+#endif
+
+  __shared__ realw s_dummyx_loc[NGLL3];
+  __shared__ realw s_dummyy_loc[NGLL3];
+  __shared__ realw s_dummyz_loc[NGLL3];
+
+  __shared__ realw s_dummyx_loc_att[NGLL3];
+  __shared__ realw s_dummyy_loc_att[NGLL3];
+  __shared__ realw s_dummyz_loc_att[NGLL3];
+
+  __shared__ realw s_tempx1[NGLL3];
+  __shared__ realw s_tempx2[NGLL3];
+  __shared__ realw s_tempx3[NGLL3];
+
+  __shared__ realw s_tempy1[NGLL3];
+  __shared__ realw s_tempy2[NGLL3];
+  __shared__ realw s_tempy3[NGLL3];
+
+  __shared__ realw s_tempz1[NGLL3];
+  __shared__ realw s_tempz2[NGLL3];
+  __shared__ realw s_tempz3[NGLL3];
+
+  __shared__ realw sh_hprime_xx[NGLL2];
+
+// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
+// because we used memory padding from NGLL^3 = 125 to 128 to get coalesced memory accesses
+  active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+
+// copy from global memory to shared memory
+// each thread writes one of the NGLL^3 = 125 data points
+  if (active) {
+
+#ifdef USE_MESH_COLORING_GPU
+    working_element = bx;
+#else
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
+      working_element = bx;
+    }else{
+      // iphase-1 and working_element-1 for Fortran->C array conventions
+      working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
+    }
+#endif
+
+    iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+#ifdef USE_TEXTURES_FIELDS
+    s_dummyx_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3);
+    s_dummyy_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3 + 1);
+    s_dummyz_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3 + 2);
+#else
+    // iglob indexing follows the Fortran convention (first/row index varies fastest)
+    s_dummyx_loc[tx] = d_displ[iglob*3];
+    s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+    s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+#endif
+
+  // JC JC here we will need to add GPU support for the new C-PML routines
+
+  // attenuation
+  // use a first-order Taylor expansion of the displacement for the local storage of
+  // stresses at the current time step, to handle attenuation in a consistent way
+#ifdef USE_TEXTURES_FIELDS
+  s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * texfetch_veloc<FORWARD_OR_ADJOINT>(iglob*3);
+  s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * texfetch_veloc<FORWARD_OR_ADJOINT>(iglob*3 + 1);
+  s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * texfetch_veloc<FORWARD_OR_ADJOINT>(iglob*3 + 2);
+#else
+  s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
+  s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
+  s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+#endif
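+  // i.e. s_dummy{x,y,z}_loc_att = displ(t) + deltat * veloc(t), a first-order (Taylor)
+  // estimate of the displacement at t + deltat; the strains entering the attenuation
+  // memory-variable update below are evaluated on this extrapolated field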
+  }
+
+
+  if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+    sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx);
+#else
+    sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+  }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+  __syncthreads();
+
+  if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+    tempx1l = 0.f;
+    tempx2l = 0.f;
+    tempx3l = 0.f;
+
+    tempy1l = 0.f;
+    tempy2l = 0.f;
+    tempy3l = 0.f;
+
+    tempz1l = 0.f;
+    tempz2l = 0.f;
+    tempz3l = 0.f;
+
+    for (l=0;l<NGLLX;l++) {
+      hp1 = sh_hprime_xx[l*NGLLX+I];
+      offset = K*NGLL2+J*NGLLX+l;
+      tempx1l += s_dummyx_loc[offset]*hp1;
+      tempy1l += s_dummyy_loc[offset]*hp1;
+      tempz1l += s_dummyz_loc[offset]*hp1;
+
+      //assumes that hprime_xx = hprime_yy = hprime_zz
+      hp2 = sh_hprime_xx[l*NGLLX+J];
+      offset = K*NGLL2+l*NGLLX+I;
+      tempx2l += s_dummyx_loc[offset]*hp2;
+      tempy2l += s_dummyy_loc[offset]*hp2;
+      tempz2l += s_dummyz_loc[offset]*hp2;
+
+      hp3 = sh_hprime_xx[l*NGLLX+K];
+      offset = l*NGLL2+J*NGLLX+I;
+      tempx3l += s_dummyx_loc[offset]*hp3;
+      tempy3l += s_dummyy_loc[offset]*hp3;
+      tempz3l += s_dummyz_loc[offset]*hp3;
+    }
+
+    // JC JC here we will need to add GPU support for the new C-PML routines
+
+    // attenuation
+    // temporary variables used for fixing attenuation in a consistent way
+    tempx1l_att = 0.f;
+    tempx2l_att = 0.f;
+    tempx3l_att = 0.f;
+
+    tempy1l_att = 0.f;
+    tempy2l_att = 0.f;
+    tempy3l_att = 0.f;
+
+    tempz1l_att = 0.f;
+    tempz2l_att = 0.f;
+    tempz3l_att = 0.f;
+
+    for (l=0;l<NGLLX;l++) {
+      hp1 = sh_hprime_xx[l*NGLLX+I];
+      offset = K*NGLL2+J*NGLLX+l;
+      tempx1l_att += s_dummyx_loc_att[offset]*hp1;
+      tempy1l_att += s_dummyy_loc_att[offset]*hp1;
+      tempz1l_att += s_dummyz_loc_att[offset]*hp1;
+
+      hp2 = sh_hprime_xx[l*NGLLX+J];
+      offset = K*NGLL2+l*NGLLX+I;
+      tempx2l_att += s_dummyx_loc_att[offset]*hp2;
+      tempy2l_att += s_dummyy_loc_att[offset]*hp2;
+      tempz2l_att += s_dummyz_loc_att[offset]*hp2;
+
+      hp3 = sh_hprime_xx[l*NGLLX+K];
+      offset = l*NGLL2+J*NGLLX+I;
+      tempx3l_att += s_dummyx_loc_att[offset]*hp3;
+      tempy3l_att += s_dummyy_loc_att[offset]*hp3;
+      tempz3l_att += s_dummyz_loc_att[offset]*hp3;
+    }
+
+#else
+
+    tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+    tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+    tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+    tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+    tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+    tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+    tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+    tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+    tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+    // JC JC here we will need to add GPU support for the new C-PML routines
+
+    // attenuation
+    // temporary variables used for fixing attenuation in a consistent way
+    tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+                  + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+                  + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+                  + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+                  + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+    tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+                  + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+                  + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+                  + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+                  + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+    tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+                  + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+                  + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+                  + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+                  + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+    tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+                  + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+                  + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+                  + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+                  + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+    tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+                  + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+                  + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+                  + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+                  + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+    tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+                  + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+                  + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+                  + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+                  + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+    tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+                  + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+                  + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+                  + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+                  + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+    tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+                  + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+                  + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+                  + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+                  + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+    tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+                  + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+                  + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+                  + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+                  + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+#endif
+
+// compute derivatives of ux, uy and uz with respect to x, y and z
+    offset = working_element*NGLL3_ALIGN + tx;
+
+    xixl = d_xix[offset];
+    xiyl = d_xiy[offset];
+    xizl = d_xiz[offset];
+    etaxl = d_etax[offset];
+    etayl = d_etay[offset];
+    etazl = d_etaz[offset];
+    gammaxl = d_gammax[offset];
+    gammayl = d_gammay[offset];
+    gammazl = d_gammaz[offset];
+
+    duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+    duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+    duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+
+    duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+    duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+    duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+
+    duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+    duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+    duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+
+    // JC JC here we will need to add GPU support for the new C-PML routines
+
+    // precompute some sums to save computations
+    duxdxl_plus_duydyl = duxdxl + duydyl;
+    duxdxl_plus_duzdzl = duxdxl + duzdzl;
+    duydyl_plus_duzdzl = duydyl + duzdzl;
+    duxdyl_plus_duydxl = duxdyl + duydxl;
+    duzdxl_plus_duxdzl = duzdxl + duxdzl;
+    duzdyl_plus_duydzl = duzdyl + duydzl;
+
+    // JC JC here we will need to add GPU support for the new C-PML routines
+
+    // attenuation
+    // temporary variables used for fixing attenuation in a consistent way
+    duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
+    duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
+    duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
+
+    duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
+    duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
+    duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
+
+    duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
+    duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
+    duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
+
+    // precompute some sums to save computations
+    duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
+    duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
+    duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
+
+    // attenuation
+    // computes deviatoric strain for attenuation and/or kernel calculations
+    realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
+    // local storage: stresses at this current time step
+    epsilondev_xx_loc = duxdxl_att - templ;
+    epsilondev_yy_loc = duydyl_att - templ;
+    epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
+    epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
+    epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
+
+    if(SIMULATION_TYPE == 3) {
+      epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+    }
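+    // note: the block above is the standard trace/deviator split of the
+    //   attenuation-consistent strain, eps_dev_ij = eps_ij - (1/3)*tr(eps)*delta_ij with
+    //   eps_ij = 0.5*(du_i/dx_j + du_j/dx_i), evaluated on the Taylor-extrapolated
+    //   gradients (*_att); the trace/3 itself is stored for sensitivity kernels
+    //   when SIMULATION_TYPE == 3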
+
+    // compute elements with an elastic isotropic rheology
+    kappal = d_kappav[offset];
+    mul = d_muv[offset];
+
+    // attenuation
+    // use unrelaxed parameters if attenuation
+    mul  = mul * one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
+
+    // full anisotropic case, stress calculations
+    if(ANISOTROPY){
+
+      c11 = d_c11store[offset];
+      c12 = d_c12store[offset];
+      c13 = d_c13store[offset];
+      c14 = d_c14store[offset];
+      c15 = d_c15store[offset];
+      c16 = d_c16store[offset];
+      c22 = d_c22store[offset];
+      c23 = d_c23store[offset];
+      c24 = d_c24store[offset];
+      c25 = d_c25store[offset];
+      c26 = d_c26store[offset];
+      c33 = d_c33store[offset];
+      c34 = d_c34store[offset];
+      c35 = d_c35store[offset];
+      c36 = d_c36store[offset];
+      c44 = d_c44store[offset];
+      c45 = d_c45store[offset];
+      c46 = d_c46store[offset];
+      c55 = d_c55store[offset];
+      c56 = d_c56store[offset];
+      c66 = d_c66store[offset];
+
+      sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl +
+                 c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl;
+      sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl +
+                 c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl;
+      sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl +
+                 c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl;
+      sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl +
+                 c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl;
+      sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl +
+                 c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl;
+      sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl +
+                 c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
+
+    }else{
+
+      // isotropic case
+
+      lambdalplus2mul = kappal + 1.33333333333333333333f * mul;  // 4./3. = 1.3333333
+      lambdal = lambdalplus2mul - 2.0f * mul;
+
+      // compute the six components of the stress tensor sigma
+      sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+      sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+      sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+
+      sigma_xy = mul*duxdyl_plus_duydxl;
+      sigma_xz = mul*duzdxl_plus_duxdzl;
+      sigma_yz = mul*duzdyl_plus_duydzl;
+    }
+
+    // attenuation
+    // subtracts memory variables if attenuation
+    compute_element_att_stress(tx,working_element,NSPEC,
+                               R_xx,R_yy,R_xy,R_xz,R_yz,
+                               &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
+
+    jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)-xiyl*(etaxl*gammazl-etazl*gammaxl)+xizl*(etaxl*gammayl-etayl*gammaxl));
+
+    // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
+    sigma_yx = sigma_xy;
+    sigma_zx = sigma_xz;
+    sigma_zy = sigma_yz;
+
+    if( gravity ){
+      //  computes non-symmetric terms for gravity
+      compute_element_gravity(tx,working_element,d_ibool,d_minus_g,d_minus_deriv_gravity,
+                              d_rhostore,wgll_cube,jacobianl,
+                              s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
+                              &sigma_xx,&sigma_yy,&sigma_xz,&sigma_yz,
+                              &rho_s_H1,&rho_s_H2,&rho_s_H3);
+    }
+
+    // form dot product with test vector, non-symmetric form
+    s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
+    s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
+    s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+
+    s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
+    s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
+    s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+
+    s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
+    s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
+    s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+
+  }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+  __syncthreads();
+
+  // JC JC here we will need to add GPU support for the new C-PML routines
+
+  if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+    tempx1l = 0.f;
+    tempy1l = 0.f;
+    tempz1l = 0.f;
+
+    tempx2l = 0.f;
+    tempy2l = 0.f;
+    tempz2l = 0.f;
+
+    tempx3l = 0.f;
+    tempy3l = 0.f;
+    tempz3l = 0.f;
+
+    for (l=0;l<NGLLX;l++) {
+
+      fac1 = d_hprimewgll_xx[I*NGLLX+l];
+      offset = K*NGLL2+J*NGLLX+l;
+      tempx1l += s_tempx1[offset]*fac1;
+      tempy1l += s_tempy1[offset]*fac1;
+      tempz1l += s_tempz1[offset]*fac1;
+
+      // assumes hprimewgll_xx == hprimewgll_yy == hprimewgll_zz
+      fac2 = d_hprimewgll_xx[J*NGLLX+l];
+      offset = K*NGLL2+l*NGLLX+I;
+      tempx2l += s_tempx2[offset]*fac2;
+      tempy2l += s_tempy2[offset]*fac2;
+      tempz2l += s_tempz2[offset]*fac2;
+
+      fac3 = d_hprimewgll_xx[K*NGLLX+l];
+      offset = l*NGLL2+J*NGLLX+I;
+      tempx3l += s_tempx3[offset]*fac3;
+      tempy3l += s_tempy3[offset]*fac3;
+      tempz3l += s_tempz3[offset]*fac3;
+
+    }
+#else
+
+    tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+    tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+    tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+    tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+            + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+            + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+            + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+            + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+    tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+            + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+            + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+            + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+            + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+    tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+            + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+            + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+            + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+            + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+    tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+            + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+            + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+            + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+            + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+    tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+            + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+            + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+            + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+            + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+    tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+            + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+            + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+            + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+            + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+#endif
+
+    fac1 = d_wgllwgll_yz[K*NGLLX+J];
+    fac2 = d_wgllwgll_xz[K*NGLLX+I];
+    fac3 = d_wgllwgll_xy[J*NGLLX+I];
+
+    sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+    sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+    sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+
+    // adds gravity term
+    if( gravity ){
+      sum_terms1 += rho_s_H1;
+      sum_terms2 += rho_s_H2;
+      sum_terms3 += rho_s_H3;
+    }
+
+#ifdef USE_MESH_COLORING_GPU
+    // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+    d_accel[iglob*3]     = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+    d_accel[iglob*3 + 1] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+    d_accel[iglob*3 + 2] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
+#else
+    d_accel[iglob*3]     += sum_terms1;
+    d_accel[iglob*3 + 1] += sum_terms2;
+    d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+
+    // JC JC here we will need to add GPU support for the new C-PML routines
+
+#else // MESH_COLORING
+
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
+
+      // no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+      d_accel[iglob*3]     = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+      d_accel[iglob*3 + 1] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+      d_accel[iglob*3 + 2] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
+#else
+      d_accel[iglob*3]     += sum_terms1;
+      d_accel[iglob*3 + 1] += sum_terms2;
+      d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+
+    }else {
+
+      // for testing purposes only: w/out atomic updates
+      //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
+      //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
+      //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
+      // w/out atomic update
+      //d_accel[iglob*3]     += sum_terms1;
+      //d_accel[iglob*3 + 1] += sum_terms2;
+      //d_accel[iglob*3 + 2] += sum_terms3;
+
+      atomicAdd(&d_accel[iglob*3], sum_terms1);
+      atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+      atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+
+    } // if(use_mesh_coloring_gpu)
+
+#endif // MESH_COLORING
+
+    // attenuation
+    // update memory variables based upon the Runge-Kutta scheme
+    compute_element_att_memory(tx,working_element,NSPEC,
+                              d_muv,
+                              factor_common,alphaval,betaval,gammaval,
+                              R_xx,R_yy,R_xy,R_xz,R_yz,
+                              epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
+                              epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc);
+
+    // save deviatoric strain for Runge-Kutta scheme
+    int ijk_ispec = tx + working_element*NGLL3;
+
+    // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+    epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
+    epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
+    epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
+    epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
+    epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
+
+  } // if(active)
+
+  // JC JC here we will need to add GPU support for the new C-PML routines
+
+} // kernel_2_att_impl()
+
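The <FORWARD_OR_ADJOINT> template parameter selects which texture-bound wavefield texfetch_displ()/texfetch_veloc()/texfetch_accel() read from (1 = forward, 3 = backward/reconstructed, matching the launches below). A minimal sketch of how such a dispatch can be wired for the displacement field; the texture names d_displ_tex / d_b_displ_tex and the exact declarations are assumptions here, the real definitions live elsewhere in the CUDA sources and may differ:

typedef float realw;   // assumption: realw = float in single-precision builds

// hypothetical 1D texture references for the forward and backward displacement fields
texture<realw,cudaTextureType1D,cudaReadModeElementType> d_displ_tex;
texture<realw,cudaTextureType1D,cudaReadModeElementType> d_b_displ_tex;

template<int FORWARD_OR_ADJOINT> __device__ realw texfetch_displ(int x);

// forward wavefield (FORWARD_OR_ADJOINT == 1)
template<> __device__ realw texfetch_displ<1>(int x) { return tex1Dfetch(d_displ_tex, x); }
// backward/reconstructed wavefield (FORWARD_OR_ADJOINT == 3)
template<> __device__ realw texfetch_displ<3>(int x) { return tex1Dfetch(d_b_displ_tex, x); }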
+
+/* ----------------------------------------------------------------------------------------------- */
+
 void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
               int COMPUTE_AND_STORE_STRAIN,
               int ATTENUATION,int ANISOTROPY,
               int* d_ibool,
-              realw* d_xix,
-              realw* d_xiy,
-              realw* d_xiz,
-              realw* d_etax,
-              realw* d_etay,
-              realw* d_etaz,
-              realw* d_gammax,
-              realw* d_gammay,
-              realw* d_gammaz,
+              realw* d_xix,realw* d_xiy,realw* d_xiz,
+              realw* d_etax,realw* d_etay,realw* d_etaz,
+              realw* d_gammax,realw* d_gammay,realw* d_gammaz,
               realw* d_kappav,
               realw* d_muv,
-              realw* d_epsilondev_xx,
-              realw* d_epsilondev_yy,
-              realw* d_epsilondev_xy,
-              realw* d_epsilondev_xz,
-              realw* d_epsilondev_yz,
+              realw* d_epsilondev_xx,realw* d_epsilondev_yy,realw* d_epsilondev_xy,
+              realw* d_epsilondev_xz,realw* d_epsilondev_yz,
               realw* d_epsilon_trace_over_3,
               realw* d_one_minus_sum_beta,
               realw* d_factor_common,
-              realw* d_R_xx,
-              realw* d_R_yy,
-              realw* d_R_xy,
-              realw* d_R_xz,
-              realw* d_R_yz,
-              realw* d_b_epsilondev_xx,
-              realw* d_b_epsilondev_yy,
-              realw* d_b_epsilondev_xy,
-              realw* d_b_epsilondev_xz,
-              realw* d_b_epsilondev_yz,
+              realw* d_R_xx,realw* d_R_yy,realw* d_R_xy,
+              realw* d_R_xz,realw* d_R_yz,
+              realw* d_b_epsilondev_xx,realw* d_b_epsilondev_yy,realw* d_b_epsilondev_xy,
+              realw* d_b_epsilondev_xz,realw* d_b_epsilondev_yz,
               realw* d_b_epsilon_trace_over_3,
-              realw* d_b_R_xx,
-              realw* d_b_R_yy,
-              realw* d_b_R_xy,
-              realw* d_b_R_xz,
-              realw* d_b_R_yz,
-              realw* d_c11store,
-              realw* d_c12store,
-              realw* d_c13store,
-              realw* d_c14store,
-              realw* d_c15store,
-              realw* d_c16store,
-              realw* d_c22store,
-              realw* d_c23store,
-              realw* d_c24store,
-              realw* d_c25store,
-              realw* d_c26store,
-              realw* d_c33store,
-              realw* d_c34store,
-              realw* d_c35store,
-              realw* d_c36store,
-              realw* d_c44store,
-              realw* d_c45store,
-              realw* d_c46store,
-              realw* d_c55store,
-              realw* d_c56store,
-              realw* d_c66store,
+              realw* d_b_R_xx,realw* d_b_R_yy,realw* d_b_R_xy,
+              realw* d_b_R_xz,realw* d_b_R_yz,
+              realw* d_c11store,realw* d_c12store,realw* d_c13store,
+              realw* d_c14store,realw* d_c15store,realw* d_c16store,
+              realw* d_c22store,realw* d_c23store,realw* d_c24store,
+              realw* d_c25store,realw* d_c26store,realw* d_c33store,
+              realw* d_c34store,realw* d_c35store,realw* d_c36store,
+              realw* d_c44store,realw* d_c45store,realw* d_c46store,
+              realw* d_c55store,realw* d_c56store,realw* d_c66store,
               realw* d_rhostore){
 
+  TRACE("\tKernel_2");
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("before kernel Kernel 2");
 #endif
@@ -1487,14 +2790,11 @@
   /* grid_2_x = nb_elem_color; */
   /* nb_elem_color is just how many blocks we are computing now */
 
-  int num_blocks_x = nb_blocks_to_compute;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int blocksize = NGLL3_PADDED;
 
-  int blocksize = NGLL3_PADDED;
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(nb_blocks_to_compute,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -1505,123 +2805,176 @@
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
 
-  Kernel_2_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
-                                                        mp->NGLOB_AB,
-                                                        d_ibool,
-                                                        mp->d_phase_ispec_inner_elastic,
-                                                        mp->num_phase_ispec_elastic,
-                                                        d_iphase,
-                                                        mp->use_mesh_coloring_gpu,
-                    d_deltat,
-                                                        mp->d_displ,mp->d_veloc,mp->d_accel,
-                                                        d_xix, d_xiy, d_xiz,
-                                                        d_etax, d_etay, d_etaz,
-                                                        d_gammax, d_gammay, d_gammaz,
-                                                        mp->d_hprime_xx,
-                                                        mp->d_hprimewgll_xx,
-                                                        mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
-                                                        d_kappav, d_muv,
-                                                        COMPUTE_AND_STORE_STRAIN,
-                                                        d_epsilondev_xx,
-                                                        d_epsilondev_yy,
-                                                        d_epsilondev_xy,
-                                                        d_epsilondev_xz,
-                                                        d_epsilondev_yz,
-                                                        d_epsilon_trace_over_3,
-                                                        mp->simulation_type,
-                                                        ATTENUATION,mp->NSPEC_AB,
-                                                        d_one_minus_sum_beta,
-                                                        d_factor_common,
-                                                        d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
-                                                        mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
-                                                        ANISOTROPY,
-                                                        d_c11store,
-                                                        d_c12store,
-                                                        d_c13store,
-                                                        d_c14store,
-                                                        d_c15store,
-                                                        d_c16store,
-                                                        d_c22store,
-                                                        d_c23store,
-                                                        d_c24store,
-                                                        d_c25store,
-                                                        d_c26store,
-                                                        d_c33store,
-                                                        d_c34store,
-                                                        d_c35store,
-                                                        d_c36store,
-                                                        d_c44store,
-                                                        d_c45store,
-                                                        d_c46store,
-                                                        d_c55store,
-                                                        d_c56store,
-                                                        d_c66store,
-                                                        mp->gravity,
-                                                        mp->d_minus_g,
-                                                        mp->d_minus_deriv_gravity,
-                                                        d_rhostore,
-                                                        mp->d_wgll_cube);
+  if( ATTENUATION ){
+    // debug
+    //printf("Running Kernel_2 with attenuation\n");
 
+    // launches the compute kernel with attenuation
+    // forward wavefields -> FORWARD_OR_ADJOINT == 1
+    Kernel_2_att_impl<1><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+                                                                mp->NGLOB_AB,
+                                                                d_ibool,
+                                                                mp->d_phase_ispec_inner_elastic,
+                                                                mp->num_phase_ispec_elastic,
+                                                                d_iphase,
+                                                                mp->use_mesh_coloring_gpu,
+                                                                d_deltat,
+                                                                mp->d_displ,mp->d_veloc,mp->d_accel,
+                                                                d_xix, d_xiy, d_xiz,
+                                                                d_etax, d_etay, d_etaz,
+                                                                d_gammax, d_gammay, d_gammaz,
+                                                                mp->d_hprime_xx,
+                                                                mp->d_hprimewgll_xx,
+                                                                mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                                d_kappav, d_muv,
+                                                                d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
+                                                                d_epsilondev_xz,d_epsilondev_yz,
+                                                                d_epsilon_trace_over_3,
+                                                                mp->simulation_type,
+                                                                mp->NSPEC_AB,
+                                                                d_one_minus_sum_beta,
+                                                                d_factor_common,
+                                                                d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+                                                                mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+                                                                ANISOTROPY,
+                                                                d_c11store,d_c12store,d_c13store,
+                                                                d_c14store,d_c15store,d_c16store,
+                                                                d_c22store,d_c23store,d_c24store,
+                                                                d_c25store,d_c26store,d_c33store,
+                                                                d_c34store,d_c35store,d_c36store,
+                                                                d_c44store,d_c45store,d_c46store,
+                                                                d_c55store,d_c56store,d_c66store,
+                                                                mp->gravity,
+                                                                mp->d_minus_g,
+                                                                mp->d_minus_deriv_gravity,
+                                                                d_rhostore,
+                                                                mp->d_wgll_cube);
 
-  if(mp->simulation_type == 3) {
-    Kernel_2_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
-                                                           mp->NGLOB_AB,
-                                                           d_ibool,
-                                                           mp->d_phase_ispec_inner_elastic,
-                                                           mp->num_phase_ispec_elastic,
-                                                           d_iphase,
-                                                           mp->use_mesh_coloring_gpu,
-                 d_deltat,
-                                                           mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
-                                                           d_xix, d_xiy, d_xiz,
-                                                           d_etax, d_etay, d_etaz,
-                                                           d_gammax, d_gammay, d_gammaz,
-                                                           mp->d_hprime_xx,
-                                                           mp->d_hprimewgll_xx,
-                                                           mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
-                                                           d_kappav, d_muv,
-                                                           COMPUTE_AND_STORE_STRAIN,
-                                                           d_b_epsilondev_xx,
-                                                           d_b_epsilondev_yy,
-                                                           d_b_epsilondev_xy,
-                                                           d_b_epsilondev_xz,
-                                                           d_b_epsilondev_yz,
-                                                           d_b_epsilon_trace_over_3,
-                                                           mp->simulation_type,
-                                                           ATTENUATION,mp->NSPEC_AB,
-                                                           d_one_minus_sum_beta,
-                                                           d_factor_common,
-                                                           d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
-                                                           mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
-                                                           ANISOTROPY,
-                                                           d_c11store,
-                                                           d_c12store,
-                                                           d_c13store,
-                                                           d_c14store,
-                                                           d_c15store,
-                                                           d_c16store,
-                                                           d_c22store,
-                                                           d_c23store,
-                                                           d_c24store,
-                                                           d_c25store,
-                                                           d_c26store,
-                                                           d_c33store,
-                                                           d_c34store,
-                                                           d_c35store,
-                                                           d_c36store,
-                                                           d_c44store,
-                                                           d_c45store,
-                                                           d_c46store,
-                                                           d_c55store,
-                                                           d_c56store,
-                                                           d_c66store,
-                                                           mp->gravity,
-                                                           mp->d_minus_g,
-                                                           mp->d_minus_deriv_gravity,
-                                                           d_rhostore,
-                                                           mp->d_wgll_cube);
+    if(mp->simulation_type == 3) {
+      // backward/reconstructed wavefields -> FORWARD_OR_ADJOINT == 3
+      Kernel_2_att_impl<3><<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+                                                                   mp->NGLOB_AB,
+                                                                   d_ibool,
+                                                                   mp->d_phase_ispec_inner_elastic,
+                                                                   mp->num_phase_ispec_elastic,
+                                                                   d_iphase,
+                                                                   mp->use_mesh_coloring_gpu,
+                                                                   d_deltat,
+                                                                   mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
+                                                                   d_xix, d_xiy, d_xiz,
+                                                                   d_etax, d_etay, d_etaz,
+                                                                   d_gammax, d_gammay, d_gammaz,
+                                                                   mp->d_hprime_xx,
+                                                                   mp->d_hprimewgll_xx,
+                                                                   mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                                   d_kappav, d_muv,
+                                                                   d_b_epsilondev_xx,d_b_epsilondev_yy,d_b_epsilondev_xy,
+                                                                   d_b_epsilondev_xz,d_b_epsilondev_yz,
+                                                                   d_b_epsilon_trace_over_3,
+                                                                   mp->simulation_type,
+                                                                   mp->NSPEC_AB,
+                                                                   d_one_minus_sum_beta,
+                                                                   d_factor_common,
+                                                                   d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
+                                                                   mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
+                                                                   ANISOTROPY,
+                                                                   d_c11store,d_c12store,d_c13store,
+                                                                   d_c14store,d_c15store,d_c16store,
+                                                                   d_c22store,d_c23store,d_c24store,
+                                                                   d_c25store,d_c26store,d_c33store,
+                                                                   d_c34store,d_c35store,d_c36store,
+                                                                   d_c44store,d_c45store,d_c46store,
+                                                                   d_c55store,d_c56store,d_c66store,
+                                                                   mp->gravity,
+                                                                   mp->d_minus_g,
+                                                                   mp->d_minus_deriv_gravity,
+                                                                   d_rhostore,
+                                                                   mp->d_wgll_cube);
+    }
+  }else{
+    // debug
+    //printf("Running Kernel_2 without attenuation\n");
+
+    // launches the compute kernel without attenuation
+    // forward wavefields -> FORWARD_OR_ADJOINT == 1
+    Kernel_2_noatt_impl<1><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+                                                                  mp->NGLOB_AB,
+                                                                  d_ibool,
+                                                                  mp->d_phase_ispec_inner_elastic,mp->num_phase_ispec_elastic,
+                                                                  d_iphase,
+                                                                  mp->use_mesh_coloring_gpu,
+                                                                  mp->d_displ,mp->d_veloc,mp->d_accel,
+                                                                  d_xix, d_xiy, d_xiz,
+                                                                  d_etax, d_etay, d_etaz,
+                                                                  d_gammax, d_gammay, d_gammaz,
+                                                                  mp->d_hprime_xx,
+                                                                  mp->d_hprimewgll_xx,
+                                                                  mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                                  d_kappav, d_muv,
+                                                                  COMPUTE_AND_STORE_STRAIN,
+                                                                  d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
+                                                                  d_epsilondev_xz,d_epsilondev_yz,
+                                                                  d_epsilon_trace_over_3,
+                                                                  mp->simulation_type,
+                                                                  mp->NSPEC_AB,
+                                                                  d_one_minus_sum_beta,d_factor_common,
+                                                                  d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+                                                                  mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+                                                                  ANISOTROPY,
+                                                                  d_c11store,d_c12store,d_c13store,
+                                                                  d_c14store,d_c15store,d_c16store,
+                                                                  d_c22store,d_c23store,d_c24store,
+                                                                  d_c25store,d_c26store,d_c33store,
+                                                                  d_c34store,d_c35store,d_c36store,
+                                                                  d_c44store,d_c45store,d_c46store,
+                                                                  d_c55store,d_c56store,d_c66store,
+                                                                  mp->gravity,
+                                                                  mp->d_minus_g,
+                                                                  mp->d_minus_deriv_gravity,
+                                                                  d_rhostore,
+                                                                  mp->d_wgll_cube );
+
+    // backward/reconstructed wavefield
+    if(mp->simulation_type == 3) {
+      // backward/reconstructed wavefields -> FORWARD_OR_ADJOINT == 3
+      Kernel_2_noatt_impl<3><<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+                                                                     mp->NGLOB_AB,
+                                                                     d_ibool,
+                                                                     mp->d_phase_ispec_inner_elastic,mp->num_phase_ispec_elastic,
+                                                                     d_iphase,
+                                                                     mp->use_mesh_coloring_gpu,
+                                                                     mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
+                                                                     d_xix, d_xiy, d_xiz,
+                                                                     d_etax, d_etay, d_etaz,
+                                                                     d_gammax, d_gammay, d_gammaz,
+                                                                     mp->d_hprime_xx,
+                                                                     mp->d_hprimewgll_xx,
+                                                                     mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                                     d_kappav, d_muv,
+                                                                     COMPUTE_AND_STORE_STRAIN,
+                                                                     d_b_epsilondev_xx,d_b_epsilondev_yy,d_b_epsilondev_xy,
+                                                                     d_b_epsilondev_xz,d_b_epsilondev_yz,
+                                                                     d_b_epsilon_trace_over_3,
+                                                                     mp->simulation_type,
+                                                                     mp->NSPEC_AB,
+                                                                     d_one_minus_sum_beta,d_factor_common,
+                                                                     d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
+                                                                     mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
+                                                                     ANISOTROPY,
+                                                                     d_c11store,d_c12store,d_c13store,
+                                                                     d_c14store,d_c15store,d_c16store,
+                                                                     d_c22store,d_c23store,d_c24store,
+                                                                     d_c25store,d_c26store,d_c33store,
+                                                                     d_c34store,d_c35store,d_c36store,
+                                                                     d_c44store,d_c45store,d_c46store,
+                                                                     d_c55store,d_c56store,d_c66store,
+                                                                     mp->gravity,
+                                                                     mp->d_minus_g,
+                                                                     mp->d_minus_deriv_gravity,
+                                                                     d_rhostore,
+                                                                     mp->d_wgll_cube );
+    }
   }
-
   // cudaEventRecord( stop, 0 );
   // cudaEventSynchronize( stop );
   // cudaEventElapsedTime( &time, start, stop );
@@ -1629,10 +2982,10 @@
   // cudaEventDestroy( stop );
   // printf("Kernel2 Execution Time: %f ms\n",time);
 
-  /* cudaThreadSynchronize(); */
-  /* LOG("Kernel 2 finished"); */
+  // cudaThreadSynchronize();
+  // LOG("Kernel 2 finished");
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("Kernel_2_impl ");
+  exit_on_cuda_error("Kernel_2_impl");
 #endif
 }
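
The two launches of Kernel_2_noatt_impl above differ only in the template argument (FORWARD_OR_ADJOINT == 1 for the forward/adjoint wavefield, == 3 for the backward/reconstructed one) and in which wavefield arrays are passed (d_displ/d_veloc/d_accel versus their d_b_ counterparts); both run on the same compute stream. A condensed sketch of that dispatch pattern, using a hypothetical stand-in kernel whose argument list is trimmed to the fields that differ:

// sketch only: a hypothetical stand-in for Kernel_2_noatt_impl, reduced to the
// arguments that differ between the forward and the backward launch
template <int FORWARD_OR_ADJOINT>
__global__ void forces_kernel_impl(int nb_blocks_to_compute,
                                   realw* displ, realw* veloc, realw* accel) {
  // ... element-level force computation on the selected wavefield ...
}

static void launch_forces(Mesh* mp, dim3 grid, dim3 threads, int nb_blocks_to_compute) {
  // forward/adjoint wavefields -> FORWARD_OR_ADJOINT == 1
  forces_kernel_impl<1><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
                                                               mp->d_displ, mp->d_veloc, mp->d_accel);
  if (mp->simulation_type == 3) {
    // backward/reconstructed wavefields -> FORWARD_OR_ADJOINT == 3
    forces_kernel_impl<3><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
                                                                 mp->d_b_displ, mp->d_b_veloc, mp->d_b_accel);
  }
}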
 
@@ -1641,21 +2994,21 @@
 
 extern "C"
 void FC_FUNC_(compute_forces_viscoelastic_cuda,
-              COMPUTE_FORCES_VISCOELASTIC_CUDA)(long* Mesh_pointer_f,
-                                           int* iphase,
-             realw* deltat,
-                                           int* nspec_outer_elastic,
-                                           int* nspec_inner_elastic,
-                                           int* COMPUTE_AND_STORE_STRAIN,
-                                           int* ATTENUATION,
-                                           int* ANISOTROPY) {
+              COMPUTE_FORCES_VISCOELASTIC_CUDA)(long* Mesh_pointer,
+                                                int* iphase,
+                                                realw* deltat,
+                                                int* nspec_outer_elastic,
+                                                int* nspec_inner_elastic,
+                                                int* COMPUTE_AND_STORE_STRAIN,
+                                                int* ATTENUATION,
+                                                int* ANISOTROPY) {
 
-  TRACE("compute_forces_viscoelastic_cuda");
+  TRACE("\tcompute_forces_viscoelastic_cuda");
   // EPIK_TRACER("compute_forces_viscoelastic_cuda");
   //printf("Running compute_forces\n");
   //double start_time = get_time();
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
   int num_elements;
 
@@ -1669,14 +3022,12 @@
 
   // mesh coloring
   if( mp->use_mesh_coloring_gpu ){
-
     // note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements
     //         and followed by acoustic ones.
     //         elastic elements also start with outer than inner element ordering
-
     int nb_colors,nb_blocks_to_compute;
     int istart;
-    int color_offset,color_offset_nonpadded,color_offset_nonpadded_att2;
+    int offset,offset_nonpadded,offset_nonpadded_att2;
 
     // sets up color loop
     if( *iphase == 1 ){
@@ -1685,18 +3036,18 @@
       istart = 0;
 
       // array offsets
-      color_offset = 0;
-      color_offset_nonpadded = 0;
-      color_offset_nonpadded_att2 = 0;
+      offset = 0;
+      offset_nonpadded = 0;
+      offset_nonpadded_att2 = 0;
     }else{
       // inner elements (start after outer elements)
       nb_colors = mp->num_colors_outer_elastic + mp->num_colors_inner_elastic;
       istart = mp->num_colors_outer_elastic;
 
       // array offsets
-      color_offset = (*nspec_outer_elastic) * NGLL3_PADDED;
-      color_offset_nonpadded = (*nspec_outer_elastic) * NGLL3;
-      color_offset_nonpadded_att2 = (*nspec_outer_elastic) * NGLL3 * N_SLS;
+      offset = (*nspec_outer_elastic) * NGLL3_PADDED;
+      offset_nonpadded = (*nspec_outer_elastic) * NGLL3;
+      offset_nonpadded_att2 = (*nspec_outer_elastic) * NGLL3 * N_SLS;
     }
 
     // loops over colors
@@ -1713,145 +3064,75 @@
       Kernel_2(nb_blocks_to_compute,mp,*iphase,*deltat,
                *COMPUTE_AND_STORE_STRAIN,
                *ATTENUATION,*ANISOTROPY,
-               mp->d_ibool + color_offset_nonpadded,
-               mp->d_xix + color_offset,
-               mp->d_xiy + color_offset,
-               mp->d_xiz + color_offset,
-               mp->d_etax + color_offset,
-               mp->d_etay + color_offset,
-               mp->d_etaz + color_offset,
-               mp->d_gammax + color_offset,
-               mp->d_gammay + color_offset,
-               mp->d_gammaz + color_offset,
-               mp->d_kappav + color_offset,
-               mp->d_muv + color_offset,
-               mp->d_epsilondev_xx + color_offset_nonpadded,
-               mp->d_epsilondev_yy + color_offset_nonpadded,
-               mp->d_epsilondev_xy + color_offset_nonpadded,
-               mp->d_epsilondev_xz + color_offset_nonpadded,
-               mp->d_epsilondev_yz + color_offset_nonpadded,
-               mp->d_epsilon_trace_over_3 + color_offset_nonpadded,
-               mp->d_one_minus_sum_beta + color_offset_nonpadded,
-               mp->d_factor_common + color_offset_nonpadded_att2,
-               mp->d_R_xx + color_offset_nonpadded,
-               mp->d_R_yy + color_offset_nonpadded,
-               mp->d_R_xy + color_offset_nonpadded,
-               mp->d_R_xz + color_offset_nonpadded,
-               mp->d_R_yz + color_offset_nonpadded,
-               mp->d_b_epsilondev_xx + color_offset_nonpadded,
-               mp->d_b_epsilondev_yy + color_offset_nonpadded,
-               mp->d_b_epsilondev_xy + color_offset_nonpadded,
-               mp->d_b_epsilondev_xz + color_offset_nonpadded,
-               mp->d_b_epsilondev_yz + color_offset_nonpadded,
-               mp->d_b_epsilon_trace_over_3 + color_offset_nonpadded,
-               mp->d_b_R_xx + color_offset_nonpadded,
-               mp->d_b_R_yy + color_offset_nonpadded,
-               mp->d_b_R_xy + color_offset_nonpadded,
-               mp->d_b_R_xz + color_offset_nonpadded,
-               mp->d_b_R_yz + color_offset_nonpadded,
-               mp->d_c11store + color_offset,
-               mp->d_c12store + color_offset,
-               mp->d_c13store + color_offset,
-               mp->d_c14store + color_offset,
-               mp->d_c15store + color_offset,
-               mp->d_c16store + color_offset,
-               mp->d_c22store + color_offset,
-               mp->d_c23store + color_offset,
-               mp->d_c24store + color_offset,
-               mp->d_c25store + color_offset,
-               mp->d_c26store + color_offset,
-               mp->d_c33store + color_offset,
-               mp->d_c34store + color_offset,
-               mp->d_c35store + color_offset,
-               mp->d_c36store + color_offset,
-               mp->d_c44store + color_offset,
-               mp->d_c45store + color_offset,
-               mp->d_c46store + color_offset,
-               mp->d_c55store + color_offset,
-               mp->d_c56store + color_offset,
-               mp->d_c66store + color_offset,
-               mp->d_rhostore + color_offset);
+               mp->d_ibool + offset_nonpadded,
+               mp->d_xix + offset,mp->d_xiy + offset,mp->d_xiz + offset,
+               mp->d_etax + offset,mp->d_etay + offset,mp->d_etaz + offset,
+               mp->d_gammax + offset,mp->d_gammay + offset,mp->d_gammaz + offset,
+               mp->d_kappav + offset,
+               mp->d_muv + offset,
+               mp->d_epsilondev_xx + offset_nonpadded,mp->d_epsilondev_yy + offset_nonpadded,mp->d_epsilondev_xy + offset_nonpadded,
+               mp->d_epsilondev_xz + offset_nonpadded,mp->d_epsilondev_yz + offset_nonpadded,
+               mp->d_epsilon_trace_over_3 + offset_nonpadded,
+               mp->d_one_minus_sum_beta + offset_nonpadded,
+               mp->d_factor_common + offset_nonpadded_att2,
+               mp->d_R_xx + offset_nonpadded,mp->d_R_yy + offset_nonpadded,mp->d_R_xy + offset_nonpadded,
+               mp->d_R_xz + offset_nonpadded,mp->d_R_yz + offset_nonpadded,
+               mp->d_b_epsilondev_xx + offset_nonpadded,mp->d_b_epsilondev_yy + offset_nonpadded,mp->d_b_epsilondev_xy + offset_nonpadded,
+               mp->d_b_epsilondev_xz + offset_nonpadded,mp->d_b_epsilondev_yz + offset_nonpadded,
+               mp->d_b_epsilon_trace_over_3 + offset_nonpadded,
+               mp->d_b_R_xx + offset_nonpadded,mp->d_b_R_yy + offset_nonpadded,mp->d_b_R_xy + offset_nonpadded,
+               mp->d_b_R_xz + offset_nonpadded,mp->d_b_R_yz + offset_nonpadded,
+               mp->d_c11store + offset,mp->d_c12store + offset,mp->d_c13store + offset,
+               mp->d_c14store + offset,mp->d_c15store + offset,mp->d_c16store + offset,
+               mp->d_c22store + offset,mp->d_c23store + offset,mp->d_c24store + offset,
+               mp->d_c25store + offset,mp->d_c26store + offset,mp->d_c33store + offset,
+               mp->d_c34store + offset,mp->d_c35store + offset,mp->d_c36store + offset,
+               mp->d_c44store + offset,mp->d_c45store + offset,mp->d_c46store + offset,
+               mp->d_c55store + offset,mp->d_c56store + offset,mp->d_c66store + offset,
+               mp->d_rhostore + offset);
 
       // for padded and aligned arrays
-      color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+      offset += nb_blocks_to_compute * NGLL3_PADDED;
       // for no-aligned arrays
-      color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+      offset_nonpadded += nb_blocks_to_compute * NGLL3;
       // for factor_common array
-      color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
+      offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
 
-      //daniel: we use the same stream, so kernels are executed one after the other
-      // synchronizes in case we run on only 1 process to avoid race-conditions
-      //if( mp->NPROC == 1 ){
-      //  // Wait until previous compute stream finishes.
-      //  cudaStreamSynchronize(mp->compute_stream);
-      //}
+      //note: all kernels are launched on the same stream, so they execute one after the other;
+      //      no extra synchronization is needed here to avoid race conditions, even when running on only 1 process
 
     }
 
   }else{
-
     // no mesh coloring: uses atomic updates
-
     Kernel_2(num_elements,mp,*iphase,*deltat,
              *COMPUTE_AND_STORE_STRAIN,
              *ATTENUATION,*ANISOTROPY,
              mp->d_ibool,
-             mp->d_xix,
-             mp->d_xiy,
-             mp->d_xiz,
-             mp->d_etax,
-             mp->d_etay,
-             mp->d_etaz,
-             mp->d_gammax,
-             mp->d_gammay,
-             mp->d_gammaz,
+             mp->d_xix,mp->d_xiy,mp->d_xiz,
+             mp->d_etax,mp->d_etay,mp->d_etaz,
+             mp->d_gammax,mp->d_gammay,mp->d_gammaz,
              mp->d_kappav,
              mp->d_muv,
-             mp->d_epsilondev_xx,
-             mp->d_epsilondev_yy,
-             mp->d_epsilondev_xy,
-             mp->d_epsilondev_xz,
-             mp->d_epsilondev_yz,
+             mp->d_epsilondev_xx,mp->d_epsilondev_yy,mp->d_epsilondev_xy,
+             mp->d_epsilondev_xz,mp->d_epsilondev_yz,
              mp->d_epsilon_trace_over_3,
              mp->d_one_minus_sum_beta,
              mp->d_factor_common,
-             mp->d_R_xx,
-             mp->d_R_yy,
-             mp->d_R_xy,
-             mp->d_R_xz,
-             mp->d_R_yz,
-             mp->d_b_epsilondev_xx,
-             mp->d_b_epsilondev_yy,
-             mp->d_b_epsilondev_xy,
-             mp->d_b_epsilondev_xz,
-             mp->d_b_epsilondev_yz,
+             mp->d_R_xx,mp->d_R_yy,mp->d_R_xy,
+             mp->d_R_xz,mp->d_R_yz,
+             mp->d_b_epsilondev_xx,mp->d_b_epsilondev_yy,mp->d_b_epsilondev_xy,
+             mp->d_b_epsilondev_xz,mp->d_b_epsilondev_yz,
              mp->d_b_epsilon_trace_over_3,
-             mp->d_b_R_xx,
-             mp->d_b_R_yy,
-             mp->d_b_R_xy,
-             mp->d_b_R_xz,
-             mp->d_b_R_yz,
-             mp->d_c11store,
-             mp->d_c12store,
-             mp->d_c13store,
-             mp->d_c14store,
-             mp->d_c15store,
-             mp->d_c16store,
-             mp->d_c22store,
-             mp->d_c23store,
-             mp->d_c24store,
-             mp->d_c25store,
-             mp->d_c26store,
-             mp->d_c33store,
-             mp->d_c34store,
-             mp->d_c35store,
-             mp->d_c36store,
-             mp->d_c44store,
-             mp->d_c45store,
-             mp->d_c46store,
-             mp->d_c55store,
-             mp->d_c56store,
-             mp->d_c66store,
+             mp->d_b_R_xx,mp->d_b_R_yy,mp->d_b_R_xy,
+             mp->d_b_R_xz,mp->d_b_R_yz,
+             mp->d_c11store,mp->d_c12store,mp->d_c13store,
+             mp->d_c14store,mp->d_c15store,mp->d_c16store,
+             mp->d_c22store,mp->d_c23store,mp->d_c24store,
+             mp->d_c25store,mp->d_c26store,mp->d_c33store,
+             mp->d_c34store,mp->d_c35store,mp->d_c36store,
+             mp->d_c44store,mp->d_c45store,mp->d_c46store,
+             mp->d_c55store,mp->d_c56store,mp->d_c66store,
              mp->d_rhostore);
   }
 }
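
When mesh coloring is enabled, each color is launched as one Kernel_2 call and the array base pointers are advanced color by color: padded arrays (xix, the cijkl stores, ...) advance by NGLL3_PADDED entries per element, non-padded arrays by NGLL3, and factor_common by NGLL3 * N_SLS. A minimal sketch of that bookkeeping (num_elem_colors_elastic is a hypothetical accessor standing in for however the per-color element counts are obtained):

// sketch of the per-color offset bookkeeping used in the coloring branch above
int offset = 0;                  // padded, aligned arrays: NGLL3_PADDED entries per element
int offset_nonpadded = 0;        // non-padded arrays: NGLL3 entries per element
int offset_nonpadded_att2 = 0;   // factor_common: NGLL3 * N_SLS entries per element

for (int icolor = istart; icolor < nb_colors; icolor++) {
  int nb_blocks_to_compute = num_elem_colors_elastic[icolor];  // hypothetical per-color element count

  // ... launch Kernel_2 with every array base pointer shifted by the matching offset ...

  offset                += nb_blocks_to_compute * NGLL3_PADDED;
  offset_nonpadded      += nb_blocks_to_compute * NGLL3;
  offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
}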
@@ -1860,13 +3141,13 @@
 
 extern "C"
 void FC_FUNC_(sync_copy_from_device,
-              SYNC_copy_FROM_DEVICE)(long* Mesh_pointer_f,
+              SYNC_copy_FROM_DEVICE)(long* Mesh_pointer,
                                      int* iphase,
                                      realw* send_buffer) {
 
   TRACE("sync_copy_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
   // Wait until async-memcpy of outer elements is finished and start MPI.
   if( *iphase != 2 ){ exit_on_cuda_error("sync_copy_from_device must be called for iphase == 2"); }
@@ -1877,8 +3158,7 @@
 
     // There have been problems using the pinned-memory with MPI, so
     // we copy the buffer into a non-pinned region.
-    memcpy(send_buffer,mp->h_send_accel_buffer,
-           mp->size_mpi_buffer*sizeof(float));
+    memcpy(send_buffer,mp->h_send_accel_buffer,mp->size_mpi_buffer*sizeof(float));
   }
   // memory copy is now finished, so non-blocking MPI send can proceed
 }
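
The comment above is the reason for the extra copy: the device-to-host transfer of the outer-element acceleration lands in a pinned buffer (mp->h_send_accel_buffer), and the data is then copied into an ordinary allocation before the non-blocking MPI send. A sketch of the overall overlap pattern; d_send_accel_buffer and copy_stream are assumed names, the rest mirrors what is visible here:

// illustrative overlap pattern (assumed names: d_send_accel_buffer, copy_stream)
cudaMemcpyAsync(mp->h_send_accel_buffer, d_send_accel_buffer,      // device -> pinned host buffer
                mp->size_mpi_buffer*sizeof(float),
                cudaMemcpyDeviceToHost, copy_stream);
// ... inner-element kernels keep running on mp->compute_stream in the meantime ...
cudaStreamSynchronize(copy_stream);                                // wait for the async transfer
memcpy(send_buffer, mp->h_send_accel_buffer,                       // pinned -> regular buffer for MPI
       mp->size_mpi_buffer*sizeof(float));
// the non-blocking MPI send on send_buffer is then issued from the Fortran side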

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_kernels_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_kernels_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -45,20 +45,119 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+__global__ void compute_kernels_ani_cudakernel(int* ispec_is_elastic,
+                                           int* ibool,
+                                           realw* accel,
+                                           realw* b_displ,
+                                           realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                           realw* epsilondev_xz,realw* epsilondev_yz,
+                                           realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                           realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                           realw* rho_kl,
+                                           realw deltat,
+                                           realw* cijkl_kl,
+                                           realw* epsilon_trace_over_3,
+                                           realw* b_epsilon_trace_over_3,
+                                           int NSPEC_AB) {
+
+  int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+  int ijk = threadIdx.x;
+  int ijk_ispec = ijk + NGLL3*ispec;
+  int ijk21_ispec = ijk + 21*NGLL3*ispec;
+
+  realw prod[21];
+  realw eps[6];
+  realw b_eps[6];
+  realw epsdev[6];
+  realw b_epsdev[6];
+  realw eps_trace_over_3,b_eps_trace_over_3;
+  int i,j;
+
+  // handles case when there is 1 extra block (due to rectangular grid)
+  if(ispec < NSPEC_AB) {
+
+    // elastic elements only
+    if( ispec_is_elastic[ispec] ) {
+      int iglob = ibool[ijk_ispec] - 1 ;
+
+      // anisotropic kernels:
+      // density kernel
+      rho_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_displ[3*iglob]+
+                                     accel[3*iglob+1]*b_displ[3*iglob+1]+
+                                     accel[3*iglob+2]*b_displ[3*iglob+2]);
+
+
+      // anisotropic kernel
+      epsdev[0] = epsilondev_xx[ijk_ispec];
+      epsdev[1] = epsilondev_yy[ijk_ispec];
+      epsdev[2] = epsilondev_xy[ijk_ispec];
+      epsdev[3] = epsilondev_xz[ijk_ispec];
+      epsdev[4] = epsilondev_yz[ijk_ispec];
+
+      b_epsdev[0] = b_epsilondev_xx[ijk_ispec];
+      b_epsdev[1] = b_epsilondev_yy[ijk_ispec];
+      b_epsdev[2] = b_epsilondev_xy[ijk_ispec];
+      b_epsdev[3] = b_epsilondev_xz[ijk_ispec];
+      b_epsdev[4] = b_epsilondev_yz[ijk_ispec];
+
+      eps_trace_over_3 = epsilon_trace_over_3[ijk_ispec];
+      b_eps_trace_over_3 = b_epsilon_trace_over_3[ijk_ispec];
+
+      //! build the local strain tensor
+      //! for the adjoint field and the regular backward field
+      //! eps11 and eps22
+      eps[0] = epsdev[0] + eps_trace_over_3;
+      eps[1] = epsdev[1] + eps_trace_over_3;
+      //! eps33
+      eps[2] = -(eps[0]+eps[1])+3*eps_trace_over_3;
+      //! eps23
+      eps[3] = epsdev[4];
+      //! eps13
+      eps[4] = epsdev[3];
+      //! eps12
+      eps[5] = epsdev[2];
+
+      // backward arrays
+      b_eps[0] = b_epsdev[0] + b_eps_trace_over_3;
+      b_eps[1] = b_epsdev[1] + b_eps_trace_over_3;
+      b_eps[2] = -(b_eps[0]+b_eps[1])+3*b_eps_trace_over_3;
+      b_eps[3] = b_epsdev[4];
+      b_eps[4] = b_epsdev[3];
+      b_eps[5] = b_epsdev[2];
+
+      //! compute the 21 strain products without assuming eps[i]*b_eps[j] = eps[j]*b_eps[i]
+      int p = 0;
+      for( i=0; i<6; i++){
+        for( j=i; j<6; j++){
+          prod[p] = eps[i] * b_eps[j];
+          if( j > i ){
+            prod[p] = prod[p] + eps[j]*b_eps[i];
+            if( j > 2 && i < 3 ){ prod[p] = prod[p]*2; }
+          }
+          if(i > 2 ){ prod[p] = prod[p]*4; }
+          p++;
+        }
+      }
+
+      // all 21 anisotropic coefficients
+      for( i=0; i<21; i++){
+        cijkl_kl[i+ijk21_ispec] += deltat * prod[i];
+      }
+
+    }
+  }
+}
+
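
The double loop above fills prod[0..20] with the upper triangle of eps_i * b_eps_j, symmetrizing off-diagonal pairs and applying the extra factors of 2 and 4 on the shear components. A host-side reference of the same computation, useful for unit-testing the kernel (float stands in for realw):

// host-side reference of the 21 strain products computed in the kernel above;
// index convention as in the kernel: 0..2 = xx, yy, zz and 3..5 = yz, xz, xy
static void strain_products(const float eps[6], const float b_eps[6], float prod[21]) {
  int p = 0;
  for (int i = 0; i < 6; i++) {
    for (int j = i; j < 6; j++) {
      prod[p] = eps[i] * b_eps[j];
      if (j > i) {
        prod[p] += eps[j] * b_eps[i];           // symmetrize the off-diagonal pair
        if (j > 2 && i < 3) prod[p] *= 2.0f;    // mixed normal/shear pair
      }
      if (i > 2) prod[p] *= 4.0f;               // shear/shear pair
      p++;
    }
  }
}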
+/* ----------------------------------------------------------------------------------------------- */
+
 __global__ void compute_kernels_cudakernel(int* ispec_is_elastic,
                                            int* ibool,
                                            realw* accel,
                                            realw* b_displ,
-                                           realw* epsilondev_xx,
-                                           realw* epsilondev_yy,
-                                           realw* epsilondev_xy,
-                                           realw* epsilondev_xz,
-                                           realw* epsilondev_yz,
-                                           realw* b_epsilondev_xx,
-                                           realw* b_epsilondev_yy,
-                                           realw* b_epsilondev_xy,
-                                           realw* b_epsilondev_xz,
-                                           realw* b_epsilondev_yz,
+                                           realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                           realw* epsilondev_xz,realw* epsilondev_yz,
+                                           realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                           realw* b_epsilondev_xz,realw* b_epsilondev_yz,
                                            realw* rho_kl,
                                            realw deltat,
                                            realw* mu_kl,
@@ -68,15 +167,14 @@
                                            int NSPEC_AB) {
 
   int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+  int ijk = threadIdx.x;
+  int ijk_ispec = ijk + NGLL3*ispec;
 
   // handles case when there is 1 extra block (due to rectangular grid)
   if(ispec < NSPEC_AB) {
 
     // elastic elements only
     if( ispec_is_elastic[ispec] ) {
-
-      int ijk = threadIdx.x;
-      int ijk_ispec = ijk + NGLL3*ispec;
       int iglob = ibool[ijk_ispec] - 1 ;
 
       // isotropic kernels:
@@ -109,43 +207,62 @@
 void FC_FUNC_(compute_kernels_elastic_cuda,
               COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
                                             realw* deltat_f) {
-TRACE("compute_kernels_elastic_cuda");
 
+  TRACE("compute_kernels_elastic_cuda");
+
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
   realw deltat = *deltat_f;
 
-  int num_blocks_x = mp->NSPEC_AB;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->NSPEC_AB,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  compute_kernels_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
-                                               mp->d_accel, mp->d_b_displ,
-                                               mp->d_epsilondev_xx,
-                                               mp->d_epsilondev_yy,
-                                               mp->d_epsilondev_xy,
-                                               mp->d_epsilondev_xz,
-                                               mp->d_epsilondev_yz,
-                                               mp->d_b_epsilondev_xx,
-                                               mp->d_b_epsilondev_yy,
-                                               mp->d_b_epsilondev_xy,
-                                               mp->d_b_epsilondev_xz,
-                                               mp->d_b_epsilondev_yz,
-                                               mp->d_rho_kl,
-                                               deltat,
-                                               mp->d_mu_kl,
-                                               mp->d_kappa_kl,
-                                               mp->d_epsilon_trace_over_3,
-                                               mp->d_b_epsilon_trace_over_3,
-                                               mp->NSPEC_AB);
+  if( mp->anisotropic_kl ){
+    compute_kernels_ani_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
+                                                     mp->d_accel, mp->d_b_displ,
+                                                     mp->d_epsilondev_xx,
+                                                     mp->d_epsilondev_yy,
+                                                     mp->d_epsilondev_xy,
+                                                     mp->d_epsilondev_xz,
+                                                     mp->d_epsilondev_yz,
+                                                     mp->d_b_epsilondev_xx,
+                                                     mp->d_b_epsilondev_yy,
+                                                     mp->d_b_epsilondev_xy,
+                                                     mp->d_b_epsilondev_xz,
+                                                     mp->d_b_epsilondev_yz,
+                                                     mp->d_rho_kl,
+                                                     deltat,
+                                                     mp->d_cijkl_kl,
+                                                     mp->d_epsilon_trace_over_3,
+                                                     mp->d_b_epsilon_trace_over_3,
+                                                     mp->NSPEC_AB);
 
+  }else{
+    compute_kernels_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
+                                                 mp->d_accel, mp->d_b_displ,
+                                                 mp->d_epsilondev_xx,
+                                                 mp->d_epsilondev_yy,
+                                                 mp->d_epsilondev_xy,
+                                                 mp->d_epsilondev_xz,
+                                                 mp->d_epsilondev_yz,
+                                                 mp->d_b_epsilondev_xx,
+                                                 mp->d_b_epsilondev_yy,
+                                                 mp->d_b_epsilondev_xy,
+                                                 mp->d_b_epsilondev_xz,
+                                                 mp->d_b_epsilondev_yz,
+                                                 mp->d_rho_kl,
+                                                 deltat,
+                                                 mp->d_mu_kl,
+                                                 mp->d_kappa_kl,
+                                                 mp->d_epsilon_trace_over_3,
+                                                 mp->d_b_epsilon_trace_over_3,
+                                                 mp->NSPEC_AB);
+  }
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_kernels_elastic_cuda");
 #endif
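
The launch sites in this commit replace the repeated 65535-block halving loop (still visible in the removed lines above) with a call to get_blocks_xy(). A sketch of an equivalent helper, mirroring the removed loop; the actual helper is defined elsewhere in the CUDA sources of the package:

#include <math.h>

// sketch of a grid-sizing helper equivalent to the removed while-loop:
// splits num_blocks over a 2D grid so that neither dimension exceeds 65535
static void get_blocks_xy(int num_blocks, int* num_blocks_x, int* num_blocks_y) {
  *num_blocks_x = num_blocks;
  *num_blocks_y = 1;
  while (*num_blocks_x > 65535) {
    *num_blocks_x = (int) ceil(*num_blocks_x * 0.5f);   // halve the x dimension ...
    *num_blocks_y = *num_blocks_y * 2;                  // ... and double the y dimension
  }
}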
@@ -171,12 +288,13 @@
                                                            realw deltat,
                                                            int num_free_surface_faces) {
   int iface = blockIdx.x + blockIdx.y*gridDim.x;
+  int igll = threadIdx.x;
+  int ipoin = igll + NGLL2*iface;
 
   if(iface < num_free_surface_faces) {
 
     int ispec = free_surface_ispec[iface]-1;
-    int igll = threadIdx.x;
-    int ipoin = igll + NGLL2*iface;
+
     int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1 ;
     int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
     int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
@@ -206,20 +324,18 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
-             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
+  // checks if anything to do
+  if( mp->num_free_surface_faces == 0 ) return;
 
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->num_free_surface_faces,&num_blocks_x,&num_blocks_y);
 
-  int num_blocks_x = mp->num_free_surface_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(NGLL2,1,1);
 
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+                          NDIM*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),81000);
+
   compute_kernels_strength_noise_cuda_kernel<<<grid,threads>>>(mp->d_displ,
                                                                mp->d_free_surface_ispec,
                                                                mp->d_free_surface_ijk,
@@ -250,15 +366,9 @@
                                         realw* scalar_field,
                                         realw* vector_field_element,
                                         realw* d_hprime_xx,
-                                        realw* d_xix,
-                                        realw* d_xiy,
-                                        realw* d_xiz,
-                                        realw* d_etax,
-                                        realw* d_etay,
-                                        realw* d_etaz,
-                                        realw* d_gammax,
-                                        realw* d_gammay,
-                                        realw* d_gammaz,
+                                        realw* d_xix,realw* d_xiy,realw* d_xiz,
+                                        realw* d_etax,realw* d_etay,realw* d_etaz,
+                                        realw* d_gammax,realw* d_gammay,realw* d_gammaz,
                                         realw rhol,
                                         int gravity) {
 
@@ -268,7 +378,6 @@
   realw rho_invl;
   int l,offset,offset1,offset2,offset3;
 
-  //const int NGLLX = 5;
   const int NGLL3_ALIGN = NGLL3_PADDED;
 
   int K = (ijk/NGLL2);
@@ -334,15 +443,9 @@
                                                 realw* rhostore,
                                                 realw* kappastore,
                                                 realw* d_hprime_xx,
-                                                realw* d_xix,
-                                                realw* d_xiy,
-                                                realw* d_xiz,
-                                                realw* d_etax,
-                                                realw* d_etay,
-                                                realw* d_etaz,
-                                                realw* d_gammax,
-                                                realw* d_gammay,
-                                                realw* d_gammaz,
+                                                realw* d_xix,realw* d_xiy,realw* d_xiz,
+                                                realw* d_etax,realw* d_etay,realw* d_etaz,
+                                                realw* d_gammax,realw* d_gammay,realw* d_gammaz,
                                                 realw* potential_dot_dot_acoustic,
                                                 realw* b_potential_acoustic,
                                                 realw* b_potential_dot_dot_acoustic,
@@ -353,59 +456,65 @@
                                                 int gravity) {
 
   int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+  int ijk = threadIdx.x;
 
+  // local and global indices
+  int ijk_ispec = ijk + NGLL3*ispec;
+  int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
+  int iglob;
+
+  // shared memory between all threads within this block
+  __shared__ realw scalar_field_displ[NGLL3];
+  __shared__ realw scalar_field_accel[NGLL3];
+
+  int active = 0;
+
   // handles case when there is 1 extra block (due to rectangular grid)
   if( ispec < NSPEC_AB ){
-
     // acoustic elements only
-    if( ispec_is_acoustic[ispec] ) {
+    if( ispec_is_acoustic[ispec] ){
+      active = 1;
 
-      int ijk = threadIdx.x;
-
-      // local and global indices
-      int ijk_ispec = ijk + NGLL3*ispec;
-      int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
-      int iglob = ibool[ijk_ispec] - 1;
-
-      realw accel_elm[3];
-      realw b_displ_elm[3];
-      realw rhol,kappal;
-
-      // shared memory between all threads within this block
-      __shared__ realw scalar_field_displ[NGLL3];
-      __shared__ realw scalar_field_accel[NGLL3];
-
       // copy field values
+      iglob = ibool[ijk_ispec] - 1;
       scalar_field_displ[ijk] = b_potential_acoustic[iglob];
       scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
-      __syncthreads();
+    }
+  }
 
-      // gets material parameter
-      rhol = rhostore[ijk_ispec_padded];
+  // synchronizes threads
+  __syncthreads();
 
-      // displacement vector from backward field
-      compute_gradient_kernel(ijk,ispec,scalar_field_displ,b_displ_elm,
-                              d_hprime_xx,
-                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
-                              rhol,gravity);
+  if( active ){
+    realw accel_elm[3];
+    realw b_displ_elm[3];
+    realw rhol,kappal;
 
-      // acceleration vector
-      compute_gradient_kernel(ijk,ispec,scalar_field_accel,accel_elm,
-                              d_hprime_xx,
-                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
-                              rhol,gravity);
+    // gets material parameter
+    rhol = rhostore[ijk_ispec_padded];
 
-      // density kernel
-      rho_ac_kl[ijk_ispec] -= deltat * rhol * (accel_elm[0]*b_displ_elm[0] +
-                                               accel_elm[1]*b_displ_elm[1] +
-                                               accel_elm[2]*b_displ_elm[2]);
+    // displacement vector from backward field
+    compute_gradient_kernel(ijk,ispec,scalar_field_displ,b_displ_elm,
+                            d_hprime_xx,
+                            d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                            rhol,gravity);
 
-      // bulk modulus kernel
-      kappal = kappastore[ijk_ispec];
-      kappa_ac_kl[ijk_ispec] -= deltat / kappal * potential_dot_dot_acoustic[iglob]
-                                                * b_potential_dot_dot_acoustic[iglob];
-    }
-  }
+    // acceleration vector
+    compute_gradient_kernel(ijk,ispec,scalar_field_accel,accel_elm,
+                            d_hprime_xx,
+                            d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                            rhol,gravity);
+
+    // density kernel
+    rho_ac_kl[ijk_ispec] -= deltat * rhol * (accel_elm[0]*b_displ_elm[0] +
+                                             accel_elm[1]*b_displ_elm[1] +
+                                             accel_elm[2]*b_displ_elm[2]);
+
+    // bulk modulus kernel
+    kappal = kappastore[ijk_ispec];
+    kappa_ac_kl[ijk_ispec] -= deltat / kappal * potential_dot_dot_acoustic[iglob]
+                                              * b_potential_dot_dot_acoustic[iglob];
+  } // active
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -413,8 +522,7 @@
 
 extern "C"
 void FC_FUNC_(compute_kernels_acoustic_cuda,
-              COMPUTE_KERNELS_ACOUSTIC_CUDA)(
-                                             long* Mesh_pointer,
+              COMPUTE_KERNELS_ACOUSTIC_CUDA)(long* Mesh_pointer,
                                              realw* deltat_f) {
 
 TRACE("compute_kernels_acoustic_cuda");
@@ -424,12 +532,8 @@
   int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
   realw deltat = *deltat_f;
 
-  int num_blocks_x = mp->NSPEC_AB;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->NSPEC_AB,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
@@ -439,15 +543,9 @@
                                                     mp->d_rhostore,
                                                     mp->d_kappastore,
                                                     mp->d_hprime_xx,
-                                                    mp->d_xix,
-                                                    mp->d_xiy,
-                                                    mp->d_xiz,
-                                                    mp->d_etax,
-                                                    mp->d_etay,
-                                                    mp->d_etaz,
-                                                    mp->d_gammax,
-                                                    mp->d_gammay,
-                                                    mp->d_gammaz,
+                                                    mp->d_xix,mp->d_xiy,mp->d_xiz,
+                                                    mp->d_etax,mp->d_etay,mp->d_etaz,
+                                                    mp->d_gammax,mp->d_gammay,mp->d_gammaz,
                                                     mp->d_potential_dot_dot_acoustic,
                                                     mp->d_b_potential_acoustic,
                                                     mp->d_b_potential_dot_dot_acoustic,
@@ -477,15 +575,14 @@
                                                    int NSPEC_AB) {
 
   int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+  int ijk = threadIdx.x;
+  int ijk_ispec = ijk + NGLL3*ispec;
 
   // handles case when there is 1 extra block (due to rectangular grid)
   if(ispec < NSPEC_AB) {
 
     // elastic elements only
     if( ispec_is_elastic[ispec] ) {
-
-      int ijk = threadIdx.x;
-      int ijk_ispec = ijk + NGLL3*ispec;
       int iglob = ibool[ijk_ispec] - 1 ;
 
       // approximate hessian
@@ -504,72 +601,72 @@
                                                    realw* b_potential_dot_dot_acoustic,
                                                    realw* rhostore,
                                                    realw* d_hprime_xx,
-                                                   realw* d_xix,
-                                                   realw* d_xiy,
-                                                   realw* d_xiz,
-                                                   realw* d_etax,
-                                                   realw* d_etay,
-                                                   realw* d_etaz,
-                                                   realw* d_gammax,
-                                                   realw* d_gammay,
-                                                   realw* d_gammaz,
+                                                   realw* d_xix,realw* d_xiy,realw* d_xiz,
+                                                   realw* d_etax,realw* d_etay,realw* d_etaz,
+                                                   realw* d_gammax,realw* d_gammay,realw* d_gammaz,
                                                    realw* hess_kl,
                                                    realw deltat,
                                                    int NSPEC_AB,
                                                    int gravity) {
 
   int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+  int ijk = threadIdx.x;
+  int ijk_ispec = ijk + NGLL3*ispec;
+  int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
+  int iglob;
 
+  // shared memory between all threads within this block
+  __shared__ realw scalar_field_accel[NGLL3];
+  __shared__ realw scalar_field_b_accel[NGLL3];
+
+  int active = 0;
+
   // handles case when there is 1 extra block (due to rectangular grid)
   if(ispec < NSPEC_AB) {
 
     // acoustic elements only
     if( ispec_is_acoustic[ispec] ){
+      active = 1;
 
-      // local and global indices
-      int ijk = threadIdx.x;
-      int ijk_ispec = ijk + NGLL3*ispec;
-      int iglob = ibool[ijk_ispec] - 1 ;
+      // global index
+      iglob = ibool[ijk_ispec] - 1 ;
 
-      int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
-
-      realw accel_elm[3];
-      realw b_accel_elm[3];
-      realw rhol;
-
-      // shared memory between all threads within this block
-      __shared__ realw scalar_field_accel[NGLL3];
-      __shared__ realw scalar_field_b_accel[NGLL3];
-
       // copy field values
       scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
       scalar_field_b_accel[ijk] = b_potential_dot_dot_acoustic[iglob];
-      __syncthreads();
+    }
+  }
 
-      // gets material parameter
-      rhol = rhostore[ijk_ispec_padded];
+  // synchronizes threads
+  __syncthreads();
 
-      // acceleration vector
-      compute_gradient_kernel(ijk,ispec,
-                              scalar_field_accel,accel_elm,
-                              d_hprime_xx,
-                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
-                              rhol,gravity);
+  if( active ){
+    realw accel_elm[3];
+    realw b_accel_elm[3];
+    realw rhol;
 
-      // acceleration vector from backward field
-      compute_gradient_kernel(ijk,ispec,
-                              scalar_field_b_accel,b_accel_elm,
-                              d_hprime_xx,
-                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
-                              rhol,gravity);
-      // approximates hessian
-      hess_kl[ijk_ispec] += deltat * (accel_elm[0]*b_accel_elm[0] +
-                                      accel_elm[1]*b_accel_elm[1] +
-                                      accel_elm[2]*b_accel_elm[2]);
+    // gets material parameter
+    rhol = rhostore[ijk_ispec_padded];
 
-    } // ispec_is_acoustic
+    // acceleration vector
+    compute_gradient_kernel(ijk,ispec,
+                            scalar_field_accel,accel_elm,
+                            d_hprime_xx,
+                            d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                            rhol,gravity);
 
-  }
+    // acceleration vector from backward field
+    compute_gradient_kernel(ijk,ispec,
+                            scalar_field_b_accel,b_accel_elm,
+                            d_hprime_xx,
+                            d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                            rhol,gravity);
+    // approximates hessian
+    hess_kl[ijk_ispec] += deltat * (accel_elm[0]*b_accel_elm[0] +
+                                    accel_elm[1]*b_accel_elm[1] +
+                                    accel_elm[2]*b_accel_elm[2]);
+
+  } // active
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -587,12 +684,8 @@
   int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
   realw deltat = *deltat_f;
 
-  int num_blocks_x = mp->NSPEC_AB;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->NSPEC_AB,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
@@ -614,15 +707,9 @@
                                                          mp->d_b_potential_dot_dot_acoustic,
                                                          mp->d_rhostore,
                                                          mp->d_hprime_xx,
-                                                         mp->d_xix,
-                                                         mp->d_xiy,
-                                                         mp->d_xiz,
-                                                         mp->d_etax,
-                                                         mp->d_etay,
-                                                         mp->d_etaz,
-                                                         mp->d_gammax,
-                                                         mp->d_gammay,
-                                                         mp->d_gammaz,
+                                                         mp->d_xix,mp->d_xiy,mp->d_xiz,
+                                                         mp->d_etax,mp->d_etay,mp->d_etaz,
+                                                         mp->d_gammax,mp->d_gammay,mp->d_gammaz,
                                                          mp->d_hess_ac_kl,
                                                          deltat,
                                                          mp->NSPEC_AB,

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_acoustic_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_acoustic_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -122,16 +122,18 @@
 
 extern "C"
 void FC_FUNC_(compute_stacey_acoustic_cuda,
-              COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+              COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer,
                                             int* phase_is_innerf,
-                                            int* SAVE_FORWARDf,
                                             realw* h_b_absorb_potential) {
 TRACE("compute_stacey_acoustic_cuda");
   //double start_time = get_time();
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // checks if anything to do
+  if( mp->d_num_abs_boundary_faces == 0 ) return;
+
   int phase_is_inner          = *phase_is_innerf;
-  int SAVE_FORWARD            = *SAVE_FORWARDf;
 
   // way 1: Elapsed time: 4.385948e-03
   // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
@@ -141,18 +143,14 @@
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
   int blocksize = NGLL2;
 
-  int num_blocks_x = mp->d_num_abs_boundary_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->d_num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
   //  adjoint simulations: reads in absorbing boundary
-  if (mp->simulation_type == 3 && mp->d_num_abs_boundary_faces > 0 ){
+  if (mp->simulation_type == 3 ){
     // copies array to GPU
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,h_b_absorb_potential,
                                        mp->d_b_reclen_potential,cudaMemcpyHostToDevice),7700);
@@ -170,7 +168,7 @@
                                                    mp->d_ispec_is_acoustic,
                                                    phase_is_inner,
                                                    mp->simulation_type,
-                                                   SAVE_FORWARD,
+                                                   mp->save_forward,
                                                    mp->d_num_abs_boundary_faces,
                                                    mp->d_b_potential_dot_acoustic,
                                                    mp->d_b_potential_dot_dot_acoustic,
@@ -178,7 +176,8 @@
                                                    mp->gravity);
 
   //  adjoint simulations: stores absorbed wavefield part
-  if (mp->simulation_type == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ){
+  if (mp->simulation_type == 1 && mp->save_forward ){
+    // (the blocking cudaMemcpy below implicitly synchronizes with preceding CUDA operations)
     // copies array to CPU
     print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_potential,mp->d_b_absorb_potential,
                                        mp->d_b_reclen_potential,cudaMemcpyDeviceToHost),7701);

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_viscoelastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_viscoelastic_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_viscoelastic_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -54,7 +54,6 @@
                                               int SIMULATION_TYPE,
                                               int SAVE_FORWARD,
                                               int num_abs_boundary_faces,
-                                              realw* b_accel,
                                               realw* b_absorb_field) {
 
   int igll = threadIdx.x; // tx
@@ -110,12 +109,7 @@
       atomicAdd(&accel[iglob*3+1],-ty*jacobianw);
       atomicAdd(&accel[iglob*3+2],-tz*jacobianw);
 
-      if(SIMULATION_TYPE == 3) {
-        atomicAdd(&b_accel[iglob*3  ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
-        atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
-        atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
-      }
-      else if(SAVE_FORWARD && SIMULATION_TYPE == 1) {
+      if(SAVE_FORWARD && SIMULATION_TYPE == 1) {
         b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)] = tx*jacobianw;
         b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)] = ty*jacobianw;
         b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)] = tz*jacobianw;
@@ -127,23 +121,63 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+__global__ void compute_stacey_elastic_sim3_kernel(int* abs_boundary_ispec,
+                                                   int* abs_boundary_ijk,
+                                                   int* ibool,
+                                                   int* ispec_is_inner,
+                                                   int* ispec_is_elastic,
+                                                   int phase_is_inner,
+                                                   int num_abs_boundary_faces,
+                                                   realw* b_accel,
+                                                   realw* b_absorb_field) {
 
+  int igll = threadIdx.x; // tx
+  int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx
+
+  int i,j,k,iglob,ispec;
+
+  // don't compute points outside NGLLSQUARE==NGLL2==25
+  // way 2: no further check needed since blocksize = 25
+  if( iface < num_abs_boundary_faces){
+
+  //if(igll < NGLL2 && iface < num_abs_boundary_faces) {
+
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = abs_boundary_ispec[iface]-1;
+
+    if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) {
+
+      i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+      j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+      k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+
+      iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
+
+      atomicAdd(&b_accel[iglob*3  ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
+      atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
+      atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
+    }
+  } // num_abs_boundary_faces
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
 extern "C"
 void FC_FUNC_(compute_stacey_viscoelastic_cuda,
-              COMPUTE_STACEY_VISCOELASTIC_CUDA)(long* Mesh_pointer_f,
+              COMPUTE_STACEY_VISCOELASTIC_CUDA)(long* Mesh_pointer,
                                            int* phase_is_innerf,
-                                           int* SAVE_FORWARDf,
-                                           realw* h_b_absorb_field) {
+                                           realw* b_absorb_field) {
 
-TRACE("compute_stacey_viscoelastic_cuda");
+  TRACE("\tcompute_stacey_viscoelastic_cuda");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  // check
+  // checks if anything to do
   if( mp->d_num_abs_boundary_faces == 0 ) return;
 
   int phase_is_inner    = *phase_is_innerf;
-  int SAVE_FORWARD      = *SAVE_FORWARDf;
 
   // way 1
   // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
@@ -153,19 +187,15 @@
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
   int blocksize = NGLL2;
 
-  int num_blocks_x = mp->d_num_abs_boundary_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->d_num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  if(mp->simulation_type == 3 && mp->d_num_abs_boundary_faces > 0) {
-    // The read is done in fortran
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,h_b_absorb_field,
+  if(mp->simulation_type == 3 ) {
+    // reading is done in fortran routine
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,b_absorb_field,
                                        mp->d_b_reclen_field,cudaMemcpyHostToDevice),7700);
   }
 
@@ -173,7 +203,7 @@
   exit_on_cuda_error("between cudamemcpy and compute_stacey_elastic_kernel");
 #endif
 
-  compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc,
+  compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc,
                                                   mp->d_accel,
                                                   mp->d_abs_boundary_ispec,
                                                   mp->d_abs_boundary_ijk,
@@ -186,11 +216,23 @@
                                                   mp->d_ispec_is_elastic,
                                                   phase_is_inner,
                                                   mp->simulation_type,
-                                                  SAVE_FORWARD,
+                                                  mp->save_forward,
                                                   mp->d_num_abs_boundary_faces,
-                                                  mp->d_b_accel,
                                                   mp->d_b_absorb_field);
 
+  // adjoint simulations
+  if(mp->simulation_type == 3 ){
+    compute_stacey_elastic_sim3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_abs_boundary_ispec,
+                                                         mp->d_abs_boundary_ijk,
+                                                         mp->d_ibool,
+                                                         mp->d_ispec_is_inner,
+                                                         mp->d_ispec_is_elastic,
+                                                         phase_is_inner,
+                                                         mp->d_num_abs_boundary_faces,
+                                                         mp->d_b_accel,
+                                                         mp->d_b_absorb_field);
+  }
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_stacey_elastic_kernel");
 #endif
@@ -199,11 +241,15 @@
   // if (mp->simulation_type == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) &
   //   write(IOABS,rec=it) b_reclen_field,b_absorb_field,b_reclen_field
 
-  if(mp->simulation_type == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ) {
-    print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_field,mp->d_b_absorb_field,
+  if(mp->simulation_type == 1 && mp->save_forward ) {
+    // explicitly wait until the compute stream is done
+    // (the blocking cudaMemcpy below would also synchronize with preceding CUDA operations)
+    cudaStreamSynchronize(mp->compute_stream);
+
+    // copies absorb_field values to CPU
+    print_CUDA_error_if_any(cudaMemcpy(b_absorb_field,mp->d_b_absorb_field,
                                        mp->d_b_reclen_field,cudaMemcpyDeviceToHost),7701);
-    // The write is done in fortran
-    // write_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&it);
+    // writing is done in fortran routine
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING

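Throughout this commit the repeated 65535 grid-splitting loop is replaced by the new get_blocks_xy() helper, declared in mesh_constants_cuda.h and defined in check_fields_cuda.cu. A minimal sketch of what that helper presumably does, inferred from the inline loop it replaces and the new MAXIMUM_GRID_DIM constant (the actual implementation may differ):

  #include <math.h>

  #define MAXIMUM_GRID_DIM 65535   // per-dimension grid limit, as added in mesh_constants_cuda.h

  // splits a 1D block count into a 2D grid so that neither dimension
  // exceeds MAXIMUM_GRID_DIM (mirrors the inline loop it replaces)
  void get_blocks_xy(int num_blocks, int* num_blocks_x, int* num_blocks_y) {
    *num_blocks_x = num_blocks;
    *num_blocks_y = 1;
    while (*num_blocks_x > MAXIMUM_GRID_DIM) {
      *num_blocks_x = (int) ceil(*num_blocks_x * 0.5f);
      *num_blocks_y = *num_blocks_y * 2;
    }
  }

Callers then build the launch configuration as dim3 grid(num_blocks_x,num_blocks_y) and index threads with blockIdx.y*gridDim.x + blockIdx.x, exactly as the kernels above already do.
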
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -52,14 +52,20 @@
                                        realw deltat,
                                        realw deltatsqover2,
                                        realw deltatover2) {
+
+  // two-dimensional grid of blocks, where each block holds a one-dimensional array of threads
+  //int tid = threadIdx.x;
+  //int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  //int id = tid + bx*blockDim.x;
+
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
   if(id < size) {
     displ[id] = displ[id] + deltat*veloc[id] + deltatsqover2*accel[id];
     veloc[id] = veloc[id] + deltatover2*accel[id];
-    accel[id] = 0; // can do this using memset...not sure if faster
+    accel[id] = 0.0f; // can do this using memset... not sure if faster, probably not
   }
 }
 
@@ -67,8 +73,7 @@
 
 extern "C"
 void FC_FUNC_(it_update_displacement_cuda,
-              IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
-                                          int* size_F,
+              IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer,
                                           realw* deltat_F,
                                           realw* deltatsqover2_F,
                                           realw* deltatover2_F,
@@ -76,48 +81,36 @@
                                           realw* b_deltatsqover2_F,
                                           realw* b_deltatover2_F) {
 
-TRACE("it_update_displacement_cuda");
+  TRACE("\tit_update_displacement_cuda");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
-  //int i,device;
+  realw deltat = *deltat_F;
+  realw deltatsqover2 = *deltatsqover2_F;
+  realw deltatover2 = *deltatover2_F;
 
-  int size = *size_F;
-  //cublasStatus status;
+  int size = NDIM * mp->NGLOB_AB;
 
   int blocksize = BLOCKSIZE_KERNEL1;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
+  // debug
+  //realw max_d,max_v,max_a;
+  //max_d = get_device_array_maximum_value(mp->d_displ, size);
+  //max_v = get_device_array_maximum_value(mp->d_veloc, size);
+  //max_a = get_device_array_maximum_value(mp->d_accel, size);
+  //printf("rank %d - max displ: %f veloc: %f accel: %f\n",mp->myrank,max_d,max_v,max_a);
 
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//  exit_on_cuda_error("Before UpdateDispVeloc_kernel");
-//#endif
-
-  realw deltat = *deltat_F;
-  realw deltatsqover2 = *deltatsqover2_F;
-  realw deltatover2 = *deltatover2_F;
-
   //launch kernel
   UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
-                                           size,deltat,deltatsqover2,deltatover2);
+                                                                size,deltat,deltatsqover2,deltatover2);
 
-  //cudaThreadSynchronize();
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-//  // sync and check to catch errors from previous async operations
-//  exit_on_cuda_error("UpdateDispVeloc_kernel");
-//#endif
-
   // kernel for backward fields
   if(mp->simulation_type == 3) {
     realw b_deltat = *b_deltat_F;
@@ -125,12 +118,7 @@
     realw b_deltatover2 = *b_deltatover2_F;
 
     UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
-                                             size,b_deltat,b_deltatsqover2,b_deltatover2);
-
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//    //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-//    exit_on_cuda_error("after SIM_TYPE==3 UpdateDispVeloc_kernel");
-//#endif
+                                                                  size,b_deltat,b_deltatsqover2,b_deltatover2);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -152,10 +140,11 @@
                                        realw deltat,
                                        realw deltatsqover2,
                                        realw deltatover2) {
+
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
   if(id < size) {
     potential_acoustic[id] = potential_acoustic[id]
                             + deltat*potential_dot_acoustic[id]
@@ -164,7 +153,7 @@
     potential_dot_acoustic[id] = potential_dot_acoustic[id]
                                 + deltatover2*potential_dot_dot_acoustic[id];
 
-    potential_dot_dot_acoustic[id] = 0;
+    potential_dot_dot_acoustic[id] = 0.0f;
   }
 }
 
@@ -172,56 +161,50 @@
 
 extern "C"
 void FC_FUNC_(it_update_displacement_ac_cuda,
-              it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
-                                               int* size_F,
+              it_update_displacement_ac_cuda)(long* Mesh_pointer,
                                                realw* deltat_F,
                                                realw* deltatsqover2_F,
                                                realw* deltatover2_F,
                                                realw* b_deltat_F,
                                                realw* b_deltatsqover2_F,
                                                realw* b_deltatover2_F) {
-TRACE("it_update_displacement_ac_cuda");
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  TRACE("\tit_update_displacement_ac_cuda");
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
-  //int i,device;
-  int size = *size_F;
-  //cublasStatus status;
+  int size = mp->NGLOB_AB;
 
   int blocksize = BLOCKSIZE_KERNEL1;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
+  //launch kernel
+  // forward wavefields
   realw deltat = *deltat_F;
   realw deltatsqover2 = *deltatsqover2_F;
   realw deltatover2 = *deltatover2_F;
 
-  //launch kernel
   UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_acoustic,
-                                           mp->d_potential_dot_acoustic,
-                                           mp->d_potential_dot_dot_acoustic,
-                                           size,deltat,deltatsqover2,deltatover2);
+                                                                 mp->d_potential_dot_acoustic,
+                                                                 mp->d_potential_dot_dot_acoustic,
+                                                                 size,deltat,deltatsqover2,deltatover2);
 
+  // backward/reconstructed wavefields
   if(mp->simulation_type == 3) {
     realw b_deltat = *b_deltat_F;
     realw b_deltatsqover2 = *b_deltatsqover2_F;
     realw b_deltatover2 = *b_deltatover2_F;
 
     UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_acoustic,
-                                             mp->d_b_potential_dot_acoustic,
-                                             mp->d_b_potential_dot_dot_acoustic,
-                                             size,b_deltat,b_deltatsqover2,b_deltatover2);
+                                                                  mp->d_b_potential_dot_acoustic,
+                                                                  mp->d_b_potential_dot_dot_acoustic,
+                                                                  size,b_deltat,b_deltatsqover2,b_deltatover2);
   }
 
-  //cudaThreadSynchronize();
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
   exit_on_cuda_error("it_update_displacement_ac_cuda");
@@ -244,10 +227,11 @@
                                      realw* rmassx,
                                      realw* rmassy,
                                      realw* rmassz) {
+
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
   if(id < size) {
     accel[3*id] = accel[3*id]*rmassx[id];
     accel[3*id+1] = accel[3*id+1]*rmassy[id];
@@ -268,8 +252,8 @@
                                            realw* rmassz) {
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
   if(id < size) {
     accel[3*id] = accel[3*id]*rmassx[id];
     accel[3*id+1] = accel[3*id+1]*rmassy[id];
@@ -283,10 +267,11 @@
                                            realw* accel,
                                            int size,
                                            realw deltatover2) {
+
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
   if(id < size) {
     veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
     veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
@@ -299,65 +284,60 @@
 extern "C"
 void FC_FUNC_(kernel_3_a_cuda,
               KERNEL_3_A_CUDA)(long* Mesh_pointer,
-                               int* size_F,
                                realw* deltatover2_F,
                                realw* b_deltatover2_F,
                                int* APPROXIMATE_OCEAN_LOAD) {
-TRACE("kernel_3_a_cuda");
 
-   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+  TRACE("\tkernel_3_a_cuda");
 
-   int size = *size_F;
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
-   realw deltatover2 = *deltatover2_F;
-   realw b_deltatover2 = *b_deltatover2_F;
+  int size = mp->NGLOB_AB;
 
-   int blocksize = BLOCKSIZE_KERNEL3;
-   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+  int blocksize = BLOCKSIZE_KERNEL3;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
-   int num_blocks_x = size_padded/blocksize;
-   int num_blocks_y = 1;
-   while(num_blocks_x > 65535) {
-     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-     num_blocks_y = num_blocks_y*2;
-   }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
-   dim3 grid(num_blocks_x,num_blocks_y);
-   dim3 threads(blocksize,1,1);
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
 
-   // check whether we can update accel and veloc, or only accel at this point
-   if( *APPROXIMATE_OCEAN_LOAD == 0 ){
-     // updates both, accel and veloc
-     kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc,
-                                                                   mp->d_accel,
-                                                                   size, deltatover2,
+  // check whether we can update accel and veloc, or only accel at this point
+  if( *APPROXIMATE_OCEAN_LOAD == 0 ){
+   realw deltatover2 = *deltatover2_F;
+
+   // updates both accel and veloc
+   kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc,
+                                                                 mp->d_accel,
+                                                                 size, deltatover2,
+                                                                 mp->d_rmassx,mp->d_rmassy,mp->d_rmassz);
+   if(mp->simulation_type == 3) {
+     realw b_deltatover2 = *b_deltatover2_F;
+     kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc,
+                                                                   mp->d_b_accel,
+                                                                   size, b_deltatover2,
                                                                    mp->d_rmassx,mp->d_rmassy,mp->d_rmassz);
+   }
+  }else{
+   // updates only accel
+   kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_accel,
+                                                                       size,
+                                                                       mp->d_rmassx,
+                                                                       mp->d_rmassy,
+                                                                       mp->d_rmassz);
 
-     if(mp->simulation_type == 3) {
-       kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc,
-                                                                     mp->d_b_accel,
-                                                                     size, b_deltatover2,
-                                                                     mp->d_rmassx,mp->d_rmassy,mp->d_rmassz);
-     }
-   }else{
-     // updates only accel
-     kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_accel,
+   if(mp->simulation_type == 3) {
+     kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_accel,
                                                                          size,
                                                                          mp->d_rmassx,
                                                                          mp->d_rmassy,
                                                                          mp->d_rmassz);
-
-     if(mp->simulation_type == 3) {
-       kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_accel,
-                                                                           size,
-                                                                           mp->d_rmassx,
-                                                                           mp->d_rmassy,
-                                                                           mp->d_rmassz);
-     }
    }
+  }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-   //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
   exit_on_cuda_error("after kernel 3 a");
 #endif
 }
@@ -367,36 +347,31 @@
 extern "C"
 void FC_FUNC_(kernel_3_b_cuda,
               KERNEL_3_B_CUDA)(long* Mesh_pointer,
-                             int* size_F,
-                             realw* deltatover2_F,
-                             realw* b_deltatover2_F) {
-  TRACE("kernel_3_b_cuda");
+                               realw* deltatover2_F,
+                               realw* b_deltatover2_F) {
+  TRACE("\tkernel_3_b_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-  int size = *size_F;
 
-  realw deltatover2 = *deltatover2_F;
-  realw b_deltatover2 = *b_deltatover2_F;
+  int size = mp->NGLOB_AB;
 
   int blocksize = BLOCKSIZE_KERNEL3;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
+  realw deltatover2 = *deltatover2_F;
   // updates only veloc at this point
   kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc,
                                                                       mp->d_accel,
                                                                       size,deltatover2);
 
   if(mp->simulation_type == 3) {
+    realw b_deltatover2 = *b_deltatover2_F;
     kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc,
                                                                         mp->d_b_accel,
                                                                         size,b_deltatover2);
@@ -421,10 +396,11 @@
 __global__ void kernel_3_a_acoustic_cuda_device(realw* potential_dot_dot_acoustic,
                                                 int size,
                                                 realw* rmass_acoustic) {
+
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
   if(id < size) {
     // multiplies pressure with the inverse of the mass matrix
     potential_dot_dot_acoustic[id] = potential_dot_dot_acoustic[id]*rmass_acoustic[id];
@@ -440,8 +416,8 @@
                                                 realw* rmass_acoustic) {
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
   if(id < size) {
     // Newmark time scheme: corrector term
     potential_dot_acoustic[id] = potential_dot_acoustic[id] + deltatover2*potential_dot_dot_acoustic[id];
@@ -451,36 +427,34 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
-                             long* Mesh_pointer,
-                             int* size_F) {
+void FC_FUNC_(kernel_3_a_acoustic_cuda,
+              KERNEL_3_ACOUSTIC_CUDA)(long* Mesh_pointer ) {
 
 TRACE("kernel_3_a_acoustic_cuda");
 
-   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-   int size = *size_F;
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
-   int blocksize = BLOCKSIZE_KERNEL3;
-   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-   int num_blocks_x = size_padded/blocksize;
-   int num_blocks_y = 1;
-   while(num_blocks_x > 65535) {
-     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-     num_blocks_y = num_blocks_y*2;
-   }
-   dim3 grid(num_blocks_x,num_blocks_y);
-   dim3 threads(blocksize,1,1);
+  int size = mp->NGLOB_AB;
 
-   kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_dot_acoustic,
+  int blocksize = BLOCKSIZE_KERNEL3;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                     size,
+                                                     mp->d_rmass_acoustic);
+
+  if(mp->simulation_type == 3) {
+   kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_dot_acoustic,
                                                        size,
                                                        mp->d_rmass_acoustic);
+  }
 
-   if(mp->simulation_type == 3) {
-     kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_dot_acoustic,
-                                                         size,
-                                                         mp->d_rmass_acoustic);
-   }
-
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
   exit_on_cuda_error("after kernel 3 a");
@@ -490,41 +464,40 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
-                                                             long* Mesh_pointer,
-                                                             int* size_F,
-                                                             realw* deltatover2_F,
-                                                             realw* b_deltatover2_F) {
+void FC_FUNC_(kernel_3_b_acoustic_cuda,
+              KERNEL_3_ACOUSTIC_CUDA)(long* Mesh_pointer,
+                                      realw* deltatover2_F,
+                                      realw* b_deltatover2_F) {
 
 TRACE("kernel_3_b_acoustic_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-  int size = *size_F;
 
-  realw deltatover2 = *deltatover2_F;
-  realw b_deltatover2 = *b_deltatover2_F;
+  int size = mp->NGLOB_AB;
 
   int blocksize = BLOCKSIZE_KERNEL3;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
+  realw deltatover2 = *deltatover2_F;
+
   kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_acoustic,
-                                                    mp->d_potential_dot_dot_acoustic,
-                                                    size, deltatover2,
-                                                    mp->d_rmass_acoustic);
+                                                      mp->d_potential_dot_dot_acoustic,
+                                                      size, deltatover2,
+                                                      mp->d_rmass_acoustic);
 
   if(mp->simulation_type == 3) {
+    realw b_deltatover2 = *b_deltatover2_F;
+
     kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_acoustic,
-                                                      mp->d_b_potential_dot_dot_acoustic,
-                                                      size, b_deltatover2,
-                                                      mp->d_rmass_acoustic);
+                                                        mp->d_b_potential_dot_dot_acoustic,
+                                                        size, b_deltatover2,
+                                                        mp->d_rmass_acoustic);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING

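For orientation, the update kernels in this file implement an explicit Newmark time march; schematically, with deltat = \Delta t, deltatsqover2 = \Delta t^2/2, deltatover2 = \Delta t/2 and M the diagonal mass matrix stored in d_rmassx/y/z (or d_rmass_acoustic):

  \begin{aligned}
    u^{n+1}   &= u^{n} + \Delta t\,v^{n} + \tfrac{\Delta t^{2}}{2}\,a^{n} \\
    v^{n+1/2} &= v^{n} + \tfrac{\Delta t}{2}\,a^{n} \\
    a^{n+1}   &= M^{-1}\,F\left(u^{n+1}\right) \\
    v^{n+1}   &= v^{n+1/2} + \tfrac{\Delta t}{2}\,a^{n+1}
  \end{aligned}

UpdateDispVeloc_kernel and UpdatePotential_kernel perform the first two (predictor) lines and zero the acceleration; after the force computation, kernel_3_a*/kernel_3_b* apply the inverse mass matrix and the velocity corrector, for the forward and (when simulation_type == 3) backward wavefields alike.
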
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h	2013-08-20 14:13:26 UTC (rev 22718)
@@ -76,14 +76,6 @@
 // maximum function
 #define MAX(x,y)                    (((x) < (y)) ? (y) : (x))
 
-// utility functions: defined in check_fields_cuda.cu
-double get_time();
-void get_free_memory(double* free_db, double* used_db, double* total_db);
-void print_CUDA_error_if_any(cudaError_t err, int num);
-void pause_for_debugger(int pause);
-void exit_on_cuda_error(char* kernel_name);
-void exit_on_error(char* info);
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // cuda constant arrays
@@ -108,13 +100,6 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-// type of "working" variables: see also CUSTOM_REAL
-// double precision temporary variables leads to 10% performance decrease
-// in Kernel_2_impl (not very much..)
-typedef float realw;
-
-/* ----------------------------------------------------------------------------------------------- */
-
 // (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h:
 // leads up to ~ 5% performance increase
 //#define USE_MESH_COLORING_GPU
@@ -124,7 +109,7 @@
 // Texture memory usage:
 // requires CUDA version >= 4.0, see check below
 // Use textures for d_displ and d_accel -- 10% performance boost
-#define USE_TEXTURES_FIELDS
+//#define USE_TEXTURES_FIELDS
 
 // Using texture memory for the hprime-style constants is slower on
 // Fermi generation hardware, but *may* be faster on Kepler
@@ -157,28 +142,59 @@
 #define BLOCKSIZE_KERNEL3 128
 #define BLOCKSIZE_TRANSFER 256
 
+// maximum grid dimension in one direction of the GPU
+#define MAXIMUM_GRID_DIM 65535
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // indexing
-
 #define INDEX2(xsize,x,y) x + (y)*xsize
 
 #define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
-//#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
 
 #define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
-//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
 
 #define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*(j))))
-//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize
 
 #define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) x + xsize*(y + ysize*(z + zsize*(i + isize*(j + jsize*k))))
 
 #define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*z) + (i)*NGLL3_PADDED
-//#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*NGLL3_PADDED
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// custom type declarations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// type of "working" variables: see also CUSTOM_REAL
+// double precision temporary variables lead to a 10% performance decrease
+// in Kernel_2_impl (not very much...)
+typedef float realw;
+
+// textures
+typedef texture<float, cudaTextureType1D, cudaReadModeElementType> realw_texture;
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// utility functions: defined in check_fields_cuda.cu
+
+/* ----------------------------------------------------------------------------------------------- */
+
+double get_time();
+void get_free_memory(double* free_db, double* used_db, double* total_db);
+void print_CUDA_error_if_any(cudaError_t err, int num);
+void pause_for_debugger(int pause);
+void exit_on_cuda_error(char* kernel_name);
+void exit_on_error(char* info);
+void synchronize_cuda();
+void synchronize_mpi();
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y);
+realw get_device_array_maximum_value(realw* array,int size);
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
 // mesh pointer wrapper structure
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -189,11 +205,12 @@
   int NSPEC_AB;
   int NGLOB_AB;
 
+  // mpi process
   int myrank;
-  int NPROC;
 
   // constants
   int simulation_type;
+  int save_forward;
   int use_mesh_coloring_gpu;
   int absorbing_conditions;
   int gravity;
@@ -229,33 +246,25 @@
   realw* d_wgllwgll_xy; realw* d_wgllwgll_xz; realw* d_wgllwgll_yz;
   realw* d_wgll_cube;
 
-#ifdef USE_TEXTURES_CONSTANTS
-  const textureReference* d_hprime_xx_tex_ptr;
-  realw* d_hprime_xx_tex;
-#endif
-
-
   // A buffer for mpi-send/recv, which is duplicated in fortran but is
   // allocated with pinned memory to facilitate asynchronous device <->
   // host memory transfers
   float* h_send_accel_buffer;
   float* h_send_b_accel_buffer;
+
   float* send_buffer;
   float* h_recv_accel_buffer;
   float* h_recv_b_accel_buffer;
   float* recv_buffer;
+
   int size_mpi_buffer;
+  int size_mpi_buffer_potential;
 
-  // buffers and constants for the MPI-send required for async-memcpy
-  // + non-blocking MPI
-  //daniel: check if needed
-  //float* buffer_recv_vector_ext_mesh;
+  // mpi interfaces
   int num_interfaces_ext_mesh;
   int max_nibool_interfaces_ext_mesh;
-  //int* nibool_interfaces_ext_mesh;
-  //int* my_neighbours_ext_mesh;
-  //int* request_send_vector_ext_mesh;
-  //int* request_recv_vector_ext_mesh;
+  int* d_nibool_interfaces_ext_mesh;
+  int* d_ibool_interfaces_ext_mesh;
 
   // overlapped memcpy streams
   cudaStream_t compute_stream;
@@ -271,13 +280,6 @@
   // backward/reconstructed elastic wavefield
   realw* d_b_displ; realw* d_b_veloc; realw* d_b_accel;
 
-#ifdef USE_TEXTURES_FIELDS
-  // Texture references for fast non-coalesced scattered access
-  const textureReference* d_displ_tex_ref_ptr;
-  const textureReference* d_veloc_tex_ref_ptr;
-  const textureReference* d_accel_tex_ref_ptr;
-#endif
-
   // elastic elements
   int* d_ispec_is_elastic;
 
@@ -296,11 +298,8 @@
 
   // mpi buffer
   realw* d_send_accel_buffer;
+  realw* d_b_send_accel_buffer;
 
-  // interfaces
-  int* d_nibool_interfaces_ext_mesh;
-  int* d_ibool_interfaces_ext_mesh;
-
   //used for absorbing stacey boundaries
   int d_num_abs_boundary_faces;
   int* d_abs_boundary_ispec;
@@ -426,9 +425,11 @@
   realw* d_b_gammaval;
 
   // sensitivity kernels
+  int anisotropic_kl;
   realw* d_rho_kl;
   realw* d_mu_kl;
   realw* d_kappa_kl;
+  realw* d_cijkl_kl;
 
   // noise sensitivity kernel
   realw* d_Sigma_kl;
@@ -468,6 +469,7 @@
 
   // mpi buffer
   realw* d_send_potential_dot_dot_buffer;
+  realw* d_b_send_potential_dot_dot_buffer;
 
   realw* d_b_absorb_potential;
   int d_b_reclen_potential;

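The INDEX2..INDEX6 macros kept in this header flatten Fortran-ordered multi-dimensional arrays (first index varies fastest) into the 1D arrays used on the device. A small standalone illustration of that convention; the macro is reproduced here with extra parentheses only so the example is safe on its own, and the array contents are made up:

  #include <stdio.h>

  // same flattening convention as INDEX3 in mesh_constants_cuda.h
  #define INDEX3(xsize,ysize,x,y,z) ((x) + (xsize)*((y) + (ysize)*(z)))

  int main(void) {
    float field[5*5*5];                      // e.g. one 5x5x5 element of GLL points
    for (int k = 0; k < 5; k++)
      for (int j = 0; j < 5; j++)
        for (int i = 0; i < 5; i++)
          field[INDEX3(5,5,i,j,k)] = (float)(i + 10*j + 100*k);

    // INDEX3(5,5,1,2,3) = 1 + 5*(2 + 5*3) = 86
    printf("flat index %d holds %g\n", INDEX3(5,5,1,2,3), field[INDEX3(5,5,1,2,3)]);
    return 0;
  }
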
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/noise_tomography_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/noise_tomography_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -102,15 +102,15 @@
 
 // randomize displ for testing
 extern "C"
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer,realw* h_displ) {
 TRACE("make_displ_rand");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
   // realw* displ_rnd = (realw*)malloc(mp->NGLOB_AB*3*sizeof(realw));
   for(int i=0;i<mp->NGLOB_AB*3;i++) {
     h_displ[i] = rand();
   }
-  cudaMemcpy(mp->d_displ,h_displ,mp->NGLOB_AB*3*sizeof(realw),cudaMemcpyHostToDevice);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,h_displ,mp->NGLOB_AB*3*sizeof(realw),cudaMemcpyHostToDevice),44001);
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -145,18 +145,15 @@
 
 extern "C"
 void FC_FUNC_(transfer_surface_to_host,
-              TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
+              TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer,
                                         realw* h_noise_surface_movie) {
 TRACE("transfer_surface_to_host");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
-  int num_blocks_x = mp->num_free_surface_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->num_free_surface_faces,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(NGLL2,1,1);
 
@@ -167,8 +164,8 @@
                                                     mp->d_displ,
                                                     mp->d_noise_surface_movie);
 
-  cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie,
-             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
+  print_CUDA_error_if_any(cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie,
+                                     3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost),44002);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("transfer_surface_to_host");
@@ -250,25 +247,22 @@
 
 extern "C"
 void FC_FUNC_(noise_read_add_surface_movie_cu,
-              NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
+              NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer,
                                                realw* h_noise_surface_movie,
                                                int* NOISE_TOMOGRAPHYf) {
-TRACE("noise_read_add_surface_movie_cu");
+  TRACE("noise_read_add_surface_movie_cu");
 
   // EPIK_TRACER("noise_read_add_surface_movie_cu");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
   int NOISE_TOMOGRAPHY = *NOISE_TOMOGRAPHYf;
 
-  cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
-             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+                                     3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),44003);
 
-  int num_blocks_x = mp->num_free_surface_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->num_free_surface_faces,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(NGLL2,1,1);
 

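The raw cudaMemcpy calls in this file are now routed through print_CUDA_error_if_any(), declared in mesh_constants_cuda.h and defined in check_fields_cuda.cu. A minimal sketch of such a wrapper, assuming it simply aborts with the caller-supplied id so the failing call can be located (the real implementation may also log elsewhere):

  #include <stdio.h>
  #include <stdlib.h>
  #include <cuda_runtime.h>

  // checks the return code of a CUDA call and aborts with the numeric id
  // passed by the caller (e.g. 44001, 44002 above)
  void print_CUDA_error_if_any(cudaError_t err, int num) {
    if (err != cudaSuccess) {
      fprintf(stderr, "\nCUDA error %d: %s\n", num, cudaGetErrorString(err));
      fflush(stderr);
      exit(EXIT_FAILURE);
    }
  }
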
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_constants_cuda.h	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_constants_cuda.h	2013-08-20 14:13:26 UTC (rev 22718)
@@ -31,7 +31,19 @@
 
 typedef float realw;  // type of "working" variables
 
+// CUDA version >= 5.0 needed for new symbol addressing and texture binding
+#if CUDA_VERSION < 5000
+  #ifndef USE_OLDER_CUDA4_GPU
+    #define USE_OLDER_CUDA4_GPU
+  #endif
+#else
+  #undef USE_OLDER_CUDA4_GPU
+#endif
 
+#ifdef USE_OLDER_CUDA4_GPU
+#pragma message ("\nCompiling with: USE_OLDER_CUDA4_GPU enabled\n")
+#endif
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // CONSTANT arrays setup
@@ -279,51 +291,4 @@
 
 }
 
-
-/* ----------------------------------------------------------------------------------------------- */
-
-/* CUDA specific things from specfem3D_kernels.cu */
-
-// older TEXTURE usage. For now just acoustic simulations. See usage
-// of USE_TEXTURES_FIELDS elsewhere in code for elastic case
-#ifdef USE_TEXTURES
-
-// declaration of textures
-texture<realw, 1, cudaReadModeElementType> tex_potential_acoustic;
-texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
-
-
-  void bindTexturesPotential(realw* d_potential_acoustic)
-  {
-    cudaError_t err;
-
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
-
-    err = cudaBindTexture(NULL,tex_potential_acoustic, d_potential_acoustic,
-                          channelDescFloat, NGLOB*sizeof(realw));
-    if (err != cudaSuccess)
-    {
-      fprintf(stderr, "Error in bindTexturesPotential for potential_acoustic: %s\n", cudaGetErrorString(err));
-      exit(1);
-    }
-  }
-
-  void bindTexturesPotential_dot_dot(realw* d_potential_dot_dot_acoustic)
-  {
-    cudaError_t err;
-
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
-
-    err = cudaBindTexture(NULL,tex_potential_dot_dot_acoustic, d_potential_dot_dot_acoustic,
-                          channelDescFloat, NGLOB*sizeof(realw));
-    if (err != cudaSuccess)
-    {
-      fprintf(stderr, "Error in bindTexturesPotential_dot_dot for potential_dot_dot_acoustic: %s\n", cudaGetErrorString(err));
-      exit(1);
-    }
-  }
-
-#endif // USE_TEXTURES
-
-
 #endif //CUDA_HEADER_H

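The next file relies heavily on the copy_todevice_int()/copy_todevice_realw() helpers, whose bodies are only partly visible in the hunks below. A minimal sketch of the realw variant, assuming the same allocate-then-copy pattern; the error codes 90001/90002 are made up for this example:

  #include <cuda_runtime.h>

  typedef float realw;

  void print_CUDA_error_if_any(cudaError_t err, int num);   // defined in check_fields_cuda.cu

  // allocates a device array and copies the host array into it
  void copy_todevice_realw(void** d_array_addr_ptr, realw* h_array, int size) {
    print_CUDA_error_if_any(cudaMalloc((void**)d_array_addr_ptr, size*sizeof(realw)), 90001);
    print_CUDA_error_if_any(cudaMemcpy(*d_array_addr_ptr, h_array, size*sizeof(realw),
                                       cudaMemcpyHostToDevice), 90002);
  }
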
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -44,13 +44,23 @@
 #ifdef USE_OLDER_CUDA4_GPU
 #else
   #ifdef USE_TEXTURES_FIELDS
-extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_tex;
-extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_veloc_tex;
-extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_tex;
+    // elastic
+    extern realw_texture d_displ_tex;
+    extern realw_texture d_veloc_tex;
+    extern realw_texture d_accel_tex;
+    // backward/reconstructed
+    extern realw_texture d_b_displ_tex;
+    extern realw_texture d_b_veloc_tex;
+    extern realw_texture d_b_accel_tex;
+    // acoustic
+    extern realw_texture d_potential_tex;
+    extern realw_texture d_potential_dot_dot_tex;
+    // backward/reconstructed
+    extern realw_texture d_b_potential_tex;
+    extern realw_texture d_b_potential_dot_dot_tex;
   #endif
-
   #ifdef USE_TEXTURES_CONSTANTS
-extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_tex;
+    extern realw_texture d_hprime_xx_tex;
   #endif
 #endif
 
@@ -64,7 +74,7 @@
 
 // copies integer array from CPU host to GPU device
 void copy_todevice_int(void** d_array_addr_ptr,int* h_array,int size){
-  TRACE("copy_todevice_int");
+  TRACE("  copy_todevice_int");
 
   // allocates memory on GPU
   //
@@ -86,7 +96,7 @@
 
 // copies realw array from CPU host to GPU device
 void copy_todevice_realw(void** d_array_addr_ptr,realw* h_array,int size){
-  TRACE("copy_todevice_realw");
+  TRACE("  copy_todevice_realw");
 
   // allocates memory on GPU
   print_CUDA_error_if_any(cudaMalloc((void**)d_array_addr_ptr,size*sizeof(realw)),
@@ -106,19 +116,15 @@
 extern "C"
 void FC_FUNC_(prepare_constants_device,
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
-                                        int* h_NGLLX,
-                                        int* NSPEC_AB, int* NGLOB_AB,
+                                        int* h_NGLLX, int* NSPEC_AB, int* NGLOB_AB,
                                         realw* h_xix, realw* h_xiy, realw* h_xiz,
                                         realw* h_etax, realw* h_etay, realw* h_etaz,
                                         realw* h_gammax, realw* h_gammay, realw* h_gammaz,
                                         realw* h_kappav, realw* h_muv,
                                         int* h_ibool,
-                                        int* num_interfaces_ext_mesh,
-                                        int* max_nibool_interfaces_ext_mesh,
-                                        int* h_nibool_interfaces_ext_mesh,
-                                        int* h_ibool_interfaces_ext_mesh,
-                                        realw* h_hprime_xx,
-                                        realw* h_hprimewgll_xx,
+                                        int* num_interfaces_ext_mesh, int* max_nibool_interfaces_ext_mesh,
+                                        int* h_nibool_interfaces_ext_mesh, int* h_ibool_interfaces_ext_mesh,
+                                        realw* h_hprime_xx, realw* h_hprimewgll_xx,
                                         realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
                                         int* ABSORBING_CONDITIONS,
                                         int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
@@ -126,24 +132,18 @@
                                         realw* h_abs_boundary_jacobian2Dw,
                                         int* h_num_abs_boundary_faces,
                                         int* h_ispec_is_inner,
-                                        int* NSOURCES,
-                                        int* nsources_local_f,
+                                        int* NSOURCES, int* nsources_local_f,
                                         realw* h_sourcearrays,
-                                        int* h_islice_selected_source,
-                                        int* h_ispec_selected_source,
-                                        int* h_number_receiver_global,
-                                        int* h_ispec_selected_rec,
+                                        int* h_islice_selected_source, int* h_ispec_selected_source,
+                                        int* h_number_receiver_global, int* h_ispec_selected_rec,
                                         int* nrec,int* nrec_local,
                                         int* SIMULATION_TYPE,
                                         int* USE_MESH_COLORING_GPU_f,
                                         int* nspec_acoustic,int* nspec_elastic,
-                                        int* my_neighbours_ext_mesh,
-                                        int* request_send_vector_ext_mesh,
-                                        int* request_recv_vector_ext_mesh,
-                                        realw* buffer_recv_vector_ext_mesh
-                                        ) {
+                                        int* h_myrank,
+                                        int* SAVE_FORWARD ) {
 
-TRACE("prepare_constants_device");
+  TRACE("prepare_constants_device");
 
   // allocates mesh parameter structure
   Mesh* mp = (Mesh*) malloc( sizeof(Mesh) );
@@ -155,14 +155,8 @@
     exit_on_error("NGLLX must be 5 for CUDA devices");
   }
 
-  // sets number of processes
-#ifdef WITH_MPI
-  int nproc;
-  MPI_Comm_size(MPI_COMM_WORLD,&nproc);
-  mp->NPROC = nproc;
-#else
-  mp->NPROC = 1;
-#endif
+  // sets this process's mpi rank
+  mp->myrank = *h_myrank;
 
   // sets global parameters
   mp->NSPEC_AB = *NSPEC_AB;
@@ -171,6 +165,7 @@
   // constants
   mp->simulation_type = *SIMULATION_TYPE;
   mp->absorbing_conditions = *ABSORBING_CONDITIONS;
+  mp->save_forward = *SAVE_FORWARD;
 
   // sets constant arrays
   setConst_hprime_xx(h_hprime_xx,mp);
@@ -192,21 +187,21 @@
   #ifdef USE_TEXTURES_CONSTANTS
   {
     #ifdef USE_OLDER_CUDA4_GPU
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
       const textureReference* d_hprime_xx_tex_ptr;
       print_CUDA_error_if_any(cudaGetTextureReference(&d_hprime_xx_tex_ptr, "d_hprime_xx_tex"), 4101);
-      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
       print_CUDA_error_if_any(cudaBindTexture(0, d_hprime_xx_tex_ptr, mp->d_hprime_xx, &channelDesc, sizeof(realw)*(NGLL2)), 4001);
    #else
       cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-      print_CUDA_error_if_any(cudaBindTexture(0, &d_hprime_xx, mp->d_hprime_xx, &channelDesc, sizeof(realw)*(NGLL2)), 4001);
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_hprime_xx_tex, mp->d_hprime_xx, &channelDesc, sizeof(realw)*(NGLL2)), 4001);
    #endif
   }
   #endif
 
-  /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
+  // mesh
+  // Assuming NGLLX=5. Padded is then 128 (5^3+3)
   int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
 
-  // mesh
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(realw)),1001);
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(realw)),1002);
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(realw)),1003);
@@ -248,113 +243,37 @@
   // global indexing
   copy_todevice_int((void**)&mp->d_ibool,h_ibool,NGLL3*(mp->NSPEC_AB));
 
-  //print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool,size_padded*sizeof(int)),1021);
-  //print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool, h_ibool,
-  //                                  NGLL3*(mp->NSPEC_AB)*sizeof(int),cudaMemcpyHostToDevice),1022);
-
-
   // prepare interprocess-edge exchange information
   mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh;
   mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh;
   if( mp->num_interfaces_ext_mesh > 0 ){
-
     copy_todevice_int((void**)&mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
                       mp->num_interfaces_ext_mesh);
-
-    //print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_ext_mesh,
-    //                                   (mp->num_interfaces_ext_mesh)*sizeof(int)),1201);
-    //print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
-    //                                   (mp->num_interfaces_ext_mesh)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
     copy_todevice_int((void**)&mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
                       (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh));
-
-    //print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_ext_mesh,
-    //                                   (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int)),1203);
-    //print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
-    //                                   (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int),
-    //                                   cudaMemcpyHostToDevice),1204);
   }
 
-  // Allocate pinned mpi-buffers.
-  // MPI buffers use pinned memory allocated by cudaMallocHost, which
-  // enables the use of asynchronous memory copies from host <->
-  // device
-  int size_mpi_buffer = 3 * (mp->num_interfaces_ext_mesh) * (mp->max_nibool_interfaces_ext_mesh);
-  // send buffer
-  mp->size_mpi_buffer = size_mpi_buffer;
-  if( mp->size_mpi_buffer > 0 ){
-    print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
-    mp->send_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
-    // adjoint
-    print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
-    // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
-
-    // receive buffer
-    print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
-    mp->recv_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
-  }
-
-  //daniel: check if needed
-  //mp->nibool_interfaces_ext_mesh = h_nibool_interfaces_ext_mesh;
-  //mp->my_neighbours_ext_mesh = my_neighbours_ext_mesh;
-  //mp->request_send_vector_ext_mesh = request_send_vector_ext_mesh;
-  //mp->request_recv_vector_ext_mesh = request_recv_vector_ext_mesh;
-  //mp->buffer_recv_vector_ext_mesh = buffer_recv_vector_ext_mesh;
-
   // setup two streams, one for compute and one for host<->device memory copies
   // compute stream
   cudaStreamCreate(&mp->compute_stream);
   // copy stream (needed to transfer mpi buffers)
-  if( mp->size_mpi_buffer > 0 ){
+  if( mp->num_interfaces_ext_mesh * mp->max_nibool_interfaces_ext_mesh > 0 ){
     cudaStreamCreate(&mp->copy_stream);
-    //cudaStreamCreate(&mp->b_copy_stream);
   }
 
   // inner elements
   copy_todevice_int((void**)&mp->d_ispec_is_inner,h_ispec_is_inner,mp->NSPEC_AB);
 
-//  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,mp->NSPEC_AB*sizeof(int)),1205);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner,
- //                                    mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),1206);
-
   // absorbing boundaries
   mp->d_num_abs_boundary_faces = *h_num_abs_boundary_faces;
   if( mp->absorbing_conditions && mp->d_num_abs_boundary_faces > 0 ){
     copy_todevice_int((void**)&mp->d_abs_boundary_ispec,h_abs_boundary_ispec,mp->d_num_abs_boundary_faces);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ispec),
-//                                       (mp->d_num_abs_boundary_faces)*sizeof(int)),1101);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ispec, h_abs_boundary_ispec,
-//                                       (mp->d_num_abs_boundary_faces)*sizeof(int),
-//                                       cudaMemcpyHostToDevice),1102);
-
     copy_todevice_int((void**)&mp->d_abs_boundary_ijk,h_abs_boundary_ijk,
                       3*NGLL2*(mp->d_num_abs_boundary_faces));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ijk),
-//                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk,
-//                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int),
-//                                       cudaMemcpyHostToDevice),1104);
-
     copy_todevice_realw((void**)&mp->d_abs_boundary_normal,h_abs_boundary_normal,
                         NDIM*NGLL2*(mp->d_num_abs_boundary_faces));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_normal),
-//                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1105);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal,
-//                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),1106);
-
     copy_todevice_realw((void**)&mp->d_abs_boundary_jacobian2Dw,h_abs_boundary_jacobian2Dw,
                         NGLL2*(mp->d_num_abs_boundary_faces));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_jacobian2Dw),
-//                                       NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1107);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw,
-//                                       NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw),
- //                                      cudaMemcpyHostToDevice),1108);
   }
 
   // sources
@@ -415,19 +334,15 @@
 
 extern "C"
 void FC_FUNC_(prepare_fields_acoustic_device,
-              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
-                                              realw* rmass_acoustic,
-                                              realw* rhostore,
-                                              realw* kappastore,
-                                              int* num_phase_ispec_acoustic,
-                                              int* phase_ispec_inner_acoustic,
+              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer,
+                                              realw* rmass_acoustic, realw* rhostore, realw* kappastore,
+                                              int* num_phase_ispec_acoustic, int* phase_ispec_inner_acoustic,
                                               int* ispec_is_acoustic,
                                               int* NOISE_TOMOGRAPHY,
                                               int* num_free_surface_faces,
                                               int* free_surface_ispec,
                                               int* free_surface_ijk,
-                                              int* b_reclen_potential,
-                                              realw* b_absorb_potential,
+                                              int* b_reclen_potential, realw* b_absorb_potential,
                                               int* ELASTIC_SIMULATION,
                                               int* num_coupling_ac_el_faces,
                                               int* coupling_ac_el_ispec,
@@ -440,31 +355,50 @@
 
   TRACE("prepare_fields_acoustic_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-  /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
-  int size_padded = NGLL3_PADDED * mp->NSPEC_AB;
-//  int size_nonpadded = NGLL3 * mp->NSPEC_AB;
-  int size_glob = mp->NGLOB_AB;
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   // allocates arrays on device (GPU)
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size_glob),2001);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size_glob),2002);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),2003);
+  int size = mp->NGLOB_AB;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size),2001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size),2002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size),2003);
+  // initializes values to zero
+  //print_CUDA_error_if_any(cudaMemset(mp->d_potential_acoustic,0,sizeof(realw)*size),2007);
+  //print_CUDA_error_if_any(cudaMemset(mp->d_potential_dot_acoustic,0,sizeof(realw)*size),2007);
+  //print_CUDA_error_if_any(cudaMemset(mp->d_potential_dot_dot_acoustic,0,sizeof(realw)*size),2007);
 
+  #ifdef USE_TEXTURES_FIELDS
+  {
+    #ifdef USE_OLDER_CUDA4_GPU
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      const textureReference* d_potential_tex_ref_ptr;
+      print_CUDA_error_if_any(cudaGetTextureReference(&d_potential_tex_ref_ptr, "d_potential_tex"), 2001);
+      print_CUDA_error_if_any(cudaBindTexture(0, d_potential_tex_ref_ptr, mp->d_potential_acoustic, &channelDesc, sizeof(realw)*size), 2001);
+
+      const textureReference* d_potential_dot_dot_tex_ref_ptr;
+      print_CUDA_error_if_any(cudaGetTextureReference(&d_potential_dot_dot_tex_ref_ptr, "d_potential_dot_dot_tex"), 2003);
+      print_CUDA_error_if_any(cudaBindTexture(0, d_potential_dot_dot_tex_ref_ptr, mp->d_potential_dot_dot_acoustic, &channelDesc, sizeof(realw)*size), 2003);
+    #else
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_potential_tex, mp->d_potential_acoustic, &channelDesc, sizeof(realw)*size), 2001);
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_potential_dot_dot_tex, mp->d_potential_dot_dot_acoustic, &channelDesc, sizeof(realw)*size), 2003);
+    #endif
+  }
+  #endif
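
When USE_TEXTURES_FIELDS is enabled, the bindings above route wavefield reads through the texture cache. Below is a minimal, self-contained sketch of that pattern; it assumes a pre-CUDA-12 toolkit (texture references were deprecated in CUDA 11 and removed in CUDA 12), and the names field_tex and read_through_tex are hypothetical, not part of SPECFEM3D. The older CUDA 4 path above differs only in looking the reference up by name with cudaGetTextureReference instead of taking its address directly.

#include <cstdio>
#include <cuda_runtime.h>

// file-scope texture reference, as required by the pre-CUDA-12 texture API
texture<float, 1, cudaReadModeElementType> field_tex;

__global__ void read_through_tex(float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = tex1Dfetch(field_tex, i);  // cached read of the bound buffer
}

int main() {
  const int n = 128;
  float *d_field = NULL, *d_out = NULL;
  cudaMalloc((void**)&d_field, n * sizeof(float));
  cudaMalloc((void**)&d_out, n * sizeof(float));
  cudaMemset(d_field, 0, n * sizeof(float));

  // bind the linear device buffer to the texture reference,
  // the same pattern used for d_potential_acoustic above
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
  cudaBindTexture(0, &field_tex, d_field, &desc, n * sizeof(float));

  read_through_tex<<<1, n>>>(d_out, n);
  cudaDeviceSynchronize();

  cudaUnbindTexture(&field_tex);
  cudaFree(d_field);
  cudaFree(d_out);
  printf("texture read done\n");
  return 0;
}
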
+
   // mpi buffer
-  if( mp->num_interfaces_ext_mesh > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
-                      (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),2004);
+  mp->size_mpi_buffer_potential = (mp->num_interfaces_ext_mesh) * (mp->max_nibool_interfaces_ext_mesh);
+  if( mp->size_mpi_buffer_potential > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),mp->size_mpi_buffer_potential *sizeof(realw)),2004);
   }
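
Each allocation above is wrapped in print_CUDA_error_if_any with a numeric tag (2001, 2004, ...) that identifies the failing call in an error report. A minimal, hypothetical equivalent of such a checker (the real routine is defined elsewhere in the CUDA sources) could look like this:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// prints a CUDA error together with the numeric tag of the failing call
static void check_cuda_error(cudaError_t err, int tag) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error %d at tag %d: %s\n", (int)err, tag, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

int main() {
  float* d_field = NULL;
  // the tag (here 2001) points back to this specific allocation in an error report
  check_cuda_error(cudaMalloc((void**)&d_field, 1024 * sizeof(float)), 2001);
  check_cuda_error(cudaMemset(d_field, 0, 1024 * sizeof(float)), 2002);
  cudaFree(d_field);
  printf("allocations checked\n");
  return 0;
}
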
 
   // mass matrix
   copy_todevice_realw((void**)&mp->d_rmass_acoustic,rmass_acoustic,mp->NGLOB_AB);
 
-//  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
-//                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
-
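
The copy_todevice_realw / copy_todevice_int helpers used throughout this revision replace the explicit cudaMalloc/cudaMemcpy pairs that remain above as commented-out code. A hedged sketch of what such a helper presumably wraps (the real definition lives elsewhere in the CUDA sources; copy_todevice_realw_sketch is a hypothetical stand-in):

#include <cstdio>
#include <cuda_runtime.h>

typedef float realw;

// hypothetical stand-in: allocates device memory and copies the host array into it
static void copy_todevice_realw_sketch(void** d_array_addr_ptr, realw* h_array, int size) {
  cudaMalloc(d_array_addr_ptr, size * sizeof(realw));
  cudaMemcpy(*d_array_addr_ptr, h_array, size * sizeof(realw), cudaMemcpyHostToDevice);
}

int main() {
  const int nglob = 1000;
  realw h_rmass[nglob];
  for (int i = 0; i < nglob; i++) h_rmass[i] = 1.0f;

  realw* d_rmass = NULL;
  copy_todevice_realw_sketch((void**)&d_rmass, h_rmass, nglob);

  cudaFree(d_rmass);
  printf("copied %d values\n", nglob);
  return 0;
}
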
+  // density
   // padded array
+  // Assuming NGLLX==5. Padded is then 128 (5^3+3)
+  int size_padded = NGLL3_PADDED * mp->NSPEC_AB;
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),2006);
   // transfer constant element data with padding
   for(int i=0; i < mp->NSPEC_AB; i++) {
@@ -475,64 +409,34 @@
   // non-padded array
   copy_todevice_realw((void**)&mp->d_kappastore,kappastore,NGLL3*mp->NSPEC_AB);
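
The density array above is transferred element by element into a padded layout, while kappastore is copied as-is. A self-contained sketch of that padding pattern, assuming NGLLX==5 so each element holds NGLL3 = 125 values rounded up to NGLL3_PADDED = 128 for aligned element starts (copy_padded_todevice is a hypothetical helper mirroring the elided loop above):

#include <cstdio>
#include <cuda_runtime.h>

#define NGLL3        125  /* 5^3 GLL points per element */
#define NGLL3_PADDED 128  /* rounded up so every element starts on a 128-float boundary */

// hypothetical helper mirroring the per-element padded transfer
static void copy_padded_todevice(float** d_array, const float* h_array, int nspec) {
  cudaMalloc((void**)d_array, (size_t)NGLL3_PADDED * nspec * sizeof(float));
  for (int ispec = 0; ispec < nspec; ispec++) {
    // only the 125 real values are copied; the last 3 floats of each slot remain padding
    cudaMemcpy(*d_array + (size_t)ispec * NGLL3_PADDED,
               h_array + (size_t)ispec * NGLL3,
               NGLL3 * sizeof(float), cudaMemcpyHostToDevice);
  }
}

int main() {
  const int nspec = 4;
  float h_rho[NGLL3 * nspec];
  for (int i = 0; i < NGLL3 * nspec; i++) h_rho[i] = 1.0f;

  float* d_rho = NULL;
  copy_padded_todevice(&d_rho, h_rho, nspec);

  cudaFree(d_rho);
  printf("transferred %d padded elements\n", nspec);
  return 0;
}
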
 
-//  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),2007);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
-//                                     NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),2105);
-
   // phase elements
   mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
-
   copy_todevice_int((void**)&mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic,
                     2*mp->num_phase_ispec_acoustic);
-
-//  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic),
-//                                      mp->num_phase_ispec_acoustic*2*sizeof(int)),2008);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic,
-//                                     mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),2101);
-
   copy_todevice_int((void**)&mp->d_ispec_is_acoustic,ispec_is_acoustic,mp->NSPEC_AB);
 
-//  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic),
-//                                     mp->NSPEC_AB*sizeof(int)),2009);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic,
-//                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),2102);
-
   // free surface
   if( *NOISE_TOMOGRAPHY == 0 ){
     // allocate surface arrays
     mp->num_free_surface_faces = *num_free_surface_faces;
     if( mp->num_free_surface_faces > 0 ){
-
       copy_todevice_int((void**)&mp->d_free_surface_ispec,free_surface_ispec,mp->num_free_surface_faces);
-
-//      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
-//                                       mp->num_free_surface_faces*sizeof(int)),2201);
-//      print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
-//                                       mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2203);
-
-
       copy_todevice_int((void**)&mp->d_free_surface_ijk,free_surface_ijk,
                         3*NGLL2*mp->num_free_surface_faces);
-
-//      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
-//                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int)),2202);
-//      print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-//                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2204);
     }
   }
 
   // absorbing boundaries
-  if( mp->absorbing_conditions ){
-    mp->d_b_reclen_potential = *b_reclen_potential;
-
-    copy_todevice_realw((void**)&mp->d_b_absorb_potential,b_absorb_potential,mp->d_b_reclen_potential);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),2301);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,
-//                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),2302);
+  if( mp->absorbing_conditions && mp->d_num_abs_boundary_faces > 0 ){
+    // absorb_field array used for file i/o
+    if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && mp->save_forward )){
+      // note: b_reclen_potential is record length in bytes ( CUSTOM_REAL * NGLLSQUARE * num_abs_boundary_faces )
+      mp->d_b_reclen_potential = *b_reclen_potential;
+      print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_b_absorb_potential,mp->d_b_reclen_potential),2201);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,mp->d_b_reclen_potential,cudaMemcpyHostToDevice),2202);
+    }
   }
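
As the note above points out, b_reclen_potential arrives already expressed in bytes (CUSTOM_REAL * NGLLSQUARE * num_abs_boundary_faces), so it is passed straight to cudaMalloc/cudaMemcpy without a further sizeof factor. A small illustrative sketch of that convention, with hypothetical names and sizes:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define NGLL2 25  /* NGLLSQUARE for NGLLX==5 */

int main() {
  const int num_abs_boundary_faces = 10;                                // illustrative count
  const size_t custom_real = sizeof(float);                             // CUSTOM_REAL, single precision
  const size_t reclen = custom_real * NGLL2 * num_abs_boundary_faces;   // bytes, not entries

  float* h_absorb = (float*)calloc(reclen / sizeof(float), sizeof(float));
  float* d_absorb = NULL;

  cudaMalloc((void**)&d_absorb, reclen);                                // reclen is already a byte count
  cudaMemcpy(d_absorb, h_absorb, reclen, cudaMemcpyHostToDevice);

  printf("absorbing record: %zu bytes (%zu floats)\n", reclen, reclen / sizeof(float));

  cudaFree(d_absorb);
  free(h_absorb);
  return 0;
}
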
 
-
   // for seismograms
   if( mp->nrec_local > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
@@ -542,40 +446,14 @@
     if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
   }
 
-
   // coupling with elastic parts
   if( *ELASTIC_SIMULATION && *num_coupling_ac_el_faces > 0 ){
-
     copy_todevice_int((void**)&mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec,(*num_coupling_ac_el_faces));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ispec),
-//                                       (*num_coupling_ac_el_faces)*sizeof(int)),2601);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec,
-//                                       (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2602);
-
     copy_todevice_int((void**)&mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,3*NGLL2*(*num_coupling_ac_el_faces));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk),
-//                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),2603);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,
-//                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2604);
-
     copy_todevice_realw((void**)&mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
                         3*NGLL2*(*num_coupling_ac_el_faces));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal),
-//                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2605);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
-//                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2606);
-
     copy_todevice_realw((void**)&mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
                         NGLL2*(*num_coupling_ac_el_faces));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw),
-//                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2607);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
-//                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2608);
-
   }
 
   // mesh coloring
@@ -595,41 +473,65 @@
 
 extern "C"
 void FC_FUNC_(prepare_fields_acoustic_adj_dev,
-              PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
+              PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer,
                                               int* APPROXIMATE_HESS_KL) {
 
   TRACE("prepare_fields_acoustic_adj_dev");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
-  int size_glob = mp->NGLOB_AB;
-
   // kernel simulations
   if( mp->simulation_type != 3 ) return;
 
   // allocates backward/reconstructed arrays on device (GPU)
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),3014);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size_glob),3015);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size_glob),3016);
+  int size = mp->NGLOB_AB;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size),3014);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size),3015);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size),3016);
+  // initializes values to zero
+  //print_CUDA_error_if_any(cudaMemset(mp->d_b_potential_acoustic,0,sizeof(realw)*size),3007);
+  //print_CUDA_error_if_any(cudaMemset(mp->d_b_potential_dot_acoustic,0,sizeof(realw)*size),3007);
+  //print_CUDA_error_if_any(cudaMemset(mp->d_b_potential_dot_dot_acoustic,0,sizeof(realw)*size),3007);
 
-  // allocates kernels
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3017);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3018);
+  #ifdef USE_TEXTURES_FIELDS
+  {
+    #ifdef USE_OLDER_CUDA4_GPU
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      const textureReference* d_b_potential_tex_ref_ptr;
+      print_CUDA_error_if_any(cudaGetTextureReference(&d_b_potential_tex_ref_ptr, "d_b_potential_tex"), 3001);
+      print_CUDA_error_if_any(cudaBindTexture(0, d_b_potential_tex_ref_ptr, mp->d_b_potential_acoustic, &channelDesc, sizeof(realw)*size), 3001);
 
+      const textureReference* d_b_potential_dot_dot_tex_ref_ptr;
+      print_CUDA_error_if_any(cudaGetTextureReference(&d_b_potential_dot_dot_tex_ref_ptr, "d_b_potential_dot_dot_tex"),3003);
+      print_CUDA_error_if_any(cudaBindTexture(0, d_b_potential_dot_dot_tex_ref_ptr, mp->d_b_potential_dot_dot_acoustic, &channelDesc, sizeof(realw)*size), 3003);
+    #else
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_b_potential_tex, mp->d_b_potential_acoustic, &channelDesc, sizeof(realw)*size), 3001);
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_b_potential_dot_dot_tex, mp->d_b_potential_dot_dot_acoustic, &channelDesc, sizeof(realw)*size), 3003);
+    #endif
+  }
+  #endif
+
+  // allocates kernels
+  size = NGLL3*mp->NSPEC_AB;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),size*sizeof(realw)),3017);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),size*sizeof(realw)),3018);
   // initializes kernel values to zero
-  print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,
-                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),3019);
-  print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,
-                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),3020);
+  print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,size*sizeof(realw)),3019);
+  print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,size*sizeof(realw)),3020);
 
   // preconditioner
   if( *APPROXIMATE_HESS_KL ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3030);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),size*sizeof(realw)),3030);
     // initializes with zeros
-    print_CUDA_error_if_any(cudaMemset(mp->d_hess_ac_kl,0,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),3031);
+    print_CUDA_error_if_any(cudaMemset(mp->d_hess_ac_kl,0,size*sizeof(realw)),3031);
   }
 
+  // mpi buffer
+  if( mp->size_mpi_buffer_potential > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_potential_dot_dot_buffer),mp->size_mpi_buffer_potential*sizeof(realw)),3014);
+  }
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_fields_acoustic_adj_dev");
 #endif
@@ -644,19 +546,13 @@
 
 extern "C"
 void FC_FUNC_(prepare_fields_elastic_device,
-              PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
-                                             int* size,
-                                             realw* rmassx,
-                                             realw* rmassy,
-                                             realw* rmassz,
-                                             realw* rho_vp,
-                                             realw* rho_vs,
+              PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer,
+                                             realw* rmassx, realw* rmassy, realw* rmassz,
+                                             realw* rho_vp, realw* rho_vs,
                                              int* num_phase_ispec_elastic,
                                              int* phase_ispec_inner_elastic,
                                              int* ispec_is_elastic,
-                                             realw* h_b_absorb_field,
-                                             int* h_b_reclen_field,
-                                             int* SAVE_FORWARD,
+                                             realw* b_absorb_field, int* b_reclen_field,
                                              int* COMPUTE_AND_STORE_STRAIN,
                                              realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
                                              realw* epsilondev_xz,realw* epsilondev_yz,
@@ -677,309 +573,214 @@
                                              int* num_colors_inner_elastic,
                                              int* num_elem_colors_elastic,
                                              int* ANISOTROPY,
-                                             realw *c11store,
-                                             realw *c12store,
-                                             realw *c13store,
-                                             realw *c14store,
-                                             realw *c15store,
-                                             realw *c16store,
-                                             realw *c22store,
-                                             realw *c23store,
-                                             realw *c24store,
-                                             realw *c25store,
-                                             realw *c26store,
-                                             realw *c33store,
-                                             realw *c34store,
-                                             realw *c35store,
-                                             realw *c36store,
-                                             realw *c44store,
-                                             realw *c45store,
-                                             realw *c46store,
-                                             realw *c55store,
-                                             realw *c56store,
-                                             realw *c66store){
+                                             realw *c11store,realw *c12store,realw *c13store,
+                                             realw *c14store,realw *c15store,realw *c16store,
+                                             realw *c22store,realw *c23store,realw *c24store,
+                                             realw *c25store,realw *c26store,realw *c33store,
+                                             realw *c34store,realw *c35store,realw *c36store,
+                                             realw *c44store,realw *c45store,realw *c46store,
+                                             realw *c55store,realw *c56store,realw *c66store ){
 
-TRACE("prepare_fields_elastic_device");
+  TRACE("prepare_fields_elastic_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-  /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
-  int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
-//  int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+  int size;
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),4001);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),4003);
+  // debug
+  //printf("prepare_fields_elastic_device: rank %d - wavefield setup\n",mp->myrank);
+  //synchronize_mpi();
 
+  // elastic wavefields
+  size = NDIM * mp->NGLOB_AB;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*size),4001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*size),4002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*size),4003);
+  // initializes values to zero
+  //print_CUDA_error_if_any(cudaMemset(mp->d_displ,0,sizeof(realw)*size),4007);
+  //print_CUDA_error_if_any(cudaMemset(mp->d_veloc,0,sizeof(realw)*size),4007);
+  //print_CUDA_error_if_any(cudaMemset(mp->d_accel,0,sizeof(realw)*size),4007);
+
   #ifdef USE_TEXTURES_FIELDS
   {
     #ifdef USE_OLDER_CUDA4_GPU
-      print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_tex_ref_ptr, "d_displ_tex"), 4001);
       cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-      print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_tex_ref_ptr, mp->d_displ, &channelDesc, sizeof(realw)*(*size)), 4001);
-    #else
-      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-      print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_tex, mp->d_displ, &channelDesc, sizeof(realw)*(*size)), 4001);
-    #endif
-  }
+      const textureReference* d_displ_tex_ref_ptr;
+      print_CUDA_error_if_any(cudaGetTextureReference(&d_displ_tex_ref_ptr, "d_displ_tex"), 4001);
+      print_CUDA_error_if_any(cudaBindTexture(0, d_displ_tex_ref_ptr, mp->d_displ, &channelDesc, sizeof(realw)*size), 4001);
 
-  {
-    #ifdef USE_OLDER_CUDA4_GPU
-      print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_veloc_tex_ref_ptr, "d_veloc_tex"), 4002);
-      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-      print_CUDA_error_if_any(cudaBindTexture(0, mp->d_veloc_tex_ref_ptr, mp->d_veloc, &channelDesc, sizeof(realw)*(*size)), 4002);
-    #else
-      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-      print_CUDA_error_if_any(cudaBindTexture(0, &d_veloc_tex, mp->d_veloc, &channelDesc, sizeof(realw)*(*size)), 4002);
-    #endif
-  }
+      const textureReference* d_veloc_tex_ref_ptr;
+      print_CUDA_error_if_any(cudaGetTextureReference(&d_veloc_tex_ref_ptr, "d_veloc_tex"), 4002);
+      print_CUDA_error_if_any(cudaBindTexture(0, d_veloc_tex_ref_ptr, mp->d_veloc, &channelDesc, sizeof(realw)*size), 4002);
 
-  {
-    #ifdef USE_OLDER_CUDA4_GPU
-      print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_tex_ref_ptr, "d_accel_tex"), 4003);
-      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-      print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_tex_ref_ptr, mp->d_accel, &channelDesc, sizeof(realw)*(*size)), 4003);
+      const textureReference* d_accel_tex_ref_ptr;
+      print_CUDA_error_if_any(cudaGetTextureReference(&d_accel_tex_ref_ptr, "d_accel_tex"), 4003);
+      print_CUDA_error_if_any(cudaBindTexture(0, d_accel_tex_ref_ptr, mp->d_accel, &channelDesc, sizeof(realw)*size), 4003);
     #else
       cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-      print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_tex, mp->d_accel, &channelDesc, sizeof(realw)*(*size)), 4003);
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_tex, mp->d_displ, &channelDesc, sizeof(realw)*size), 4001);
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_veloc_tex, mp->d_veloc, &channelDesc, sizeof(realw)*size), 4002);
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_tex, mp->d_accel, &channelDesc, sizeof(realw)*size), 4003);
     #endif
   }
   #endif
 
-  // mpi buffer
+  // debug
+  //synchronize_mpi();
+
+  // MPI buffer
+  mp->size_mpi_buffer = NDIM * (mp->num_interfaces_ext_mesh) * (mp->max_nibool_interfaces_ext_mesh);
   if( mp->size_mpi_buffer > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
-                                      mp->size_mpi_buffer*sizeof(realw)),4004);
+    // note: Allocate pinned mpi-buffers.
+    //       MPI buffers use pinned memory allocated by cudaMallocHost, which
+    //       enables the use of asynchronous memory copies from host <-> device
+    // send buffer
+    print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
+    //mp->send_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
+    // adjoint
+    //print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
+    // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+    // receive buffer
+    print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
+    mp->recv_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
+
+    // non-pinned buffer
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),mp->size_mpi_buffer*sizeof(realw)),4004);
+    // adjoint
+    if( mp->simulation_type == 3 ){
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer),mp->size_mpi_buffer*sizeof(realw)),4004);
+    }
   }
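
The pinned send/receive buffers allocated above with cudaMallocHost are what make asynchronous host <-> device transfers possible for the MPI exchange: page-locked memory can be DMA'd directly by the GPU, so cudaMemcpyAsync on a stream returns immediately and overlaps with kernel execution, whereas with pageable memory the same call falls back to a synchronous staging copy. A self-contained sketch of that pattern (buffer and stream names are hypothetical):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  const size_t n = 1 << 20;
  float *h_pinned = NULL, *d_buf = NULL;
  cudaStream_t stream;

  cudaStreamCreate(&stream);
  cudaMallocHost((void**)&h_pinned, n * sizeof(float));  // page-locked host buffer
  cudaMalloc((void**)&d_buf, n * sizeof(float));

  for (size_t i = 0; i < n; i++) h_pinned[i] = 1.0f;

  // asynchronous copy: returns immediately, transfer overlaps with other work on other streams
  cudaMemcpyAsync(d_buf, h_pinned, n * sizeof(float), cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);  // wait before reusing h_pinned (e.g. handing it to MPI)

  cudaFree(d_buf);
  cudaFreeHost(h_pinned);
  cudaStreamDestroy(stream);
  printf("async transfer complete\n");
  return 0;
}
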
 
+  // debug
+  //printf("prepare_fields_elastic_device: rank %d - mass matrix\n",mp->myrank);
+  //synchronize_mpi();
+
   // mass matrix
   copy_todevice_realw((void**)&mp->d_rmassx,rmassx,mp->NGLOB_AB);
   copy_todevice_realw((void**)&mp->d_rmassy,rmassy,mp->NGLOB_AB);
   copy_todevice_realw((void**)&mp->d_rmassz,rmassz,mp->NGLOB_AB);
 
-//  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005);
-//  // transfer element data
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
-//                                     sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4010);
-
-
   // element indices
   copy_todevice_int((void**)&mp->d_ispec_is_elastic,ispec_is_elastic,mp->NSPEC_AB);
 
-//  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),4009);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic,
-//                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),4012);
-
   // phase elements
   mp->num_phase_ispec_elastic = *num_phase_ispec_elastic;
 
   copy_todevice_int((void**)&mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,2*mp->num_phase_ispec_elastic);
 
-//  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic),
-//                                     mp->num_phase_ispec_elastic*2*sizeof(int)),4008);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,
-//                                     mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),4011);
+  // debug
+  //synchronize_mpi();
 
   // for seismograms
   if( mp->nrec_local > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
-                                     3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+    // debug
+    //printf("prepare_fields_elastic_device: rank %d - seismogram setup\n",mp->myrank);
 
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+
     mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
     if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
   }
 
+  // debug
+  //synchronize_mpi();
+
   // absorbing conditions
   if( mp->absorbing_conditions && mp->d_num_abs_boundary_faces > 0){
+
+    // debug
+    //printf("prepare_fields_elastic_device: rank %d - absorbing boundary setup\n",mp->myrank);
+
     // non-padded arrays
+    // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
     copy_todevice_realw((void**)&mp->d_rho_vp,rho_vp,NGLL3*mp->NSPEC_AB);
     copy_todevice_realw((void**)&mp->d_rho_vs,rho_vs,NGLL3*mp->NSPEC_AB);
 
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),4006);
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),4007);
-
-    // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
-//                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4013);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
-//                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4014);
-
     // absorb_field array used for file i/o
-    if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && *SAVE_FORWARD )){
-      mp->d_b_reclen_field = *h_b_reclen_field;
+    if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && mp->save_forward )){
+      // note: b_reclen_field is length in bytes already (CUSTOM_REAL * NDIM * NGLLSQUARE * num_abs_boundary_faces )
+      mp->d_b_reclen_field = *b_reclen_field;
 
-      copy_todevice_realw((void**)&mp->d_b_absorb_field,h_b_absorb_field,mp->d_b_reclen_field);
+      // debug
+      //printf("prepare_fields_elastic_device: rank %d - absorbing boundary i/o %d\n",mp->myrank,mp->d_b_reclen_field);
 
-//      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field),
-//                                       mp->d_b_reclen_field),4016);
-//      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
-//                                       mp->d_b_reclen_field,cudaMemcpyHostToDevice),4017);
+      print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_b_absorb_field,mp->d_b_reclen_field),4101);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,b_absorb_field,mp->d_b_reclen_field,cudaMemcpyHostToDevice),4102);
+
     }
   }
 
+  // debug
+  //synchronize_mpi();
+
   // strains used for attenuation and kernel simulations
   if( *COMPUTE_AND_STORE_STRAIN ){
-    // strains
-    int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+    // debug
+    //printf("prepare_fields_elastic_device: rank %d - strain setup\n",mp->myrank);
+    //synchronize_mpi();
 
-    copy_todevice_realw((void**)&mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
-//                                       epsilondev_size*sizeof(realw)),4301);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),4302);
-
-    copy_todevice_realw((void**)&mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
-//                                       epsilondev_size*sizeof(realw)),4302);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),4303);
-
-    copy_todevice_realw((void**)&mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
-//                                       epsilondev_size*sizeof(realw)),4304);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),4305);
-
-    copy_todevice_realw((void**)&mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
-//                                       epsilondev_size*sizeof(realw)),4306);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),4307);
-
-    copy_todevice_realw((void**)&mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
-//                                       epsilondev_size*sizeof(realw)),4308);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),4309);
-
+    // strains
+    size = NGLL3 * mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+    copy_todevice_realw((void**)&mp->d_epsilondev_xx,epsilondev_xx,size);
+    copy_todevice_realw((void**)&mp->d_epsilondev_yy,epsilondev_yy,size);
+    copy_todevice_realw((void**)&mp->d_epsilondev_xy,epsilondev_xy,size);
+    copy_todevice_realw((void**)&mp->d_epsilondev_xz,epsilondev_xz,size);
+    copy_todevice_realw((void**)&mp->d_epsilondev_yz,epsilondev_yz,size);
   }
 
   // attenuation memory variables
   if( *ATTENUATION ){
-    // memory arrays
-    copy_todevice_realw((void**)&mp->d_R_xx,R_xx,(*R_size));
+    // debug
+    //printf("prepare_fields_elastic_device: rank %d - attenuation setup\n",mp->myrank);
+    //synchronize_mpi();
 
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
-//                                       (*R_size)*sizeof(realw)),4401);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),4402);
-
-    copy_todevice_realw((void**)&mp->d_R_yy,R_yy,(*R_size));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
-//                                       (*R_size)*sizeof(realw)),4403);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),4404);
-
-    copy_todevice_realw((void**)&mp->d_R_xy,R_xy,(*R_size));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
-//                                       (*R_size)*sizeof(realw)),4405);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),4406);
-
-    copy_todevice_realw((void**)&mp->d_R_xz,R_xz,(*R_size));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
-//                                       (*R_size)*sizeof(realw)),4407);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),4408);
-
-    copy_todevice_realw((void**)&mp->d_R_yz,R_yz,(*R_size));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
-//                                       (*R_size)*sizeof(realw)),4409);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),4410);
-
+    // memory arrays
+    size = *R_size;
+    copy_todevice_realw((void**)&mp->d_R_xx,R_xx,size);
+    copy_todevice_realw((void**)&mp->d_R_yy,R_yy,size);
+    copy_todevice_realw((void**)&mp->d_R_xy,R_xy,size);
+    copy_todevice_realw((void**)&mp->d_R_xz,R_xz,size);
+    copy_todevice_realw((void**)&mp->d_R_yz,R_yz,size);
     // attenuation factors
     copy_todevice_realw((void**)&mp->d_one_minus_sum_beta,one_minus_sum_beta,NGLL3*mp->NSPEC_AB);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
-//                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),4430);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
-//                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4431);
-
     copy_todevice_realw((void**)&mp->d_factor_common,factor_common,N_SLS*NGLL3*mp->NSPEC_AB);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
-//                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),4432);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
-//                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4433);
-
     // alpha,beta,gamma factors
     copy_todevice_realw((void**)&mp->d_alphaval,alphaval,N_SLS);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
-//                                       N_SLS*sizeof(realw)),4434);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
-//                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435);
-
     copy_todevice_realw((void**)&mp->d_betaval,betaval,N_SLS);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
-//                                       N_SLS*sizeof(realw)),4436);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
-//                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437);
-
     copy_todevice_realw((void**)&mp->d_gammaval,gammaval,N_SLS);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
-//                                       N_SLS*sizeof(realw)),4438);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
-//                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439);
-
   }
 
   // anisotropy
   if( *ANISOTROPY ){
+    // debug
+    //printf("prepare_fields_elastic_device: rank %d - attenuation setup\n",mp->myrank);
+    //synchronize_mpi();
+
+    // Assuming NGLLX==5. Padded is then 128 (5^3+3)
+    int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
+
     // allocates memory on GPU
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),
-                                       size_padded*sizeof(realw)),4700);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),
-                                       size_padded*sizeof(realw)),4701);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),
-                                       size_padded*sizeof(realw)),4702);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),
-                                       size_padded*sizeof(realw)),4703);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),
-                                       size_padded*sizeof(realw)),4704);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),
-                                       size_padded*sizeof(realw)),4705);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),
-                                       size_padded*sizeof(realw)),4706);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),
-                                       size_padded*sizeof(realw)),4707);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),
-                                       size_padded*sizeof(realw)),4708);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),
-                                       size_padded*sizeof(realw)),4709);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),
-                                       size_padded*sizeof(realw)),4710);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),
-                                       size_padded*sizeof(realw)),4711);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),
-                                       size_padded*sizeof(realw)),4712);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),
-                                       size_padded*sizeof(realw)),4713);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),
-                                       size_padded*sizeof(realw)),4714);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),
-                                       size_padded*sizeof(realw)),4715);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),
-                                       size_padded*sizeof(realw)),4716);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),
-                                       size_padded*sizeof(realw)),4717);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),
-                                       size_padded*sizeof(realw)),4718);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),
-                                       size_padded*sizeof(realw)),4719);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),
-                                       size_padded*sizeof(realw)),4720);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),size_padded*sizeof(realw)),4700);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),size_padded*sizeof(realw)),4701);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),size_padded*sizeof(realw)),4702);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),size_padded*sizeof(realw)),4703);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),size_padded*sizeof(realw)),4704);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),size_padded*sizeof(realw)),4705);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),size_padded*sizeof(realw)),4706);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),size_padded*sizeof(realw)),4707);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),size_padded*sizeof(realw)),4708);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),size_padded*sizeof(realw)),4709);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),size_padded*sizeof(realw)),4710);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),size_padded*sizeof(realw)),4711);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),size_padded*sizeof(realw)),4712);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),size_padded*sizeof(realw)),4713);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),size_padded*sizeof(realw)),4714);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),size_padded*sizeof(realw)),4715);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),size_padded*sizeof(realw)),4716);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),size_padded*sizeof(realw)),4717);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),size_padded*sizeof(realw)),4718);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),size_padded*sizeof(realw)),4719);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),size_padded*sizeof(realw)),4720);
 
     // transfer constant element data with padding
     for(int i=0;i < mp->NSPEC_AB;i++) {
@@ -1030,45 +831,26 @@
 
   // ocean load approximation
   if( *APPROXIMATE_OCEAN_LOAD ){
+    // debug
+    //printf("prepare_fields_elastic_device: rank %d - ocean load setup\n",mp->myrank);
+    //synchronize_mpi();
+
     // oceans needs a free surface
     mp->num_free_surface_faces = *num_free_surface_faces;
     if( mp->num_free_surface_faces > 0 ){
       // mass matrix
       copy_todevice_realw((void**)&mp->d_rmass_ocean_load,rmass_ocean_load,mp->NGLOB_AB);
-
-//      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),
-//                                         sizeof(realw)*mp->NGLOB_AB),4501);
-//      print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
-//                                         sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4502);
       // surface normal
       copy_todevice_realw((void**)&mp->d_free_surface_normal,free_surface_normal,
                           3*NGLL2*(mp->num_free_surface_faces));
-
-//      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
-//                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),4503);
-//      print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
-//                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),4504);
-
       // temporary global array: used to synchronize updates on global accel array
       print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load),
                                          sizeof(int)*mp->NGLOB_AB),4505);
 
       if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){
-
         copy_todevice_int((void**)&mp->d_free_surface_ispec,free_surface_ispec,mp->num_free_surface_faces);
-
-//        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
-//                                          mp->num_free_surface_faces*sizeof(int)),4601);
-//        print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
-//                                          mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4603);
-
         copy_todevice_int((void**)&mp->d_free_surface_ijk,free_surface_ijk,
                           3*NGLL2*mp->num_free_surface_faces);
-
-//        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
-//                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4602);
-//        print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-//                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4604);
       }
     }
   }
@@ -1082,6 +864,10 @@
 
   // JC JC here we will need to add GPU support for the new C-PML routines
 
+  // debug
+  //printf("prepare_fields_elastic_device: rank %d - done\n",mp->myrank);
+  //synchronize_mpi();
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_fields_elastic_device");
 #endif
@@ -1091,8 +877,8 @@
 
 extern "C"
 void FC_FUNC_(prepare_fields_elastic_adj_dev,
-              PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
-                                             int* size,
+              PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer,
+                                             int* size_f,
                                              int* COMPUTE_AND_STORE_STRAIN,
                                              realw* epsilon_trace_over_3,
                                              realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
@@ -1102,147 +888,144 @@
                                              int* R_size,
                                              realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
                                              realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
+                                             int* ANISOTROPIC_KL,
                                              int* APPROXIMATE_HESS_KL){
 
   TRACE("prepare_fields_elastic_adj_dev");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+  int size;
 
   // checks if kernel simulation
   if( mp->simulation_type != 3 ) return;
 
   // kernel simulations
+  // debug
+  //printf("prepare_fields_elastic_adj_dev: rank %d - kernel setup\n",mp->myrank);
+  //synchronize_mpi();
+
+  // backward/reconstructed wavefields
   // allocates backward/reconstructed arrays on device (GPU)
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*(*size)),5201);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*(*size)),5202);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*(*size)),5203);
+  size = *size_f;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*size),5201);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*size),5202);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*size),5203);
+  // initializes values to zero
+  //print_CUDA_error_if_any(cudaMemset(mp->d_b_displ,0,sizeof(realw)*size),5207);
+  //print_CUDA_error_if_any(cudaMemset(mp->d_b_veloc,0,sizeof(realw)*size),5207);
+  //print_CUDA_error_if_any(cudaMemset(mp->d_b_accel,0,sizeof(realw)*size),5207);
 
-  // allocates kernels
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5204);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5205);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5206);
+  #ifdef USE_TEXTURES_FIELDS
+  {
+    #ifdef USE_OLDER_CUDA4_GPU
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      const textureReference* d_b_displ_tex_ref_ptr;
+      print_CUDA_error_if_any(cudaGetTextureReference(&d_b_displ_tex_ref_ptr, "d_b_displ_tex"), 4001);
+      print_CUDA_error_if_any(cudaBindTexture(0, d_b_displ_tex_ref_ptr, mp->d_b_displ, &channelDesc, sizeof(realw)*size), 4001);
 
+      const textureReference* d_b_veloc_tex_ref_ptr;
+      print_CUDA_error_if_any(cudaGetTextureReference(&d_b_veloc_tex_ref_ptr, "d_b_veloc_tex"), 4002);
+      print_CUDA_error_if_any(cudaBindTexture(0, d_b_veloc_tex_ref_ptr, mp->d_b_veloc, &channelDesc, sizeof(realw)*size), 4002);
+
+      const textureReference* d_b_accel_tex_ref_ptr;
+      print_CUDA_error_if_any(cudaGetTextureReference(&d_b_accel_tex_ref_ptr, "d_b_accel_tex"), 4003);
+      print_CUDA_error_if_any(cudaBindTexture(0, d_b_accel_tex_ref_ptr, mp->d_b_accel, &channelDesc, sizeof(realw)*size), 4003);
+    #else
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_b_displ_tex, mp->d_b_displ, &channelDesc, sizeof(realw)*size), 4001);
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_b_veloc_tex, mp->d_b_veloc, &channelDesc, sizeof(realw)*size), 4002);
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_b_accel_tex, mp->d_b_accel, &channelDesc, sizeof(realw)*size), 4003);
+    #endif
+  }
+  #endif
+
+
+  // anisotropic kernel flag
+  mp->anisotropic_kl = *ANISOTROPIC_KL;
+
+  // anisotropic/isotropic kernels
+  // debug
+  //printf("prepare_fields_elastic_adj_dev: rank %d -  anisotropic/isotropic kernels\n",mp->myrank);
+  //synchronize_mpi();
+
+  // allocates kernels
+  size = NGLL3 * mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+  // density kernel
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),size*sizeof(realw)),5204);
   // initializes kernel values to zero
-  print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,
-                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5207);
-  print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,
-                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5208);
-  print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,
-                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5209);
+  print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,size*sizeof(realw)),5214);
 
+  if( mp->anisotropic_kl ){
+    // anisotropic kernels
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_cijkl_kl),21*size*sizeof(realw)),5205);
+    print_CUDA_error_if_any(cudaMemset(mp->d_cijkl_kl,0,21*size*sizeof(realw)),5215);
+
+  }else{
+    // isotropic kernels
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),size*sizeof(realw)),5206);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),size*sizeof(realw)),5207);
+    print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,size*sizeof(realw)),5216);
+    print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,size*sizeof(realw)),5217);
+  }
+
   // strains used for attenuation and kernel simulations
   if( *COMPUTE_AND_STORE_STRAIN ){
     // strains
-    int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+    // debug
+    //printf("prepare_fields_elastic_adj_dev: rank %d - strains\n",mp->myrank);
+    //synchronize_mpi();
 
+    size = NGLL3 * mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+
     // solid pressure
-    copy_todevice_realw((void**)&mp->d_epsilon_trace_over_3,epsilon_trace_over_3,NGLL3*mp->NSPEC_AB);
+    copy_todevice_realw((void**)&mp->d_epsilon_trace_over_3,epsilon_trace_over_3,size);
 
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
-//                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5310);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
-//                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5311);
     // backward solid pressure
+    copy_todevice_realw((void**)&mp->d_b_epsilon_trace_over_3,b_epsilon_trace_over_3,size);
 
-    copy_todevice_realw((void**)&mp->d_b_epsilon_trace_over_3,b_epsilon_trace_over_3,NGLL3*mp->NSPEC_AB);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
-//                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5312);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
-//                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5313);
-
     // prepares backward strains
-
-    copy_todevice_realw((void**)&mp->d_b_epsilondev_xx,b_epsilondev_xx,epsilondev_size);
-    copy_todevice_realw((void**)&mp->d_b_epsilondev_yy,b_epsilondev_yy,epsilondev_size);
-    copy_todevice_realw((void**)&mp->d_b_epsilondev_xy,b_epsilondev_xy,epsilondev_size);
-    copy_todevice_realw((void**)&mp->d_b_epsilondev_xz,b_epsilondev_xz,epsilondev_size);
-    copy_todevice_realw((void**)&mp->d_b_epsilondev_yz,b_epsilondev_yz,epsilondev_size);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
-//                                       epsilondev_size*sizeof(realw)),5321);
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
-//                                       epsilondev_size*sizeof(realw)),5322);
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
-//                                       epsilondev_size*sizeof(realw)),5323);
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
-//                                       epsilondev_size*sizeof(realw)),5324);
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
-//                                       epsilondev_size*sizeof(realw)),5325);
-
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
-//                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5326);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
-//                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5327);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
-//                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5328);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
-//                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5329);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
-//                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5330);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_xx,b_epsilondev_xx,size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_yy,b_epsilondev_yy,size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_xy,b_epsilondev_xy,size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_xz,b_epsilondev_xz,size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_yz,b_epsilondev_yz,size);
   }
 
   // attenuation memory variables
   if( *ATTENUATION ){
-    copy_todevice_realw((void**)&mp->d_b_R_xx,b_R_xx,(*R_size));
-    copy_todevice_realw((void**)&mp->d_b_R_yy,b_R_yy,(*R_size));
-    copy_todevice_realw((void**)&mp->d_b_R_xy,b_R_xy,(*R_size));
-    copy_todevice_realw((void**)&mp->d_b_R_xz,b_R_xz,(*R_size));
-    copy_todevice_realw((void**)&mp->d_b_R_yz,b_R_yz,(*R_size));
+    // debug
+    //printf("prepare_fields_elastic_adj_dev: rank %d - attenuation\n",mp->myrank);
+    //synchronize_mpi();
 
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx),
-//                                       (*R_size)*sizeof(realw)),5421);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),5422);
+    size = *R_size;
 
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy),
-//                                       (*R_size)*sizeof(realw)),5423);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),5424);
+    copy_todevice_realw((void**)&mp->d_b_R_xx,b_R_xx,size);
+    copy_todevice_realw((void**)&mp->d_b_R_yy,b_R_yy,size);
+    copy_todevice_realw((void**)&mp->d_b_R_xy,b_R_xy,size);
+    copy_todevice_realw((void**)&mp->d_b_R_xz,b_R_xz,size);
+    copy_todevice_realw((void**)&mp->d_b_R_yz,b_R_yz,size);
 
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy),
-//                                       (*R_size)*sizeof(realw)),5425);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),5426);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz),
-//                                       (*R_size)*sizeof(realw)),5427);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),5428);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz),
-//                                       (*R_size)*sizeof(realw)),5429);
-//   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(realw),
-//                                       cudaMemcpyHostToDevice),5420);
-
     // alpha,beta,gamma factors for backward fields
     copy_todevice_realw((void**)&mp->d_b_alphaval,b_alphaval,N_SLS);
     copy_todevice_realw((void**)&mp->d_b_betaval,b_betaval,N_SLS);
     copy_todevice_realw((void**)&mp->d_b_gammaval,b_gammaval,N_SLS);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval),
-//                                       N_SLS*sizeof(realw)),5434);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
-//                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5435);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval),
-//                                       N_SLS*sizeof(realw)),5436);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
-//                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5437);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval),
-//                                       N_SLS*sizeof(realw)),5438);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
-//                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5439);
   }
 
+  // approximate hessian kernel
   if( *APPROXIMATE_HESS_KL ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5450);
-    // initializes with zeros
-    print_CUDA_error_if_any(cudaMemset(mp->d_hess_el_kl,0,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5451);
+    // debug
+    //printf("prepare_fields_elastic_adj_dev: rank %d - hessian kernel\n",mp->myrank);
+    //synchronize_mpi();
+
+    size = NGLL3 * mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),size*sizeof(realw)),5450);
+    print_CUDA_error_if_any(cudaMemset(mp->d_hess_el_kl,0,size*sizeof(realw)),5451);
   }
 
+  // debug
+  //printf("prepare_fields_elastic_adj_dev: rank %d - done\n",mp->myrank);
+  //synchronize_mpi();
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_fields_elastic_adj_dev");
 #endif
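
The hunks above consolidate the old per-array cudaMalloc()/cudaMemcpy() pairs (previously kept around as commented-out code) into single copy_todevice_realw()/copy_todevice_int() calls; the helper itself is not part of this diff. A minimal sketch of what copy_todevice_realw() presumably does, with the print_CUDA_error_if_any() bookkeeping left out (this is a sketch, not code from the commit):

    #include <cuda_runtime.h>

    typedef float realw;  /* same single-precision convention as in the stub file below */

    /* sketch only: allocate a device array and copy 'size' host values into it;
       the real helper presumably also routes the returned cudaError_t through
       print_CUDA_error_if_any() */
    static void copy_todevice_realw(void** d_array_addr_ptr, realw* h_array, int size) {
      cudaMalloc(d_array_addr_ptr, size * sizeof(realw));
      cudaMemcpy(*d_array_addr_ptr, h_array, size * sizeof(realw),
                 cudaMemcpyHostToDevice);
    }

Under that reading, a call such as copy_todevice_realw((void**)&mp->d_b_R_xx, b_R_xx, size) is equivalent to the two commented-out statements it replaces, with the hard-coded numeric error tags (5422, 5423, ...) factored out.
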
@@ -1256,17 +1039,15 @@
 
 extern "C"
 void FC_FUNC_(prepare_sim2_or_3_const_device,
-              PREPARE_SIM2_OR_3_CONST_DEVICE)(
-                                              long* Mesh_pointer_f,
+              PREPARE_SIM2_OR_3_CONST_DEVICE)(long* Mesh_pointer,
                                               int* islice_selected_rec,
                                               int* islice_selected_rec_size,
                                               int* nadj_rec_local,
-                                              int* nrec,
-                                              int* myrank) {
+                                              int* nrec) {
 
   TRACE("prepare_sim2_or_3_const_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   // adjoint source arrays
   mp->nadj_rec_local = *nadj_rec_local;
@@ -1286,12 +1067,14 @@
 
     int irec_local = 0;
     for(int irec = 0; irec < *nrec; irec++) {
-      if(*myrank == islice_selected_rec[irec]) {
+      if(mp->myrank == islice_selected_rec[irec]) {
         irec_local++;
         h_pre_computed_irec[irec_local-1] = irec;
       }
     }
+    // checks if all local receivers have been found
     if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n");
+
     // copies values onto GPU
     print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
                                        (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),6010);
@@ -1300,7 +1083,6 @@
     // temporary array to prepare extracted source array values
     mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
     if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
-
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1317,7 +1099,7 @@
 
 extern "C"
 void FC_FUNC_(prepare_fields_noise_device,
-              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
+              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer,
                                            int* NSPEC_AB, int* NGLOB_AB,
                                            int* free_surface_ispec,
                                            int* free_surface_ijk,
@@ -1325,34 +1107,21 @@
                                            int* NOISE_TOMOGRAPHY,
                                            int* NSTEP,
                                            realw* noise_sourcearray,
-                                           realw* normal_x_noise,
-                                           realw* normal_y_noise,
-                                           realw* normal_z_noise,
+                                           realw* normal_x_noise, realw* normal_y_noise, realw* normal_z_noise,
                                            realw* mask_noise,
                                            realw* free_surface_jacobian2Dw) {
 
   TRACE("prepare_fields_noise_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   // free surface
   mp->num_free_surface_faces = *num_free_surface_faces;
 
   copy_todevice_int((void**)&mp->d_free_surface_ispec,free_surface_ispec,mp->num_free_surface_faces);
-
-//  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec,
-//                                     mp->num_free_surface_faces*sizeof(int)),7001);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec,
-//                                     mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7002);
-
   copy_todevice_int((void**)&mp->d_free_surface_ijk,free_surface_ijk,
                     3*NGLL2*mp->num_free_surface_faces);
 
-//  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk,
-//                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int)),7003);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-//                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7004);
-
   // alloc storage for the surface buffer to be copied
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie,
                                      3*NGLL2*mp->num_free_surface_faces*sizeof(realw)),7005);
@@ -1361,11 +1130,6 @@
   if( *NOISE_TOMOGRAPHY == 1 ){
     copy_todevice_realw((void**)&mp->d_noise_sourcearray,noise_sourcearray,
                         3*NGLL3*(*NSTEP));
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
-//                                       3*NGLL3*(*NSTEP)*sizeof(realw)),7101);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
-//                                       3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),7102);
   }
 
   // prepares noise directions
@@ -1375,42 +1139,15 @@
     copy_todevice_realw((void**)&mp->d_normal_x_noise,normal_x_noise,nface_size);
     copy_todevice_realw((void**)&mp->d_normal_y_noise,normal_y_noise,nface_size);
     copy_todevice_realw((void**)&mp->d_normal_z_noise,normal_z_noise,nface_size);
-
     copy_todevice_realw((void**)&mp->d_mask_noise,mask_noise,nface_size);
     copy_todevice_realw((void**)&mp->d_free_surface_jacobian2Dw,free_surface_jacobian2Dw,nface_size);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
-//                                       nface_size*sizeof(realw)),7301);
-//    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
-//                                       nface_size*sizeof(realw)),7302);
-//    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
-//                                       nface_size*sizeof(realw)),7303);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise,
-//                                       nface_size*sizeof(realw)),7304);
-//    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
-//                                       nface_size*sizeof(realw)),7305);
-    // transfers data onto GPU
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise,
-//                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7306);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise,
-//                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7307);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise,
-//                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7308);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise,
-//                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7309);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw,
-//                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7310);
   }
 
   // prepares noise strength kernel
   if( *NOISE_TOMOGRAPHY == 3 ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
-                                       NGLL3*(mp->NSPEC_AB)*sizeof(realw)),7401);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),NGLL3*(mp->NSPEC_AB)*sizeof(realw)),7401);
     // initializes kernel values to zero
-    print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),7403);
-
+    print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,NGLL3*mp->NSPEC_AB*sizeof(realw)),7403);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1427,7 +1164,7 @@
 
 extern "C"
 void FC_FUNC_(prepare_fields_gravity_device,
-              PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
+              PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer,
                                              int* GRAVITY,
                                              realw* minus_deriv_gravity,
                                              realw* minus_g,
@@ -1437,7 +1174,7 @@
 
   TRACE("prepare_fields_gravity_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   setConst_wgll_cube(h_wgll_cube,mp);
 
@@ -1445,21 +1182,10 @@
   if( mp->gravity ){
 
     copy_todevice_realw((void**)&mp->d_minus_deriv_gravity,minus_deriv_gravity,mp->NGLOB_AB);
-
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_deriv_gravity),
-//                                       (mp->NGLOB_AB)*sizeof(realw)),8000);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_deriv_gravity, minus_deriv_gravity,
-//                                       (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8001);
-
     copy_todevice_realw((void**)&mp->d_minus_g,minus_g,mp->NGLOB_AB);
 
-//    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_g),
-//                                       (mp->NGLOB_AB)*sizeof(realw)),8002);
-//    print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_g, minus_g,
-//                                       (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8003);
-
-
     if( *ACOUSTIC_SIMULATION == 0 ){
+      // density
       // rhostore not allocated yet
       int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
       // padded array
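
For the kernel arrays earlier in this diff (d_hess_el_kl, d_Sigma_kl) the size is deliberately the unpadded NGLL3 * NSPEC_AB, whereas d_rhostore here is allocated with NGLL3_PADDED entries per element. Filling such a padded device array from an unpadded host array is typically done with a strided copy; a sketch of one way to do it, assuming NGLL3 = 125 and NGLL3_PADDED = 128 and using a hypothetical helper name (this is not code from the commit):

    #include <cuda_runtime.h>

    typedef float realw;

    #define NGLL3        125   /* 5*5*5 GLL points per spectral element (assumed) */
    #define NGLL3_PADDED 128   /* padded row length for coalesced access (assumed) */

    /* hypothetical helper: copy a host array with NGLL3 values per element into a
       device array padded to NGLL3_PADDED values per element */
    static void copy_todevice_realw_padded(realw** d_array, const realw* h_array, int nspec) {
      size_t d_pitch = NGLL3_PADDED * sizeof(realw);   /* device row stride in bytes */
      size_t h_pitch = NGLL3 * sizeof(realw);          /* host row stride in bytes   */
      cudaMalloc((void**)d_array, (size_t)nspec * d_pitch);
      /* width = bytes copied per element, height = number of elements */
      cudaMemcpy2D(*d_array, d_pitch, h_array, h_pitch,
                   h_pitch, (size_t)nspec, cudaMemcpyHostToDevice);
    }

The padding only changes the stride, which is why the comment next to d_hess_el_kl earlier in this diff warns that any memcpy and indexing must match the chosen (padded or unpadded) layout.
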
@@ -1472,10 +1198,16 @@
     }
   }
 
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_fields_gravity_device");
+#endif
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// not used yet...
+
+/*
 extern "C"
 void FC_FUNC_(prepare_seismogram_fields,
               PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {
@@ -1501,6 +1233,7 @@
   cudaMallocHost((void**)&mp->h_seismograms_v_it,3**nrec_local*sizeof(realw));
   cudaMallocHost((void**)&mp->h_seismograms_a_it,3**nrec_local*sizeof(realw));
 }
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 
@@ -1510,8 +1243,7 @@
 
 extern "C"
 void FC_FUNC_(prepare_cleanup_device,
-              PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
-                                      int* SAVE_FORWARD,
+              PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer,
                                       int* ACOUSTIC_SIMULATION,
                                       int* ELASTIC_SIMULATION,
                                       int* ABSORBING_CONDITIONS,
@@ -1525,7 +1257,7 @@
 TRACE("prepare_cleanup_device");
 
   // frees allocated memory arrays
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   // frees memory on GPU
   // mesh
@@ -1610,7 +1342,9 @@
     cudaFree(mp->d_displ);
     cudaFree(mp->d_veloc);
     cudaFree(mp->d_accel);
+
     cudaFree(mp->d_send_accel_buffer);
+    if( mp->simulation_type == 3) cudaFree(mp->d_b_send_accel_buffer);
 
     cudaFree(mp->d_rmassx);
     cudaFree(mp->d_rmassy);
@@ -1628,7 +1362,7 @@
       cudaFree(mp->d_rho_vp);
       cudaFree(mp->d_rho_vs);
 
-      if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && *SAVE_FORWARD ))
+      if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && mp->save_forward ))
           cudaFree(mp->d_b_absorb_field);
     }
 
@@ -1637,8 +1371,12 @@
       cudaFree(mp->d_b_veloc);
       cudaFree(mp->d_b_accel);
       cudaFree(mp->d_rho_kl);
-      cudaFree(mp->d_mu_kl);
-      cudaFree(mp->d_kappa_kl);
+      if( mp->anisotropic_kl ){
+        cudaFree(mp->d_cijkl_kl);
+      }else{
+        cudaFree(mp->d_mu_kl);
+        cudaFree(mp->d_kappa_kl);
+      }
       if( *APPROXIMATE_HESS_KL ) cudaFree(mp->d_hess_el_kl);
     }
 

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2013-08-20 14:13:26 UTC (rev 22718)
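
This stub file gives every GPU entry point an empty C fallback so that a build configured without CUDA still links against the Fortran callers; only initialize_cuda_device() does anything, aborting with the "GPU_MODE enabled without GPU/CUDA Support" message further down. FC_FUNC_ is presumably the autoconf-style Fortran name-mangling macro; a minimal sketch under the common "lowercase plus trailing underscore" convention (not the project's actual configure output):

    /* hedged sketch of the usual autoconf convention, not SPECFEM3D's generated header */
    #ifndef FC_FUNC_
    #define FC_FUNC_(name,NAME) name##_
    #endif

    /* under that assumption a stub such as
         void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {}
       compiles to an empty symbol pause_for_debug_ that satisfies the Fortran linker. */
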
@@ -1,4 +1,4 @@
-/*
+/* 
 !=====================================================================
 !
 !               S p e c f e m 3 D  V e r s i o n  2 . 1
@@ -33,76 +33,74 @@
 
 typedef float realw;
 
+ 
 
-
 //
 // src/cuda/check_fields_cuda.cu
 //
 
-void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {}
+void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {} 
 
 void FC_FUNC_(output_free_device_memory,
-              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank_f) {} 
 
 void FC_FUNC_(get_free_device_memory,
-              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {} 
 
+void FC_FUNC_(get_norm_acoustic_from_device,
+              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,long* Mesh_pointer,int* sim_type) {} 
+
+void FC_FUNC_(get_norm_elastic_from_device,
+              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
+                                            long* Mesh_pointer,
+                                            int* type) {} 
+
+void FC_FUNC_(get_max_accel,
+              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {} 
+
 void FC_FUNC_(check_max_norm_displ_gpu,
-              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_vector,
-              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
+              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_displ,
-              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
+              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_displ_gpu,
-              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_accel_gpu,
-              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_veloc_gpu,
-              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_displ,
-              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
+              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_accel,
-              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
+              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {} 
 
 void FC_FUNC_(check_error_vectors,
-              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
+              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {} 
 
-void FC_FUNC_(get_max_accel,
-              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
 
-void FC_FUNC_(get_norm_acoustic_from_device,
-              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
-                                             long* Mesh_pointer_f) {}
-
-void FC_FUNC_(get_norm_elastic_from_device,
-              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
-                                            long* Mesh_pointer_f) {}
-
-
 //
 // src/cuda/compute_add_sources_acoustic_cuda.cu
 //
 
 void FC_FUNC_(compute_add_sources_ac_cuda,
-              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
-                                                 int* phase_is_innerf,
-                                                 int* NSOURCESf,
-                                                 double* h_stf_pre_compute,
-                                                 int* myrankf) {}
+              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer,
+                                           int* phase_is_innerf,
+                                           int* NSOURCESf,
+                                           double* h_stf_pre_compute) {} 
 
 void FC_FUNC_(compute_add_sources_ac_s3_cuda,
-              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
-                                                      int* phase_is_innerf,
-                                                      int* NSOURCESf,
-                                                      double* h_stf_pre_compute,
-                                                      int* myrankf) {}
+              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer,
+                                              int* phase_is_innerf,
+                                              int* NSOURCESf,
+                                              double* h_stf_pre_compute) {} 
 
 void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
               ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
@@ -111,12 +109,11 @@
                                                int* h_ispec_is_inner,
                                                int* h_ispec_is_acoustic,
                                                int* h_ispec_selected_rec,
-                                               int* myrank,
                                                int* nrec,
                                                int* time_index,
                                                int* h_islice_selected_rec,
                                                int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {}
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {} 
 
 
 //
@@ -124,25 +121,22 @@
 //
 
 void FC_FUNC_(compute_add_sources_el_cuda,
-              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
-                                            int* phase_is_innerf,
-                                            int* NSOURCESf,
-                                            double* h_stf_pre_compute,
-                                            int* myrankf) {}
+              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer,
+                                           double* h_stf_pre_compute,
+                                           int* h_NSOURCES,
+                                           int* h_phase_is_inner) {} 
 
 void FC_FUNC_(compute_add_sources_el_s3_cuda,
               COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
                                               double* h_stf_pre_compute,
-                                              int* NSOURCESf,
-                                              int* phase_is_inner,
-                                              int* myrank) {}
+                                              int* h_NSOURCES,
+                                              int* h_phase_is_inner) {} 
 
 void FC_FUNC_(add_source_master_rec_noise_cu,
-              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
-                                                int* myrank_f,
-                                                int* it_f,
-                                                int* irec_master_noise_f,
-                                                int* islice_selected_rec) {}
+              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer,
+                                              int* it_f,
+                                              int* irec_master_noise_f,
+                                              int* islice_selected_rec) {} 
 
 void FC_FUNC_(add_sources_el_sim_type_2_or_3,
               ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
@@ -151,12 +145,11 @@
                                                int* h_ispec_is_inner,
                                                int* h_ispec_is_elastic,
                                                int* h_ispec_selected_rec,
-                                               int* myrank,
                                                int* nrec,
                                                int* time_index,
                                                int* h_islice_selected_rec,
                                                int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {}
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {} 
 
 
 //
@@ -164,17 +157,17 @@
 //
 
 void FC_FUNC_(compute_coupling_ac_el_cuda,
-              COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f,
+              COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer,
                                            int* phase_is_innerf,
-                                           int* num_coupling_ac_el_facesf) {}
+                                           int* num_coupling_ac_el_facesf) {} 
 
 void FC_FUNC_(compute_coupling_el_ac_cuda,
-              COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f,
+              COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer,
                                            int* phase_is_innerf,
-                                           int* num_coupling_ac_el_facesf) {}
+                                           int* num_coupling_ac_el_facesf) {} 
 
 void FC_FUNC_(compute_coupling_ocean_cuda,
-              COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f) {}
+              COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer) {} 
 
 
 //
@@ -182,37 +175,26 @@
 //
 
 void FC_FUNC_(transfer_boun_pot_from_device,
-              TRANSFER_BOUN_POT_FROM_DEVICE)(
-                                              int* size,
-                                              long* Mesh_pointer_f,
-                                              realw* potential_dot_dot_acoustic,
-                                              realw* send_potential_dot_dot_buffer,
-                                              int* num_interfaces_ext_mesh,
-                                              int* max_nibool_interfaces_ext_mesh,
-                                              int* nibool_interfaces_ext_mesh,
-                                              int* ibool_interfaces_ext_mesh,
-                                              int* FORWARD_OR_ADJOINT){}
+              TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer,
+                                             realw* potential_dot_dot_acoustic,
+                                             realw* send_potential_dot_dot_buffer,
+                                             int* FORWARD_OR_ADJOINT){} 
 
 void FC_FUNC_(transfer_asmbl_pot_to_device,
-              TRANSFER_ASMBL_POT_TO_DEVICE)(
-                                                long* Mesh_pointer,
-                                                realw* potential_dot_dot_acoustic,
-                                                realw* buffer_recv_scalar_ext_mesh,
-                                                int* num_interfaces_ext_mesh,
-                                                int* max_nibool_interfaces_ext_mesh,
-                                                int* nibool_interfaces_ext_mesh,
-                                                int* ibool_interfaces_ext_mesh,
-                                                int* FORWARD_OR_ADJOINT) {}
+              TRANSFER_ASMBL_POT_TO_DEVICE)(long* Mesh_pointer,
+                                            realw* potential_dot_dot_acoustic,
+                                            realw* buffer_recv_scalar_ext_mesh,
+                                            int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(compute_forces_acoustic_cuda,
-              COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+              COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer,
                                             int* iphase,
                                             int* nspec_outer_acoustic,
-                                            int* nspec_inner_acoustic) {}
+                                            int* nspec_inner_acoustic) {} 
 
 void FC_FUNC_(acoustic_enforce_free_surf_cuda,
-              ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
-                                               int* ABSORB_INSTEAD_OF_FREE_SURFACE) {}
+              ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer,
+                                               int* ABSORB_INSTEAD_OF_FREE_SURFACE) {} 
 
 
 //
@@ -220,33 +202,21 @@
 //
 
 void FC_FUNC_(transfer_boun_accel_from_device,
-              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
-                                                    realw* send_accel_buffer,
-                                                    int* num_interfaces_ext_mesh,
-                                                    int* max_nibool_interfaces_ext_mesh,
-                                                    int* nibool_interfaces_ext_mesh,
-                                                    int* ibool_interfaces_ext_mesh,
-                                                    int* FORWARD_OR_ADJOINT){}
+              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer,
+                                               realw* accel,
+                                               realw* send_accel_buffer,
+                                               int* FORWARD_OR_ADJOINT){} 
 
 void FC_FUNC_(transfer_boundary_from_device_a,
               TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer,
-                                               int* nspec_outer_elastic) {}
+                                               int* nspec_outer_elastic) {} 
 
 void FC_FUNC_(transfer_boundary_to_device_a,
               TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
                                              realw* buffer_recv_vector_ext_mesh,
                                              int* num_interfaces_ext_mesh,
-                                             int* max_nibool_interfaces_ext_mesh) {}
+                                             int* max_nibool_interfaces_ext_mesh) {} 
 
-//void FC_FUNC_(assemble_accel_on_device,
-//              ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel,
-//                                              realw* buffer_recv_vector_ext_mesh,
-//                                              int* num_interfaces_ext_mesh,
-//                                              int* max_nibool_interfaces_ext_mesh,
-//                                              int* nibool_interfaces_ext_mesh,
-//                                              int* ibool_interfaces_ext_mesh,
-//                                              int* FORWARD_OR_ADJOINT) {}
-
 void FC_FUNC_(transfer_asmbl_accel_to_device,
               TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel,
                                                     realw* buffer_recv_vector_ext_mesh,
@@ -254,21 +224,31 @@
                                                     int* max_nibool_interfaces_ext_mesh,
                                                     int* nibool_interfaces_ext_mesh,
                                                     int* ibool_interfaces_ext_mesh,
-                                                    int* FORWARD_OR_ADJOINT) {}
+                                                    int* FORWARD_OR_ADJOINT) {} 
 
+//void FC_FUNC_(assemble_accel_on_device,
+//              ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel,
+//                                              realw* buffer_recv_vector_ext_mesh,
+//                                              int* num_interfaces_ext_mesh,
+//                                              int* max_nibool_interfaces_ext_mesh,
+//                                              int* nibool_interfaces_ext_mesh,
+//                                              int* ibool_interfaces_ext_mesh,
+//                                              int* FORWARD_OR_ADJOINT) {} 
+
 void FC_FUNC_(compute_forces_viscoelastic_cuda,
-              COMPUTE_FORCES_VISCOELASTIC_CUDA)(long* Mesh_pointer_f,
-                                           int* iphase,
-                                           int* nspec_outer_elastic,
-                                           int* nspec_inner_elastic,
-                                           int* COMPUTE_AND_STORE_STRAIN,
-                                           int* ATTENUATION,
-                                           int* ANISOTROPY) {}
+              COMPUTE_FORCES_VISCOELASTIC_CUDA)(long* Mesh_pointer,
+                                                int* iphase,
+                                                realw* deltat,
+                                                int* nspec_outer_elastic,
+                                                int* nspec_inner_elastic,
+                                                int* COMPUTE_AND_STORE_STRAIN,
+                                                int* ATTENUATION,
+                                                int* ANISOTROPY) {} 
 
 void FC_FUNC_(sync_copy_from_device,
-              SYNC_copy_FROM_DEVICE)(long* Mesh_pointer_f,
+              SYNC_copy_FROM_DEVICE)(long* Mesh_pointer,
                                      int* iphase,
-                                     realw* send_buffer) {}
+                                     realw* send_buffer) {} 
 
 
 //
@@ -277,23 +257,22 @@
 
 void FC_FUNC_(compute_kernels_elastic_cuda,
               COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
-                                            realw* deltat_f) {}
+                                            realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_strgth_noise_cu,
               COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
                                                     realw* h_noise_surface_movie,
-                                                    realw* deltat) {}
+                                                    realw* deltat) {} 
 
 void FC_FUNC_(compute_kernels_acoustic_cuda,
-              COMPUTE_KERNELS_ACOUSTIC_CUDA)(
-                                             long* Mesh_pointer,
-                                             realw* deltat_f) {}
+              COMPUTE_KERNELS_ACOUSTIC_CUDA)(long* Mesh_pointer,
+                                             realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_hess_cuda,
               COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
                                          realw* deltat_f,
                                          int* ELASTIC_SIMULATION,
-                                         int* ACOUSTIC_SIMULATION) {}
+                                         int* ACOUSTIC_SIMULATION) {} 
 
 
 //
@@ -301,10 +280,9 @@
 //
 
 void FC_FUNC_(compute_stacey_acoustic_cuda,
-              COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+              COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer,
                                             int* phase_is_innerf,
-                                            int* SAVE_FORWARDf,
-                                            realw* h_b_absorb_potential) {}
+                                            realw* h_b_absorb_potential) {} 
 
 
 //
@@ -312,10 +290,9 @@
 //
 
 void FC_FUNC_(compute_stacey_viscoelastic_cuda,
-              COMPUTE_STACEY_VISCOELASTIC_CUDA)(long* Mesh_pointer_f,
+              COMPUTE_STACEY_VISCOELASTIC_CUDA)(long* Mesh_pointer,
                                            int* phase_is_innerf,
-                                           int* SAVE_FORWARDf,
-                                           realw* h_b_absorb_field) {}
+                                           realw* b_absorb_field) {} 
 
 
 //
@@ -323,10 +300,10 @@
 //
 
 void FC_FUNC_(initialize_cuda_device,
-              INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+              INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) { 
  fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
  exit(1);
-}
+} 
 
 
 //
@@ -334,71 +311,65 @@
 //
 
 void FC_FUNC_(it_update_displacement_cuda,
-              IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
-                                          int* size_F,
+              IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer,
                                           realw* deltat_F,
                                           realw* deltatsqover2_F,
                                           realw* deltatover2_F,
                                           realw* b_deltat_F,
                                           realw* b_deltatsqover2_F,
-                                          realw* b_deltatover2_F) {}
+                                          realw* b_deltatover2_F) {} 
 
 void FC_FUNC_(it_update_displacement_ac_cuda,
-              it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
-                                               int* size_F,
+              it_update_displacement_ac_cuda)(long* Mesh_pointer,
                                                realw* deltat_F,
                                                realw* deltatsqover2_F,
                                                realw* deltatover2_F,
                                                realw* b_deltat_F,
                                                realw* b_deltatsqover2_F,
-                                               realw* b_deltatover2_F) {}
+                                               realw* b_deltatover2_F) {} 
 
 void FC_FUNC_(kernel_3_a_cuda,
               KERNEL_3_A_CUDA)(long* Mesh_pointer,
-                               int* size_F,
                                realw* deltatover2_F,
                                realw* b_deltatover2_F,
-                               int* APPROXIMATE_OCEAN_LOAD) {}
+                               int* APPROXIMATE_OCEAN_LOAD) {} 
 
 void FC_FUNC_(kernel_3_b_cuda,
               KERNEL_3_B_CUDA)(long* Mesh_pointer,
-                             int* size_F,
-                             realw* deltatover2_F,
-                             realw* b_deltatover2_F) {}
+                               realw* deltatover2_F,
+                               realw* b_deltatover2_F) {} 
 
-void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
-                             long* Mesh_pointer,
-                             int* size_F) {}
+void FC_FUNC_(kernel_3_a_acoustic_cuda,
+              KERNEL_3_ACOUSTIC_CUDA)(long* Mesh_pointer ) {} 
 
-void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
-                                                             long* Mesh_pointer,
-                                                             int* size_F,
-                                                             realw* deltatover2_F,
-                                                             realw* b_deltatover2_F) {}
+void FC_FUNC_(kernel_3_b_acoustic_cuda,
+              KERNEL_3_ACOUSTIC_CUDA)(long* Mesh_pointer,
+                                      realw* deltatover2_F,
+                                      realw* b_deltatover2_F) {} 
 
 
 //
 // src/cuda/noise_tomography_cuda.cu
 //
 
-void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){} 
 
-void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {} 
 
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {} 
 
-void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {} 
 
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer,realw* h_displ) {} 
 
 void FC_FUNC_(transfer_surface_to_host,
-              TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
-                                        realw* h_noise_surface_movie) {}
+              TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer,
+                                        realw* h_noise_surface_movie) {} 
 
 void FC_FUNC_(noise_read_add_surface_movie_cu,
-              NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
+              NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer,
                                                realw* h_noise_surface_movie,
-                                               int* NOISE_TOMOGRAPHYf) {}
+                                               int* NOISE_TOMOGRAPHYf) {} 
 
 
 //
@@ -407,19 +378,15 @@
 
 void FC_FUNC_(prepare_constants_device,
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
-                                        int* h_NGLLX,
-                                        int* NSPEC_AB, int* NGLOB_AB,
+                                        int* h_NGLLX, int* NSPEC_AB, int* NGLOB_AB,
                                         realw* h_xix, realw* h_xiy, realw* h_xiz,
                                         realw* h_etax, realw* h_etay, realw* h_etaz,
                                         realw* h_gammax, realw* h_gammay, realw* h_gammaz,
                                         realw* h_kappav, realw* h_muv,
                                         int* h_ibool,
-                                        int* num_interfaces_ext_mesh,
-                                        int* max_nibool_interfaces_ext_mesh,
-                                        int* h_nibool_interfaces_ext_mesh,
-                                        int* h_ibool_interfaces_ext_mesh,
-                                        realw* h_hprime_xx,
-                                        realw* h_hprimewgll_xx,
+                                        int* num_interfaces_ext_mesh, int* max_nibool_interfaces_ext_mesh,
+                                        int* h_nibool_interfaces_ext_mesh, int* h_ibool_interfaces_ext_mesh,
+                                        realw* h_hprime_xx, realw* h_hprimewgll_xx,
                                         realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
                                         int* ABSORBING_CONDITIONS,
                                         int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
@@ -427,37 +394,27 @@
                                         realw* h_abs_boundary_jacobian2Dw,
                                         int* h_num_abs_boundary_faces,
                                         int* h_ispec_is_inner,
-                                        int* NSOURCES,
-                                        int* nsources_local_f,
+                                        int* NSOURCES, int* nsources_local_f,
                                         realw* h_sourcearrays,
-                                        int* h_islice_selected_source,
-                                        int* h_ispec_selected_source,
-                                        int* h_number_receiver_global,
-                                        int* h_ispec_selected_rec,
+                                        int* h_islice_selected_source, int* h_ispec_selected_source,
+                                        int* h_number_receiver_global, int* h_ispec_selected_rec,
                                         int* nrec,int* nrec_local,
                                         int* SIMULATION_TYPE,
                                         int* USE_MESH_COLORING_GPU_f,
                                         int* nspec_acoustic,int* nspec_elastic,
-                                        int* my_neighbours_ext_mesh,
-                                        int* request_send_vector_ext_mesh,
-                                        int* request_recv_vector_ext_mesh,
-                                        realw* buffer_recv_vector_ext_mesh
-                                        ) {}
+                                        int* h_myrank,
+                                        int* SAVE_FORWARD ) {} 
 
 void FC_FUNC_(prepare_fields_acoustic_device,
-              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
-                                              realw* rmass_acoustic,
-                                              realw* rhostore,
-                                              realw* kappastore,
-                                              int* num_phase_ispec_acoustic,
-                                              int* phase_ispec_inner_acoustic,
+              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer,
+                                              realw* rmass_acoustic, realw* rhostore, realw* kappastore,
+                                              int* num_phase_ispec_acoustic, int* phase_ispec_inner_acoustic,
                                               int* ispec_is_acoustic,
                                               int* NOISE_TOMOGRAPHY,
                                               int* num_free_surface_faces,
                                               int* free_surface_ispec,
                                               int* free_surface_ijk,
-                                              int* b_reclen_potential,
-                                              realw* b_absorb_potential,
+                                              int* b_reclen_potential, realw* b_absorb_potential,
                                               int* ELASTIC_SIMULATION,
                                               int* num_coupling_ac_el_faces,
                                               int* coupling_ac_el_ispec,
@@ -466,26 +423,20 @@
                                               realw* coupling_ac_el_jacobian2Dw,
                                               int* num_colors_outer_acoustic,
                                               int* num_colors_inner_acoustic,
-                                              int* num_elem_colors_acoustic) {}
+                                              int* num_elem_colors_acoustic) {} 
 
 void FC_FUNC_(prepare_fields_acoustic_adj_dev,
-              PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
-                                              int* APPROXIMATE_HESS_KL) {}
+              PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer,
+                                              int* APPROXIMATE_HESS_KL) {} 
 
 void FC_FUNC_(prepare_fields_elastic_device,
-              PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
-                                             int* size,
-                                             realw* rmassx,
-                                             realw* rmassy,
-                                             realw* rmassz,
-                                             realw* rho_vp,
-                                             realw* rho_vs,
+              PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer,
+                                             realw* rmassx, realw* rmassy, realw* rmassz,
+                                             realw* rho_vp, realw* rho_vs,
                                              int* num_phase_ispec_elastic,
                                              int* phase_ispec_inner_elastic,
                                              int* ispec_is_elastic,
-                                             realw* h_b_absorb_field,
-                                             int* h_b_reclen_field,
-                                             int* SAVE_FORWARD,
+                                             realw* b_absorb_field, int* b_reclen_field,
                                              int* COMPUTE_AND_STORE_STRAIN,
                                              realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
                                              realw* epsilondev_xz,realw* epsilondev_yz,
@@ -506,31 +457,17 @@
                                              int* num_colors_inner_elastic,
                                              int* num_elem_colors_elastic,
                                              int* ANISOTROPY,
-                                             realw *c11store,
-                                             realw *c12store,
-                                             realw *c13store,
-                                             realw *c14store,
-                                             realw *c15store,
-                                             realw *c16store,
-                                             realw *c22store,
-                                             realw *c23store,
-                                             realw *c24store,
-                                             realw *c25store,
-                                             realw *c26store,
-                                             realw *c33store,
-                                             realw *c34store,
-                                             realw *c35store,
-                                             realw *c36store,
-                                             realw *c44store,
-                                             realw *c45store,
-                                             realw *c46store,
-                                             realw *c55store,
-                                             realw *c56store,
-                                             realw *c66store){}
+                                             realw *c11store,realw *c12store,realw *c13store,
+                                             realw *c14store,realw *c15store,realw *c16store,
+                                             realw *c22store,realw *c23store,realw *c24store,
+                                             realw *c25store,realw *c26store,realw *c33store,
+                                             realw *c34store,realw *c35store,realw *c36store,
+                                             realw *c44store,realw *c45store,realw *c46store,
+                                             realw *c55store,realw *c56store,realw *c66store ){} 
 
 void FC_FUNC_(prepare_fields_elastic_adj_dev,
-              PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
-                                             int* size,
+              PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer,
+                                             int* size_f,
                                              int* COMPUTE_AND_STORE_STRAIN,
                                              realw* epsilon_trace_over_3,
                                              realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
@@ -540,19 +477,18 @@
                                              int* R_size,
                                              realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
                                              realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
-                                             int* APPROXIMATE_HESS_KL){}
+                                             int* ANISOTROPIC_KL,
+                                             int* APPROXIMATE_HESS_KL){} 
 
 void FC_FUNC_(prepare_sim2_or_3_const_device,
-              PREPARE_SIM2_OR_3_CONST_DEVICE)(
-                                              long* Mesh_pointer_f,
+              PREPARE_SIM2_OR_3_CONST_DEVICE)(long* Mesh_pointer,
                                               int* islice_selected_rec,
                                               int* islice_selected_rec_size,
                                               int* nadj_rec_local,
-                                              int* nrec,
-                                              int* myrank) {}
+                                              int* nrec) {} 
 
 void FC_FUNC_(prepare_fields_noise_device,
-              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
+              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer,
                                            int* NSPEC_AB, int* NGLOB_AB,
                                            int* free_surface_ispec,
                                            int* free_surface_ijk,
@@ -560,27 +496,24 @@
                                            int* NOISE_TOMOGRAPHY,
                                            int* NSTEP,
                                            realw* noise_sourcearray,
-                                           realw* normal_x_noise,
-                                           realw* normal_y_noise,
-                                           realw* normal_z_noise,
+                                           realw* normal_x_noise, realw* normal_y_noise, realw* normal_z_noise,
                                            realw* mask_noise,
-                                           realw* free_surface_jacobian2Dw) {}
+                                           realw* free_surface_jacobian2Dw) {} 
 
 void FC_FUNC_(prepare_fields_gravity_device,
-              PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
+              PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer,
                                              int* GRAVITY,
                                              realw* minus_deriv_gravity,
                                              realw* minus_g,
                                              realw* h_wgll_cube,
                                              int* ACOUSTIC_SIMULATION,
-                                             realw* rhostore) {}
+                                             realw* rhostore) {} 
 
 void FC_FUNC_(prepare_seismogram_fields,
-              PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {}
+              PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {} 
 
 void FC_FUNC_(prepare_cleanup_device,
-              PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
-                                      int* SAVE_FORWARD,
+              PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer,
                                       int* ACOUSTIC_SIMULATION,
                                       int* ELASTIC_SIMULATION,
                                       int* ABSORBING_CONDITIONS,
@@ -589,7 +522,7 @@
                                       int* ATTENUATION,
                                       int* ANISOTROPY,
                                       int* APPROXIMATE_OCEAN_LOAD,
-                                      int* APPROXIMATE_HESS_KL) {}
+                                      int* APPROXIMATE_HESS_KL) {} 
 
 
 //
@@ -597,153 +530,133 @@
 //
 
 void FC_FUNC_(transfer_fields_el_to_device,
-              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_fields_el_from_device,
-              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_b_fields_to_device,
               TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                           long* Mesh_pointer_f) {}
+                                           long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_b_fields_from_device,
-              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {}
+              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_accel_to_device,
-              TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_accel_from_device,
-              TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_b_accel_from_device,
-              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
+              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_sigma_from_device,
-              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {}
+              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_b_displ_from_device,
-              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {}
+              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_displ_from_device,
-              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {}
+              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer) {} 
 
-void FC_FUNC_(transfer_compute_kernel_answers_from_device,
-              TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
-                                                           realw* rho_kl,int* size_rho,
-                                                           realw* mu_kl, int* size_mu,
-                                                           realw* kappa_kl, int* size_kappa) {}
-
-void FC_FUNC_(transfer_compute_kernel_fields_from_device,
-              TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
-                                                          realw* accel, int* size_accel,
-                                                          realw* b_displ, int* size_b_displ,
-                                                          realw* epsilondev_xx,
-                                                          realw* epsilondev_yy,
-                                                          realw* epsilondev_xy,
-                                                          realw* epsilondev_xz,
-                                                          realw* epsilondev_yz,
-                                                          int* size_epsilondev,
-                                                          realw* b_epsilondev_xx,
-                                                          realw* b_epsilondev_yy,
-                                                          realw* b_epsilondev_xy,
-                                                          realw* b_epsilondev_xz,
-                                                          realw* b_epsilondev_yz,
-                                                          int* size_b_epsilondev,
-                                                          realw* rho_kl,int* size_rho,
-                                                          realw* mu_kl, int* size_mu,
-                                                          realw* kappa_kl, int* size_kappa,
-                                                          realw* epsilon_trace_over_3,
-                                                          realw* b_epsilon_trace_over_3,
-                                                          int* size_epsilon_trace_over_3) {}
-
 void FC_FUNC_(transfer_b_fields_att_to_device,
               TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
-                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
-                                             int* size_R,
-                                             realw* b_epsilondev_xx,
-                                             realw* b_epsilondev_yy,
-                                             realw* b_epsilondev_xy,
-                                             realw* b_epsilondev_xz,
-                                             realw* b_epsilondev_yz,
-                                             int* size_epsilondev) {}
+                                               realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,
+                                               realw* b_R_xz,realw* b_R_yz,
+                                               int* size_R,
+                                               realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                               realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                               int* size_epsilondev) {} 
 
 void FC_FUNC_(transfer_fields_att_from_device,
               TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
                                                realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
                                                int* size_R,
-                                               realw* epsilondev_xx,
-                                               realw* epsilondev_yy,
-                                               realw* epsilondev_xy,
-                                               realw* epsilondev_xz,
-                                               realw* epsilondev_yz,
-                                               int* size_epsilondev) {}
+                                               realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                               realw* epsilondev_xz,realw* epsilondev_yz,
+                                               int* size_epsilondev) {} 
 
 void FC_FUNC_(transfer_kernels_el_to_host,
               TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
-                                                    realw* h_rho_kl,
-                                                    realw* h_mu_kl,
-                                                    realw* h_kappa_kl,
-                                                    int* NSPEC_AB) {}
+                                            realw* h_rho_kl,
+                                            realw* h_mu_kl,
+                                            realw* h_kappa_kl,
+                                            realw* h_cijkl_kl,
+                                            int* NSPEC_AB) {} 
 
 void FC_FUNC_(transfer_kernels_noise_to_host,
               TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
-                                                          realw* h_Sigma_kl,
-                                                          int* NSPEC_AB) {}
+                                              realw* h_Sigma_kl,
+                                              int* NSPEC_AB) {} 
 
 void FC_FUNC_(transfer_fields_ac_to_device,
-              TRANSFER_FIELDS_AC_TO_DEVICE)(
-                                                  int* size,
-                                                  realw* potential_acoustic,
-                                                  realw* potential_dot_acoustic,
-                                                  realw* potential_dot_dot_acoustic,
-                                                  long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_AC_TO_DEVICE)(int* size,
+                                            realw* potential_acoustic,
+                                            realw* potential_dot_acoustic,
+                                            realw* potential_dot_dot_acoustic,
+                                            long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_b_fields_ac_to_device,
-              TRANSFER_B_FIELDS_AC_TO_DEVICE)(
-                                                    int* size,
-                                                    realw* b_potential_acoustic,
-                                                    realw* b_potential_dot_acoustic,
-                                                    realw* b_potential_dot_dot_acoustic,
-                                                    long* Mesh_pointer_f) {}
+              TRANSFER_B_FIELDS_AC_TO_DEVICE)(int* size,
+                                              realw* b_potential_acoustic,
+                                              realw* b_potential_dot_acoustic,
+                                              realw* b_potential_dot_dot_acoustic,
+                                              long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_fields_ac_from_device,
               TRANSFER_FIELDS_AC_FROM_DEVICE)(int* size,
                                               realw* potential_acoustic,
                                               realw* potential_dot_acoustic,
                                               realw* potential_dot_dot_acoustic,
-                                              long* Mesh_pointer_f) {}
+                                              long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_b_fields_ac_from_device,
-              TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
-                                                      int* size,
-                                                      realw* b_potential_acoustic,
-                                                      realw* b_potential_dot_acoustic,
-                                                      realw* b_potential_dot_dot_acoustic,
-                                                      long* Mesh_pointer_f) {}
+              TRANSFER_B_FIELDS_AC_FROM_DEVICE)(int* size,
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_acoustic,
+                                                realw* b_potential_dot_dot_acoustic,
+                                                long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_dot_dot_from_device,
-              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {}
+              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_b_dot_dot_from_device,
-              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {}
+              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer) {} 
 
 void FC_FUNC_(transfer_kernels_ac_to_host,
-              TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
-                                                             realw* h_rho_ac_kl,
-                                                             realw* h_kappa_ac_kl,
-                                                             int* NSPEC_AB) {}
+              TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,realw* h_rho_ac_kl,realw* h_kappa_ac_kl,int* NSPEC_AB) {} 
 
 void FC_FUNC_(transfer_kernels_hess_el_tohost,
-              TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
-                                              realw* h_hess_kl,
-                                              int* NSPEC_AB) {}
+              TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,realw* h_hess_kl,int* NSPEC_AB) {} 
 
 void FC_FUNC_(transfer_kernels_hess_ac_tohost,
-              TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
-                                             realw* h_hess_ac_kl,
-                                             int* NSPEC_AB) {}
+              TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,realw* h_hess_ac_kl,int* NSPEC_AB) {} 
 
+void FC_FUNC_(transfer_compute_kernel_answers_from_device,
+              TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
+                                                           realw* rho_kl,int* size_rho,
+                                                           realw* mu_kl, int* size_mu,
+                                                           realw* kappa_kl, int* size_kappa) {} 
 
+void FC_FUNC_(transfer_compute_kernel_fields_from_device,
+              TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
+                                                          realw* accel, int* size_accel,
+                                                          realw* b_displ, int* size_b_displ,
+                                                          realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                                          realw* epsilondev_xz,realw* epsilondev_yz,
+                                                          int* size_epsilondev,
+                                                          realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                                          realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                                          int* size_b_epsilondev,
+                                                          realw* rho_kl,int* size_rho,
+                                                          realw* mu_kl, int* size_mu,
+                                                          realw* kappa_kl, int* size_kappa,
+                                                          realw* epsilon_trace_over_3,
+                                                          realw* b_epsilon_trace_over_3,
+                                                          int* size_epsilon_trace_over_3) {} 
+
+
 //
 // src/cuda/write_seismograms_cuda.cu
 //
@@ -754,18 +667,17 @@
                                               realw* seismograms_d,
                                               realw* seismograms_v,
                                               realw* seismograms_a,
-                                              int* it) {}
+                                              int* it) {} 
 
 void FC_FUNC_(transfer_station_el_from_device,
               TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
                                                    realw* b_displ, realw* b_veloc, realw* b_accel,
                                                    long* Mesh_pointer_f,int* number_receiver_global,
                                                    int* ispec_selected_rec,int* ispec_selected_source,
-                                                   int* ibool) {}
+                                                   int* ibool) {} 
 
 void FC_FUNC_(transfer_station_ac_from_device,
-              TRANSFER_STATION_AC_FROM_DEVICE)(
-                                                realw* potential_acoustic,
+              TRANSFER_STATION_AC_FROM_DEVICE)(realw* potential_acoustic,
                                                 realw* potential_dot_acoustic,
                                                 realw* potential_dot_dot_acoustic,
                                                 realw* b_potential_acoustic,
@@ -775,5 +687,5 @@
                                                 int* number_receiver_global,
                                                 int* ispec_selected_rec,
                                                 int* ispec_selected_source,
-                                                int* ibool) {}
+                                                int* ibool) {} 
 

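The empty-bodied FC_FUNC_ routines above are the CPU-only fallbacks kept in src/cuda/specfem3D_gpu_cuda_method_stubs.c: they expose the same Fortran-mangled symbols as the real CUDA routines, so a build configured without GPU support still links against unchanged Fortran call sites. A minimal sketch of the pairing is shown below; transfer_example_to_device is a made-up name used only for illustration, and the real stub/implementation pairs are the ones listed in this diff.

/* stub variant (plain C, compiled for CPU-only builds): same mangled symbol, no body */
void FC_FUNC_(transfer_example_to_device,
              TRANSFER_EXAMPLE_TO_DEVICE)(int* size, realw* displ, long* Mesh_pointer) {}

/* CUDA variant (compiled instead of the stub when GPU support is enabled):
   unwraps the Mesh struct stored in a Fortran integer and copies the field over */
extern "C"
void FC_FUNC_(transfer_example_to_device,
              TRANSFER_EXAMPLE_TO_DEVICE)(int* size, realw* displ, long* Mesh_pointer) {
  Mesh* mp = (Mesh*)(*Mesh_pointer);  // get mesh pointer out of fortran integer container
  print_CUDA_error_if_any(cudaMemcpy(mp->d_displ, displ,
                                     sizeof(realw)*(*size), cudaMemcpyHostToDevice), 99001);
}

Only one of the two translation units goes into a given executable, which is why the stub signatures have to track the .cu signatures exactly; the Mesh_pointer_f -> Mesh_pointer renames in this revision are applied consistently to both for that reason.
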
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/transfer_fields_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/transfer_fields_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -53,11 +53,11 @@
 
 extern "C"
 void FC_FUNC_(transfer_fields_el_to_device,
-              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer) {
 
-TRACE("transfer_fields_el_to_device_");
+  TRACE("transfer_fields_el_to_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40004);
@@ -69,11 +69,11 @@
 
 extern "C"
 void FC_FUNC_(transfer_fields_el_from_device,
-              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer) {
 
-  TRACE("transfer_fields_el_from_device_");
+  TRACE("transfer_fields_el_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
   print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
@@ -86,30 +86,31 @@
 extern "C"
 void FC_FUNC_(transfer_b_fields_to_device,
               TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                           long* Mesh_pointer_f) {
+                                           long* Mesh_pointer) {
 
-  TRACE("transfer_b_fields_to_device_");
+  TRACE("transfer_b_fields_to_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  cudaMemcpy(mp->d_b_displ,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_veloc,b_veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_accel,b_accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_displ,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),41006);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_veloc,b_veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice),41007);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_accel,b_accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),41008);
+
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
 void FC_FUNC_(transfer_b_fields_from_device,
-              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {
+              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer) {
 
-TRACE("transfer_b_fields_from_device_");
+  TRACE("transfer_b_fields_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  cudaMemcpy(b_displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_veloc,mp->d_b_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+  print_CUDA_error_if_any(cudaMemcpy(b_displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),42006);
+  print_CUDA_error_if_any(cudaMemcpy(b_veloc,mp->d_b_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),42007);
+  print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),42008);
 
 }
 
@@ -118,11 +119,11 @@
 
 extern "C"
 void FC_FUNC_(transfer_accel_to_device,
-              TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
+              TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer) {
 
-TRACE("transfer_accel_to_device");
+  TRACE("transfer_accel_to_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40016);
 
@@ -132,11 +133,11 @@
 
 extern "C"
 void FC_FUNC_(transfer_accel_from_device,
-              TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
+              TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer) {
 
-TRACE("transfer_accel_from_device");
+  TRACE("transfer_accel_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40026);
 
@@ -146,11 +147,11 @@
 
 extern "C"
 void FC_FUNC_(transfer_b_accel_from_device,
-              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {
+              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer) {
 
-TRACE("transfer_b_accel_from_device");
+  TRACE("transfer_b_accel_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40036);
 
@@ -160,11 +161,11 @@
 
 extern "C"
 void FC_FUNC_(transfer_sigma_from_device,
-              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {
+              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer) {
 
-TRACE("transfer_sigma_from_device");
+  TRACE("transfer_sigma_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40046);
 
@@ -174,11 +175,11 @@
 
 extern "C"
 void FC_FUNC_(transfer_b_displ_from_device,
-              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
+              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer) {
 
-TRACE("transfer_b_displ_from_device");
+  TRACE("transfer_b_displ_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40056);
 
@@ -188,120 +189,47 @@
 
 extern "C"
 void FC_FUNC_(transfer_displ_from_device,
-              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
+              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer) {
 
-TRACE("transfer_displ_from_device");
+  TRACE("transfer_displ_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40066);
 
 }
 
 /* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(transfer_compute_kernel_answers_from_device,
-              TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
-                                                           realw* rho_kl,int* size_rho,
-                                                           realw* mu_kl, int* size_mu,
-                                                           realw* kappa_kl, int* size_kappa) {
-TRACE("transfer_compute_kernel_answers_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
-
-}
-*/
-
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(transfer_compute_kernel_fields_from_device,
-              TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
-                                                          realw* accel, int* size_accel,
-                                                          realw* b_displ, int* size_b_displ,
-                                                          realw* epsilondev_xx,
-                                                          realw* epsilondev_yy,
-                                                          realw* epsilondev_xy,
-                                                          realw* epsilondev_xz,
-                                                          realw* epsilondev_yz,
-                                                          int* size_epsilondev,
-                                                          realw* b_epsilondev_xx,
-                                                          realw* b_epsilondev_yy,
-                                                          realw* b_epsilondev_xy,
-                                                          realw* b_epsilondev_xz,
-                                                          realw* b_epsilondev_yz,
-                                                          int* size_b_epsilondev,
-                                                          realw* rho_kl,int* size_rho,
-                                                          realw* mu_kl, int* size_mu,
-                                                          realw* kappa_kl, int* size_kappa,
-                                                          realw* epsilon_trace_over_3,
-                                                          realw* b_epsilon_trace_over_3,
-                                                          int* size_epsilon_trace_over_3) {
-TRACE("transfer_compute_kernel_fields_from_device");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-  cudaMemcpy(accel,mp->d_accel,*size_accel*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_displ,mp->d_b_displ,*size_b_displ*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilondev_xx,mp->d_b_epsilondev_xx,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilondev_yy,mp->d_b_epsilondev_yy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilondev_xy,mp->d_b_epsilondev_xy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilondev_xz,mp->d_b_epsilondev_xz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilondev_yz,mp->d_b_epsilondev_yz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilon_trace_over_3,mp->d_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
-       cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_epsilon_trace_over_3,mp->d_b_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
-       cudaMemcpyDeviceToHost);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("after transfer_compute_kernel_fields_from_device");
-#endif
-}
-*/
-
-/* ----------------------------------------------------------------------------------------------- */
-
 // attenuation fields
 
 extern "C"
 void FC_FUNC_(transfer_b_fields_att_to_device,
               TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
-                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
-                                             int* size_R,
-                                             realw* b_epsilondev_xx,
-                                             realw* b_epsilondev_yy,
-                                             realw* b_epsilondev_xy,
-                                             realw* b_epsilondev_xz,
-                                             realw* b_epsilondev_yz,
-                                             int* size_epsilondev) {
+                                               realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,
+                                               realw* b_R_xz,realw* b_R_yz,
+                                               int* size_R,
+                                               realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                               realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                               int* size_epsilondev) {
+
   TRACE("transfer_b_fields_att_to_device");
+
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
 
-  cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(realw),cudaMemcpyHostToDevice),43011);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(realw),cudaMemcpyHostToDevice),43012);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(realw),cudaMemcpyHostToDevice),43013);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(realw),cudaMemcpyHostToDevice),43014);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(realw),cudaMemcpyHostToDevice),43015);
 
-  cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice),43016);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice),43017);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice),43018);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice),43019);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice),43020);
 
-
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after transfer_b_fields_att_to_device");
 #endif
@@ -316,27 +244,25 @@
               TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
                                                realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
                                                int* size_R,
-                                               realw* epsilondev_xx,
-                                               realw* epsilondev_yy,
-                                               realw* epsilondev_xy,
-                                               realw* epsilondev_xz,
-                                               realw* epsilondev_yz,
+                                               realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                               realw* epsilondev_xz,realw* epsilondev_yz,
                                                int* size_epsilondev) {
   TRACE("transfer_fields_att_from_device");
+
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
 
-  cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  print_CUDA_error_if_any(cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(realw),cudaMemcpyDeviceToHost),43021);
+  print_CUDA_error_if_any(cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost),43022);
+  print_CUDA_error_if_any(cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost),43023);
+  print_CUDA_error_if_any(cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost),43024);
+  print_CUDA_error_if_any(cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost),43025);
 
-  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost),43026);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost),43027);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost),43028);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost),43029);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost),43030);
 
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -351,21 +277,27 @@
 extern "C"
 void FC_FUNC_(transfer_kernels_el_to_host,
               TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
-                                                    realw* h_rho_kl,
-                                                    realw* h_mu_kl,
-                                                    realw* h_kappa_kl,
-                                                    int* NSPEC_AB) {
-TRACE("transfer_kernels_el_to_host");
+                                            realw* h_rho_kl,
+                                            realw* h_mu_kl,
+                                            realw* h_kappa_kl,
+                                            realw* h_cijkl_kl,
+                                            int* NSPEC_AB) {
+  TRACE("transfer_kernels_el_to_host");
+
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*NGLL3*sizeof(realw),
                                      cudaMemcpyDeviceToHost),40101);
-  print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*NGLL3*sizeof(realw),
-                                     cudaMemcpyDeviceToHost),40102);
-  print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*NGLL3*sizeof(realw),
-                                     cudaMemcpyDeviceToHost),40103);
-
+  if( mp->anisotropic_kl ){
+    print_CUDA_error_if_any(cudaMemcpy(h_cijkl_kl,mp->d_cijkl_kl,*NSPEC_AB*21*NGLL3*sizeof(realw),
+                                       cudaMemcpyDeviceToHost),40102);
+  }else{
+    print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*NGLL3*sizeof(realw),
+                                       cudaMemcpyDeviceToHost),40102);
+    print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*NGLL3*sizeof(realw),
+                                       cudaMemcpyDeviceToHost),40103);
+  }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -377,11 +309,12 @@
 extern "C"
 void FC_FUNC_(transfer_kernels_noise_to_host,
               TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
-                                                          realw* h_Sigma_kl,
-                                                          int* NSPEC_AB) {
-TRACE("transfer_kernels_noise_to_host");
+                                              realw* h_Sigma_kl,
+                                              int* NSPEC_AB) {
+  TRACE("transfer_kernels_noise_to_host");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
                                      cudaMemcpyDeviceToHost),40201);
@@ -397,16 +330,17 @@
 
 extern "C"
 void FC_FUNC_(transfer_fields_ac_to_device,
-              TRANSFER_FIELDS_AC_TO_DEVICE)(
-                                                  int* size,
-                                                  realw* potential_acoustic,
-                                                  realw* potential_dot_acoustic,
-                                                  realw* potential_dot_dot_acoustic,
-                                                  long* Mesh_pointer_f) {
-TRACE("transfer_fields_ac_to_device");
+              TRANSFER_FIELDS_AC_TO_DEVICE)(int* size,
+                                            realw* potential_acoustic,
+                                            realw* potential_dot_acoustic,
+                                            realw* potential_dot_dot_acoustic,
+                                            long* Mesh_pointer) {
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  TRACE("transfer_fields_ac_to_device");
 
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
   print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_acoustic,potential_acoustic,
                                      sizeof(realw)*(*size),cudaMemcpyHostToDevice),50110);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_acoustic,potential_dot_acoustic,
@@ -423,16 +357,17 @@
 
 extern "C"
 void FC_FUNC_(transfer_b_fields_ac_to_device,
-              TRANSFER_B_FIELDS_AC_TO_DEVICE)(
-                                                    int* size,
-                                                    realw* b_potential_acoustic,
-                                                    realw* b_potential_dot_acoustic,
-                                                    realw* b_potential_dot_dot_acoustic,
-                                                    long* Mesh_pointer_f) {
-TRACE("transfer_b_fields_ac_to_device");
+              TRANSFER_B_FIELDS_AC_TO_DEVICE)(int* size,
+                                              realw* b_potential_acoustic,
+                                              realw* b_potential_dot_acoustic,
+                                              realw* b_potential_dot_dot_acoustic,
+                                              long* Mesh_pointer) {
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  TRACE("transfer_b_fields_ac_to_device");
 
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_acoustic,b_potential_acoustic,
                                      sizeof(realw)*(*size),cudaMemcpyHostToDevice),51110);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic,
@@ -454,10 +389,11 @@
                                               realw* potential_acoustic,
                                               realw* potential_dot_acoustic,
                                               realw* potential_dot_dot_acoustic,
-                                              long* Mesh_pointer_f) {
-TRACE("transfer_fields_ac_from_device");
+                                              long* Mesh_pointer) {
+  TRACE("transfer_fields_ac_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   print_CUDA_error_if_any(cudaMemcpy(potential_acoustic,mp->d_potential_acoustic,
                                      sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52111);
@@ -475,15 +411,15 @@
 
 extern "C"
 void FC_FUNC_(transfer_b_fields_ac_from_device,
-              TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
-                                                      int* size,
-                                                      realw* b_potential_acoustic,
-                                                      realw* b_potential_dot_acoustic,
-                                                      realw* b_potential_dot_dot_acoustic,
-                                                      long* Mesh_pointer_f) {
-TRACE("transfer_b_fields_ac_from_device");
+              TRANSFER_B_FIELDS_AC_FROM_DEVICE)(int* size,
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_acoustic,
+                                                realw* b_potential_dot_dot_acoustic,
+                                                long* Mesh_pointer) {
+  TRACE("transfer_b_fields_ac_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   print_CUDA_error_if_any(cudaMemcpy(b_potential_acoustic,mp->d_b_potential_acoustic,
                                      sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53111);
@@ -501,11 +437,12 @@
 
 extern "C"
 void FC_FUNC_(transfer_dot_dot_from_device,
-              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer) {
 
   TRACE("transfer_dot_dot_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
                                      sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50041);
@@ -516,11 +453,12 @@
 
 extern "C"
 void FC_FUNC_(transfer_b_dot_dot_from_device,
-              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer) {
 
   TRACE("transfer_b_dot_dot_from_device");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
                                      sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50042);
@@ -532,15 +470,13 @@
 
 extern "C"
 void FC_FUNC_(transfer_kernels_ac_to_host,
-              TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
-                                                             realw* h_rho_ac_kl,
-                                                             realw* h_kappa_ac_kl,
-                                                             int* NSPEC_AB) {
+              TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,realw* h_rho_ac_kl,realw* h_kappa_ac_kl,int* NSPEC_AB) {
 
   TRACE("transfer_kernels_ac_to_host");
 
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
+
   int size = *NSPEC_AB*NGLL3;
 
   // copies kernel values over to CPU host
@@ -558,13 +494,13 @@
 
 extern "C"
 void FC_FUNC_(transfer_kernels_hess_el_tohost,
-              TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
-                                              realw* h_hess_kl,
-                                              int* NSPEC_AB) {
-TRACE("transfer_kernels_hess_el_tohost");
+              TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,realw* h_hess_kl,int* NSPEC_AB) {
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  TRACE("transfer_kernels_hess_el_tohost");
 
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
   print_CUDA_error_if_any(cudaMemcpy(h_hess_kl,mp->d_hess_el_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
                                      cudaMemcpyDeviceToHost),70201);
 }
@@ -573,15 +509,87 @@
 
 extern "C"
 void FC_FUNC_(transfer_kernels_hess_ac_tohost,
-              TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
-                                             realw* h_hess_ac_kl,
-                                             int* NSPEC_AB) {
+              TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,realw* h_hess_ac_kl,int* NSPEC_AB) {
+
   TRACE("transfer_kernels_hess_ac_tohost");
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   print_CUDA_error_if_any(cudaMemcpy(h_hess_ac_kl,mp->d_hess_ac_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
                                      cudaMemcpyDeviceToHost),70202);
 }
 
+// unused...
 
+/* ----------------------------------------------------------------------------------------------- */
+/*
+extern "C"
+void FC_FUNC_(transfer_compute_kernel_answers_from_device,
+              TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
+                                                           realw* rho_kl,int* size_rho,
+                                                           realw* mu_kl, int* size_mu,
+                                                           realw* kappa_kl, int* size_kappa) {
+TRACE("transfer_compute_kernel_answers_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+  if( ! mp->anisotropic_kl ){
+    cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+    cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
+  }
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+/*
+extern "C"
+void FC_FUNC_(transfer_compute_kernel_fields_from_device,
+              TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
+                                                          realw* accel, int* size_accel,
+                                                          realw* b_displ, int* size_b_displ,
+                                                          realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                                          realw* epsilondev_xz,realw* epsilondev_yz,
+                                                          int* size_epsilondev,
+                                                          realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                                          realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                                          int* size_b_epsilondev,
+                                                          realw* rho_kl,int* size_rho,
+                                                          realw* mu_kl, int* size_mu,
+                                                          realw* kappa_kl, int* size_kappa,
+                                                          realw* epsilon_trace_over_3,
+                                                          realw* b_epsilon_trace_over_3,
+                                                          int* size_epsilon_trace_over_3) {
+TRACE("transfer_compute_kernel_fields_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  cudaMemcpy(accel,mp->d_accel,*size_accel*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_displ,mp->d_b_displ,*size_b_displ*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xx,mp->d_b_epsilondev_xx,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_yy,mp->d_b_epsilondev_yy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xy,mp->d_b_epsilondev_xy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xz,mp->d_b_epsilondev_xz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_yz,mp->d_b_epsilondev_yz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  if( ! mp->anisotropic_kl ){
+    cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+    cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
+  }
+
+  cudaMemcpy(epsilon_trace_over_3,mp->d_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
+       cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilon_trace_over_3,mp->d_b_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
+       cudaMemcpyDeviceToHost);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_compute_kernel_fields_from_device");
+#endif
+}
+*/
+

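Throughout transfer_fields_cuda.cu this revision replaces bare cudaMemcpy calls with print_CUDA_error_if_any(cudaMemcpy(...), tag), where the integer tag (40003, 41006, 43011, ...) uniquely identifies which copy failed. The checker itself is defined elsewhere in the CUDA sources; the snippet below is only a hedged sketch of the shape such a helper typically takes (check_cuda_error, d_buf, h_buf and n are illustrative names, not project code), to make the calling convention explicit.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// sketch of a cudaMemcpy checker in the spirit of print_CUDA_error_if_any:
// the first argument is the cudaError_t returned by the runtime call, the second
// a unique integer tag so the failing transfer can be located from the log alone.
static void check_cuda_error(cudaError_t err, int tag) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error %d (tag %d): %s\n", (int) err, tag, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// usage mirroring the calls in this file:
//   check_cuda_error(cudaMemcpy(d_buf, h_buf, n*sizeof(float), cudaMemcpyHostToDevice), 40003);

The project's actual helper may do more than print-and-exit; the sketch only illustrates the call pattern and the unique-tag convention used above.
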
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu	2013-08-20 14:13:26 UTC (rev 22718)
@@ -43,60 +43,44 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+//fortran code snippet...
 /*
-      ! gets global number of that receiver
-    irec = number_receiver_global(irec_local)
+  ! gets global number of that receiver
+  irec = number_receiver_global(irec_local)
 
-    ! gets local receiver interpolators
-    ! (1-D Lagrange interpolators)
-    hxir(:) = hxir_store(irec_local,:)
-    hetar(:) = hetar_store(irec_local,:)
-    hgammar(:) = hgammar_store(irec_local,:)
-
+  ! gets local receiver interpolators
+  ! (1-D Lagrange interpolators)
+  hxir(:) = hxir_store(irec_local,:)
+  hetar(:) = hetar_store(irec_local,:)
+  hgammar(:) = hgammar_store(irec_local,:)
 */
 
 /* ----------------------------------------------------------------------------------------------- */
 
-// Initially sets the blocks_x to be the num_blocks, and adds rows as
-// needed. If an additional row is added, the row length is cut in
-// half. If the block count is odd, there will be 1 too many blocks,
-// which must be managed at runtime with an if statement.
-void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) {
-  *num_blocks_x = num_blocks;
-  *num_blocks_y = 1;
-  while(*num_blocks_x > 65535) {
-    *num_blocks_x = (int) ceil(*num_blocks_x*0.5f);
-    *num_blocks_y = *num_blocks_y*2;
-  }
-  return;
-}
+// unused...
+/*
+__device__ double my_atomicAdd(double* address, double val) {
 
-/* ----------------------------------------------------------------------------------------------- */
-
-__device__ double atomicAdd(double* address, double val)
-{
-    unsigned long long int* address_as_ull =
-                             (unsigned long long int*)address;
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
     unsigned long long int old = *address_as_ull, assumed;
-    do {
-        assumed = old;
-old = atomicCAS(address_as_ull, assumed,
-                        __double_as_longlong(val +
-                               __longlong_as_double(assumed)));
+    do{
+      assumed = old;
+      old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
     } while (assumed != old);
     return __longlong_as_double(old);
 }
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 
 __global__ void compute_interpolated_dva_plus_seismogram(int nrec_local,
-               realw* displ, realw* veloc, realw* accel,
-               int* ibool,
-               double* hxir, double* hetar, double* hgammar,
-               realw* seismograms_d, realw* seismograms_v, realw* seismograms_a,
-               double* nu,
-               int* number_receiver_global,
-               int* ispec_selected_rec) {
+                                                         realw* displ, realw* veloc, realw* accel,
+                                                         int* ibool,
+                                                         double* hxir, double* hetar, double* hgammar,
+                                                         realw* seismograms_d, realw* seismograms_v, realw* seismograms_a,
+                                                         double* nu,
+                                                         int* number_receiver_global,
+                                                         int* ispec_selected_rec) {
   int irec_local = blockIdx.x + blockIdx.y*gridDim.x;
   int i = threadIdx.x;
   int j = threadIdx.y;
@@ -202,12 +186,13 @@
 
 // transfers seismograms from device to host
 
-  TRACE("transfer_seismograms_el_from_d");
+  TRACE("\ttransfer_seismograms_el_from_d");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
 
   int num_blocks_x, num_blocks_y;
   get_blocks_xy(*nrec_local,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(5,5,5);
 
@@ -222,22 +207,23 @@
   // cudaEventRecord( start, 0 );
 
   compute_interpolated_dva_plus_seismogram<<<grid,threads,0,mp->compute_stream>>>(*nrec_local,
-                      mp->d_displ,mp->d_veloc,mp->d_accel,
-                      mp->d_ibool,
-                      mp->d_hxir, mp->d_hetar, mp->d_hgammar,
-                      mp->d_seismograms_d,
-                      mp->d_seismograms_v,
-                      mp->d_seismograms_a,
-                      mp->d_nu,
-                      mp->d_number_receiver_global,
-                      mp->d_ispec_selected_rec
-                      );
+                                                                                  mp->d_displ,mp->d_veloc,mp->d_accel,
+                                                                                  mp->d_ibool,
+                                                                                  mp->d_hxir, mp->d_hetar, mp->d_hgammar,
+                                                                                  mp->d_seismograms_d,
+                                                                                  mp->d_seismograms_v,
+                                                                                  mp->d_seismograms_a,
+                                                                                  mp->d_nu,
+                                                                                  mp->d_number_receiver_global,
+                                                                                  mp->d_ispec_selected_rec
+                                                                                  );
 
   // cudaMemcpy(h_debug,d_debug,125*sizeof(double),cudaMemcpyDeviceToHost);
 
-  cudaMemcpy(mp->h_seismograms_d_it,mp->d_seismograms_d,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
-  cudaMemcpy(mp->h_seismograms_v_it,mp->d_seismograms_v,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
-  cudaMemcpy(mp->h_seismograms_a_it,mp->d_seismograms_a,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
+  // (cudaMemcpy implicitly synchronizes all other cuda operations)
+  print_CUDA_error_if_any(cudaMemcpy(mp->h_seismograms_d_it,mp->d_seismograms_d,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost),72001);
+  print_CUDA_error_if_any(cudaMemcpy(mp->h_seismograms_v_it,mp->d_seismograms_v,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost),72002);
+  print_CUDA_error_if_any(cudaMemcpy(mp->h_seismograms_a_it,mp->d_seismograms_a,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost),72003);
 
   // cudaEventRecord( stop, 0 );
   // cudaEventSynchronize( stop );
@@ -286,37 +272,35 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 void transfer_field_from_device(Mesh* mp, realw* d_field,realw* h_field,
-                                          int* number_receiver_global,
-                                          int* d_ispec_selected,
-                                          int* h_ispec_selected,
-                                          int* ibool) {
+                                int* number_receiver_global,
+                                int* d_ispec_selected,
+                                int* h_ispec_selected,
+                                int* ibool) {
 
-TRACE("transfer_field_from_device");
+TRACE("\ttransfer_field_from_device");
 
   // checks if anything to do
   if( mp->nrec_local == 0 ) return;
 
   int blocksize = NGLL3;
-  int num_blocks_x = mp->nrec_local;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
 
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nrec_local,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
   // prepare field transfer array on device
   transfer_stations_fields_from_device_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_number_receiver_global,
-                                                                d_ispec_selected,
-                                                                mp->d_ibool,
-                                                                mp->d_station_seismo_field,
-                                                                d_field,
-                                                                mp->nrec_local);
+                                                                                      d_ispec_selected,
+                                                                                      mp->d_ibool,
+                                                                                      mp->d_station_seismo_field,
+                                                                                      d_field,
+                                                                                      mp->nrec_local);
 
-  cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
-       (3*NGLL3)*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
+  // (cudaMemcpy implicitly synchronizes all other cuda operations)
+  print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
+                                    (3*NGLL3)*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost),71001);
 
   int irec_local;
   for(irec_local=0;irec_local<mp->nrec_local;irec_local++) {
@@ -422,13 +406,10 @@
 
   // sets up kernel dimensions
   int blocksize = NGLL3;
-  int num_blocks_x = mp->nrec_local;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
 
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nrec_local,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -444,6 +425,7 @@
   exit_on_cuda_error("transfer_field_acoustic_from_device kernel");
 #endif
 
+  // (cudaMemcpy implicitly synchronizes all other cuda operations)
   print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_potential,mp->d_station_seismo_potential,
                                      mp->nrec_local*NGLL3*sizeof(realw),cudaMemcpyDeviceToHost),55000);
 
@@ -474,8 +456,7 @@
 
 extern "C"
 void FC_FUNC_(transfer_station_ac_from_device,
-              TRANSFER_STATION_AC_FROM_DEVICE)(
-                                                realw* potential_acoustic,
+              TRANSFER_STATION_AC_FROM_DEVICE)(realw* potential_acoustic,
                                                 realw* potential_dot_acoustic,
                                                 realw* potential_dot_dot_acoustic,
                                                 realw* b_potential_acoustic,

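Note on the refactoring above: the inline grid-splitting loops removed from write_seismograms_cuda.cu are replaced by calls to a helper get_blocks_xy. The helper itself is not part of this diff; the plain-C sketch below only illustrates what it presumably computes, based on the loop it replaces (the name and signature are taken from the call sites above, the body is an assumption):

  #include <math.h>

  /* Sketch (assumed implementation): fold a 1D block count into a 2D grid
     so that no grid dimension exceeds the 65535-block limit of pre-Kepler GPUs,
     mirroring the inline loop removed in the hunks above. */
  static void get_blocks_xy(int num_blocks, int* num_blocks_x, int* num_blocks_y) {
    int x = num_blocks;
    int y = 1;
    while (x > 65535) {
      x = (int) ceil(x * 0.5);  /* halve the x-dimension ...                   */
      y = y * 2;                /* ... and double y so that x*y >= num_blocks  */
    }
    *num_blocks_x = x;
    *num_blocks_y = y;
  }
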
Modified: seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_MPI.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_MPI.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_MPI.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -215,12 +215,12 @@
 
   ! adds contributions from different partitions to flag arrays
   ! integer arrays
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob,test_flag, &
+  call assemble_MPI_scalar_i_blocking(NPROC,nglob,test_flag, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_dummy,&
                         my_neighbours_ext_mesh)
   ! custom_real arrays
-  call assemble_MPI_scalar_ext_mesh(NPROC,nglob,test_flag_cr, &
+  call assemble_MPI_scalar_blocking(NPROC,nglob,test_flag_cr, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_dummy, &
                         my_neighbours_ext_mesh)

Modified: seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_coupling_surfaces.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_coupling_surfaces.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_coupling_surfaces.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -122,7 +122,7 @@
   enddo
   ! sums acoustic flags
   if( ACOUSTIC_SIMULATION ) then
-    call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, &
+    call assemble_MPI_scalar_i_blocking(NPROC,nglob_dummy,acoustic_flag, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
                         my_neighbours_ext_mesh)
@@ -130,7 +130,7 @@
 
   ! sums elastic flags
   if( ELASTIC_SIMULATION ) then
-    call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, &
+    call assemble_MPI_scalar_i_blocking(NPROC,nglob_dummy,elastic_flag, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
                         my_neighbours_ext_mesh)
@@ -138,7 +138,7 @@
 
   ! sums poroelastic flags
   if( POROELASTIC_SIMULATION ) then
-    call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, &
+    call assemble_MPI_scalar_i_blocking(NPROC,nglob_dummy,poroelastic_flag, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
                         my_neighbours_ext_mesh)

Modified: seismo/3D/SPECFEM3D/trunk/src/generate_databases/memory_eval.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/generate_databases/memory_eval.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/generate_databases/memory_eval.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -32,8 +32,8 @@
  subroutine memory_eval(NSPEC_AB,NGLOB_AB,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh,&
                         APPROXIMATE_OCEAN_LOAD,memory_size)
 
-   use generate_databases_par, only: PML_CONDITIONS,nspec_cpml
-  use create_regions_mesh_ext_par,only: NSPEC_ANISO,ispec_is_acoustic,ispec_is_elastic
+  use generate_databases_par, only: PML_CONDITIONS,nspec_cpml
+  use create_regions_mesh_ext_par,only: NSPEC_ANISO,ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic
 
   implicit none
 
@@ -46,7 +46,7 @@
   ! output
   double precision, intent(out) :: memory_size
   ! local parameters
-  logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION
+  logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION
 
   memory_size = 0.d0
 
@@ -159,6 +159,17 @@
     endif
   endif
 
+  ! poroelastic arrays
+  call any_all_l( ANY(ispec_is_poroelastic), POROELASTIC_SIMULATION )
+  if( POROELASTIC_SIMULATION ) then
+    ! displs_poroelastic,..
+    memory_size = memory_size + 6.d0*dble(NDIM)*NGLOB_AB*dble(CUSTOM_REAL)
+    ! rmass_solid_poroelastic,..
+    memory_size = memory_size + 2.d0*NGLOB_AB*dble(CUSTOM_REAL)
+    ! rhoarraystore,..
+    memory_size = memory_size + 17.d0*dble(NGLLX)*dble(NGLLY)*dble(NGLLZ)*NSPEC_AB*dble(CUSTOM_REAL)
+  endif
+
   ! skipping boundary surfaces
   ! skipping free surfaces
   ! skipping acoustic-elastic coupling surfaces
@@ -171,14 +182,26 @@
   memory_size = memory_size + max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(SIZE_INTEGER)
 
   ! MPI communications
-  ! buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
-  memory_size = memory_size + 2.d0*dble(NDIM)*max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(CUSTOM_REAL)
+  if( ACOUSTIC_SIMULATION ) then
+    ! buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh
+    memory_size = memory_size + 2.d0*max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(CUSTOM_REAL)
+    ! request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
+    memory_size = memory_size + 2.d0*num_interfaces_ext_mesh*dble(SIZE_INTEGER)
+  endif
 
-  ! buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh
-  memory_size = memory_size + 2.d0*max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(CUSTOM_REAL)
+  if( ELASTIC_SIMULATION ) then
+    ! buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
+    memory_size = memory_size + 2.d0*dble(NDIM)*max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(CUSTOM_REAL)
+    ! request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+    memory_size = memory_size + 2.d0*num_interfaces_ext_mesh*dble(SIZE_INTEGER)
+  endif
 
-  ! request_send_vector_ext_mesh,request_recv_vector_ext_mesh,request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
-  memory_size = memory_size + 4.d0*num_interfaces_ext_mesh*dble(SIZE_INTEGER)
+  if( POROELASTIC_SIMULATION ) then
+    ! buffer_send_vector_ext_mesh_s,..
+    memory_size = memory_size + 4.d0*dble(NDIM)*max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(CUSTOM_REAL)
+    ! request_send_vector_ext_mesh_s,..
+    memory_size = memory_size + 4.d0*num_interfaces_ext_mesh*dble(SIZE_INTEGER)
+  endif
 
   ! ispec_is_inner
   memory_size = memory_size + NSPEC_AB*dble(SIZE_LOGICAL)

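For orientation, the new poroelastic terms in memory_eval above translate into bytes as in the worked example below; the mesh sizes (NGLOB_AB, NSPEC_AB) and the 5x5x5 GLL layout are made-up illustration values, not output of the package:

  #include <stdio.h>

  int main(void) {
    /* hypothetical mesh sizes, single precision (CUSTOM_REAL = 4 bytes) */
    const double NGLOB_AB = 1.0e6, NSPEC_AB = 3.0e4;
    const double NDIM = 3.0, NGLL3 = 5.0*5.0*5.0, CUSTOM_REAL = 4.0;

    double bytes = 6.0*NDIM*NGLOB_AB*CUSTOM_REAL     /* displs_poroelastic,.. : 6 NDIM-vector fields  */
                 + 2.0*NGLOB_AB*CUSTOM_REAL          /* rmass_solid_poroelastic,.. : 2 scalar fields  */
                 + 17.0*NGLL3*NSPEC_AB*CUSTOM_REAL;  /* rhoarraystore,.. : 17 GLL arrays per element  */

    printf("poroelastic contribution: %.0f MB\n", bytes/1.0e6);  /* ~335 MB per process here */
    return 0;
  }
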
Modified: seismo/3D/SPECFEM3D/trunk/src/shared/assemble_MPI_scalar.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/shared/assemble_MPI_scalar.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/shared/assemble_MPI_scalar.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -28,17 +28,12 @@
 !---- assemble the contributions between slices using non-blocking MPI
 !----
 
-  subroutine assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,array_val, &
+  subroutine assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,array_val, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
 
-!  subroutine assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,array_val, &
-!                        buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
-!                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-!                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-!                        my_neighbours_ext_mesh, &
-!                        request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
+! assembles scalar field in a blocking way; returns only after values have been assembled
 
   implicit none
 
@@ -54,16 +49,12 @@
   integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
   integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
 
-!  real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
-!       buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh
-!  integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: buffer_send_scalar
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: buffer_recv_scalar
+  integer, dimension(:), allocatable :: request_send_scalar
+  integer, dimension(:), allocatable :: request_recv_scalar
 
-  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: buffer_send_scalar_ext_mesh
-  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: buffer_recv_scalar_ext_mesh
-  integer, dimension(:), allocatable :: request_send_scalar_ext_mesh
-  integer, dimension(:), allocatable :: request_recv_scalar_ext_mesh
 
-
   integer ipoin,iinterface,ier
 
 ! here we have to assemble all the contributions between partitions using MPI
@@ -71,72 +62,70 @@
 ! assemble only if more than one partition
   if(NPROC > 1) then
 
-    allocate(buffer_send_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array buffer_send_scalar_ext_mesh'
-    allocate(buffer_recv_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array buffer_recv_scalar_ext_mesh'
-    allocate(request_send_scalar_ext_mesh(num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array request_send_scalar_ext_mesh'
-    allocate(request_recv_scalar_ext_mesh(num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array request_recv_scalar_ext_mesh'
+    allocate(buffer_send_scalar(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array buffer_send_scalar'
+    allocate(buffer_recv_scalar(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array buffer_recv_scalar'
+    allocate(request_send_scalar(num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array request_send_scalar'
+    allocate(request_recv_scalar(num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array request_recv_scalar'
 
     ! partition border copy into the buffer
     do iinterface = 1, num_interfaces_ext_mesh
       do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
-        buffer_send_scalar_ext_mesh(ipoin,iinterface) = array_val(ibool_interfaces_ext_mesh(ipoin,iinterface))
+        buffer_send_scalar(ipoin,iinterface) = array_val(ibool_interfaces_ext_mesh(ipoin,iinterface))
       enddo
     enddo
 
     ! send messages
     do iinterface = 1, num_interfaces_ext_mesh
       ! non-blocking synchronous send request
-      call isend_cr(buffer_send_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
-           nibool_interfaces_ext_mesh(iinterface), &
-           my_neighbours_ext_mesh(iinterface), &
-           itag, &
-           request_send_scalar_ext_mesh(iinterface) &
-           )
+      call isend_cr(buffer_send_scalar(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+                     nibool_interfaces_ext_mesh(iinterface), &
+                     my_neighbours_ext_mesh(iinterface), &
+                     itag, &
+                     request_send_scalar(iinterface) )
       ! receive request
-      call irecv_cr(buffer_recv_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
-           nibool_interfaces_ext_mesh(iinterface), &
-           my_neighbours_ext_mesh(iinterface), &
-           itag, &
-           request_recv_scalar_ext_mesh(iinterface) &
-           )
+      call irecv_cr(buffer_recv_scalar(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+                     nibool_interfaces_ext_mesh(iinterface), &
+                     my_neighbours_ext_mesh(iinterface), &
+                     itag, &
+                     request_recv_scalar(iinterface) )
     enddo
 
     ! wait for communications completion (recv)
     do iinterface = 1, num_interfaces_ext_mesh
-      call wait_req(request_recv_scalar_ext_mesh(iinterface))
+      call wait_req(request_recv_scalar(iinterface))
     enddo
 
     ! adding contributions of neighbours
     do iinterface = 1, num_interfaces_ext_mesh
       do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
         array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
-             array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_scalar_ext_mesh(ipoin,iinterface)
+             array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_scalar(ipoin,iinterface)
       enddo
     enddo
 
     ! wait for communications completion (send)
     do iinterface = 1, num_interfaces_ext_mesh
-      call wait_req(request_send_scalar_ext_mesh(iinterface))
+      call wait_req(request_send_scalar(iinterface))
     enddo
 
-    deallocate(buffer_send_scalar_ext_mesh)
-    deallocate(buffer_recv_scalar_ext_mesh)
-    deallocate(request_send_scalar_ext_mesh)
-    deallocate(request_recv_scalar_ext_mesh)
+    deallocate(buffer_send_scalar)
+    deallocate(buffer_recv_scalar)
+    deallocate(request_send_scalar)
+    deallocate(request_recv_scalar)
 
   endif
 
-  end subroutine assemble_MPI_scalar_ext_mesh
+  end subroutine assemble_MPI_scalar_blocking
 
 !
 !----
 !
 
-  subroutine assemble_MPI_scalar_i_ext_mesh(NPROC,NGLOB_AB,array_val, &
+  subroutine assemble_MPI_scalar_i_blocking(NPROC,NGLOB_AB,array_val, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
@@ -155,10 +144,10 @@
   integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
   integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
 
-  integer, dimension(:,:), allocatable :: buffer_send_scalar_ext_mesh
-  integer, dimension(:,:), allocatable :: buffer_recv_scalar_ext_mesh
-  integer, dimension(:), allocatable :: request_send_scalar_ext_mesh
-  integer, dimension(:), allocatable :: request_recv_scalar_ext_mesh
+  integer, dimension(:,:), allocatable :: buffer_send_scalar
+  integer, dimension(:,:), allocatable :: buffer_recv_scalar
+  integer, dimension(:), allocatable :: request_send_scalar
+  integer, dimension(:), allocatable :: request_recv_scalar
 
   integer :: ipoin,iinterface,ier
 
@@ -167,74 +156,70 @@
 ! assemble only if more than one partition
   if(NPROC > 1) then
 
-    allocate(buffer_send_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array buffer_send_scalar_ext_mesh'
-    allocate(buffer_recv_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array buffer_recv_scalar_ext_mesh'
-    allocate(request_send_scalar_ext_mesh(num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array request_send_scalar_ext_mesh'
-    allocate(request_recv_scalar_ext_mesh(num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array request_recv_scalar_ext_mesh'
+    allocate(buffer_send_scalar(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array buffer_send_scalar'
+    allocate(buffer_recv_scalar(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array buffer_recv_scalar'
+    allocate(request_send_scalar(num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array request_send_scalar'
+    allocate(request_recv_scalar(num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array request_recv_scalar'
 
     ! partition border copy into the buffer
     do iinterface = 1, num_interfaces_ext_mesh
       do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
-        buffer_send_scalar_ext_mesh(ipoin,iinterface) = &
-          array_val(ibool_interfaces_ext_mesh(ipoin,iinterface))
+        buffer_send_scalar(ipoin,iinterface) = array_val(ibool_interfaces_ext_mesh(ipoin,iinterface))
       enddo
     enddo
 
     ! send messages
     do iinterface = 1, num_interfaces_ext_mesh
       ! non-blocking synchronous send request
-      call isend_i(buffer_send_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
-           nibool_interfaces_ext_mesh(iinterface), &
-           my_neighbours_ext_mesh(iinterface), &
-           itag, &
-           request_send_scalar_ext_mesh(iinterface) &
-           )
+      call isend_i(buffer_send_scalar(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+                   nibool_interfaces_ext_mesh(iinterface), &
+                   my_neighbours_ext_mesh(iinterface), &
+                   itag, &
+                   request_send_scalar(iinterface) )
       ! receive request
-      call irecv_i(buffer_recv_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
-           nibool_interfaces_ext_mesh(iinterface), &
-           my_neighbours_ext_mesh(iinterface), &
-           itag, &
-           request_recv_scalar_ext_mesh(iinterface) &
-           )
+      call irecv_i(buffer_recv_scalar(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+                   nibool_interfaces_ext_mesh(iinterface), &
+                   my_neighbours_ext_mesh(iinterface), &
+                   itag, &
+                   request_recv_scalar(iinterface) )
     enddo
 
     ! wait for communications completion
     do iinterface = 1, num_interfaces_ext_mesh
-      call wait_req(request_recv_scalar_ext_mesh(iinterface))
+      call wait_req(request_recv_scalar(iinterface))
     enddo
 
     ! adding contributions of neighbours
     do iinterface = 1, num_interfaces_ext_mesh
       do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
         array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
-             array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) &
-             + buffer_recv_scalar_ext_mesh(ipoin,iinterface)
+             array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_scalar(ipoin,iinterface)
       enddo
     enddo
 
     ! wait for communications completion (send)
     do iinterface = 1, num_interfaces_ext_mesh
-      call wait_req(request_send_scalar_ext_mesh(iinterface))
+      call wait_req(request_send_scalar(iinterface))
     enddo
 
-    deallocate(buffer_send_scalar_ext_mesh)
-    deallocate(buffer_recv_scalar_ext_mesh)
-    deallocate(request_send_scalar_ext_mesh)
-    deallocate(request_recv_scalar_ext_mesh)
+    deallocate(buffer_send_scalar)
+    deallocate(buffer_recv_scalar)
+    deallocate(request_send_scalar)
+    deallocate(request_recv_scalar)
 
   endif
 
-  end subroutine assemble_MPI_scalar_i_ext_mesh
+  end subroutine assemble_MPI_scalar_i_blocking
 
 !
 !----
 !
 
-  subroutine assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_AB,array_val, &
+  subroutine assemble_MPI_scalar_async_send(NPROC,NGLOB_AB,array_val, &
                         buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
@@ -296,13 +281,13 @@
 
   endif
 
-  end subroutine assemble_MPI_scalar_ext_mesh_s
+  end subroutine assemble_MPI_scalar_async_send
 
 !
 !----
 !
 
-  subroutine assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_AB,array_val, &
+  subroutine assemble_MPI_scalar_async_recv(NPROC,NGLOB_AB,array_val, &
                         buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh, &
                         max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
@@ -354,5 +339,5 @@
 
   endif
 
-  end subroutine assemble_MPI_scalar_ext_mesh_w
+  end subroutine assemble_MPI_scalar_async_recv
 

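The renamed *_blocking routines all follow the same pack / exchange / wait / add pattern shown in the Fortran above. For readers more familiar with the MPI C bindings, here is a condensed sketch of that pattern; it is not SPECFEM3D code, and the names (realw, assemble_scalar_blocking, the itag value) are placeholders:

  #include <mpi.h>
  #include <stdlib.h>

  typedef float realw;   /* stands in for CUSTOM_REAL */

  /* Sketch of the blocking scalar assembly pattern: pack interface points,
     post a non-blocking send/receive per neighbour, wait, add contributions. */
  static void assemble_scalar_blocking(realw *val,
                                       int ninterfaces, int max_nibool,
                                       const int *nibool,      /* points on each interface                      */
                                       const int *ibool,       /* [ninterfaces*max_nibool] global ids (0-based) */
                                       const int *neighbours,  /* neighbour rank of each interface              */
                                       MPI_Comm comm)
  {
    const int itag = 100;  /* placeholder message tag */
    realw *sendbuf = malloc((size_t)ninterfaces*max_nibool*sizeof(realw));
    realw *recvbuf = malloc((size_t)ninterfaces*max_nibool*sizeof(realw));
    MPI_Request *req_send = malloc(ninterfaces*sizeof(MPI_Request));
    MPI_Request *req_recv = malloc(ninterfaces*sizeof(MPI_Request));

    /* partition border copy into the buffer, then post the exchanges */
    for (int i = 0; i < ninterfaces; i++) {
      for (int p = 0; p < nibool[i]; p++)
        sendbuf[i*max_nibool + p] = val[ibool[i*max_nibool + p]];
      MPI_Isend(&sendbuf[i*max_nibool], nibool[i], MPI_FLOAT, neighbours[i], itag, comm, &req_send[i]);
      MPI_Irecv(&recvbuf[i*max_nibool], nibool[i], MPI_FLOAT, neighbours[i], itag, comm, &req_recv[i]);
    }

    /* wait for each receive and add the neighbour contributions */
    for (int i = 0; i < ninterfaces; i++) {
      MPI_Wait(&req_recv[i], MPI_STATUS_IGNORE);
      for (int p = 0; p < nibool[i]; p++)
        val[ibool[i*max_nibool + p]] += recvbuf[i*max_nibool + p];
    }

    /* only return once the sends have completed as well */
    MPI_Waitall(ninterfaces, req_send, MPI_STATUSES_IGNORE);

    free(sendbuf); free(recvbuf); free(req_send); free(req_recv);
  }
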
Modified: seismo/3D/SPECFEM3D/trunk/src/shared/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/shared/constants.h.in	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/shared/constants.h.in	2013-08-20 14:13:26 UTC (rev 22718)
@@ -115,7 +115,7 @@
 
 ! to plot total energy curves, for instance to monitor how CPML absorbing layers behave;
 ! should be turned OFF in most cases
-  logical, parameter :: output_energy = .false. ! .true.
+  logical, parameter :: OUTPUT_ENERGY = .false. ! .true.
   integer, parameter :: IOUT_ENERGY = 937  ! file number for the energy file
   integer, parameter :: NTSTEP_BETWEEN_OUTPUT_ENERGY = 10  ! how often we compute energy (which is expensive to compute)
 

Modified: seismo/3D/SPECFEM3D/trunk/src/shared/detect_surface.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/shared/detect_surface.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/shared/detect_surface.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -88,7 +88,7 @@
   enddo
 
   ! adds contributions from different partitions to valence_external_mesh
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob,valence_external_mesh, &
+  call assemble_MPI_scalar_i_blocking(NPROC,nglob,valence_external_mesh, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
@@ -335,7 +335,7 @@
   enddo
 
 ! adds contributions from different partitions to valence_external_mesh
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob,valence_external_mesh, &
+  call assemble_MPI_scalar_i_blocking(NPROC,nglob,valence_external_mesh, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
@@ -738,7 +738,7 @@
   enddo
 
 ! adds contributions from different partitions to valence_external_mesh
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob,valence_external_mesh, &
+  call assemble_MPI_scalar_i_blocking(NPROC,nglob,valence_external_mesh, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in	2013-08-20 14:13:26 UTC (rev 22718)
@@ -45,14 +45,30 @@
 @COND_CUDA_TRUE@NVCC = nvcc
 @COND_CUDA_FALSE@NVCC = @CC@
 
+# GPU architecture
 
-@COND_CUDA_TRUE@NVCC_FLAGS_BASE = $(CUDA_INC) $(MPI_INC) $(COND_MPI_CPPFLAGS)
-@COND_CUDA_TRUE@@COND_CUDA5_TRUE@NVCC_FLAGS = $(NVCC_FLAGS_BASE) -dc -DCUDA -gencode=arch=compute_35,code=sm_35
-@COND_CUDA_TRUE@@COND_CUDA5_FALSE@NVCC_FLAGS = $(NVCC_FLAGS_BASE) -DCUDA -DUSE_OLDER_CUDA4_GPU -gencode=arch=compute_20,code=sm_20
+# CUDA architecture / code version
+# Tesla (compute_10): -gencode=arch=compute_10,code=sm_10 not supported
+# Fermi (default): -gencode=arch=compute_20,code=sm_20
+# Geforce GT 650m: -gencode=arch=compute_30,code=sm_30
+# Kepler (cuda5) : -gencode=arch=compute_35,code=sm_35
+GENCODE_20 = -gencode=arch=compute_20,code=\"sm_20,compute_20\"
+GENCODE_30 = -gencode=arch=compute_30,code=\"sm_30,compute_30\"
+GENCODE_35 = -gencode=arch=compute_35,code=\"sm_35,compute_35\"
 
+# CUDA version 5.x
+@COND_CUDA_TRUE@@COND_CUDA5_TRUE@GENCODE = $(GENCODE_35)
+# CUDA version 4.x
+@COND_CUDA_TRUE@@COND_CUDA5_FALSE@GENCODE = $(GENCODE_20)
+
+# CUDA flags and linking
+@COND_CUDA_TRUE@NVCC_FLAGS_BASE = $(CUDA_INC) $(MPI_INC) $(COND_MPI_CPPFLAGS)
+@COND_CUDA_TRUE@@COND_CUDA5_TRUE@NVCC_FLAGS = $(NVCC_FLAGS_BASE) -dc -DCUDA $(GENCODE)
+@COND_CUDA_TRUE@@COND_CUDA5_FALSE@NVCC_FLAGS = $(NVCC_FLAGS_BASE) -DCUDA -DUSE_OLDER_CUDA4_GPU $(GENCODE)
+
 @COND_CUDA_TRUE@@COND_CUDA5_TRUE@NVCCLINK_BASE = $(NVCC) $(CUDA_INC) $(MPI_INC) $(COND_MPI_CPPFLAGS) -DCUDA
-@COND_CUDA_TRUE@@COND_CUDA5_TRUE@NVCCLINK = $(NVCCLINK_BASE) -dlink -gencode=arch=compute_35,code=sm_35
-@COND_CUDA_TRUE@@COND_CUDA5_FALSE@NVCCLINK = $(NVCCLINK_BASE) -DUSE_OLDER_CUDA4_GPU -gencode=arch=compute_20,code=sm_20
+@COND_CUDA_TRUE@@COND_CUDA5_TRUE@NVCCLINK = $(NVCCLINK_BASE) -dlink $(GENCODE)
+@COND_CUDA_TRUE@@COND_CUDA5_FALSE@NVCCLINK = $(NVCCLINK_BASE) -DUSE_OLDER_CUDA4_GPU $(GENCODE)
 
 @COND_CUDA_FALSE@NVCC_FLAGS = $(MPI_INC) $(COND_MPI_CPPFLAGS)
 @COND_CUDA_FALSE@NVCCLINK = $(NVCC) $(NVCC_FLAGS)
@@ -100,6 +116,8 @@
 ARFLAGS = cru
 RANLIB = ranlib
 
+#######################################
+
 libspecfem_a_OBJECTS = \
 	$O/assemble_MPI_scalar.shared.o \
 	$O/check_mesh_resolution.shared.o \
@@ -172,10 +190,11 @@
 	$O/specfem3D_par.o \
 	$O/pml_par.o \
 	$O/assemble_MPI_vector.o \
+	$O/check_stability.o \
 	$O/fault_solver_common.o \
 	$O/fault_solver_dynamic.o \
 	$O/fault_solver_kinematic.o \
-        $O/gravity_perturbation.o \
+	$O/gravity_perturbation.o \
 	$O/compute_add_sources_acoustic.o \
 	$O/compute_add_sources_viscoelastic.o \
 	$O/compute_add_sources_poroelastic.o \
@@ -201,6 +220,7 @@
 	$O/compute_stacey_acoustic.o \
 	$O/compute_stacey_viscoelastic.o \
 	$O/compute_stacey_poroelastic.o \
+	$O/compute_total_energy.o \
 	$O/create_color_image.o \
 	$O/detect_mesh_surfaces.o \
 	$O/finalize_simulation.o \
@@ -220,6 +240,7 @@
 	$O/setup_movie_meshes.o \
 	$O/setup_sources_receivers.o \
 	$O/specfem3D.o \
+	$O/update_displacement_scheme.o \
 	$O/write_movie_output.o \
 	$O/write_output_ASCII.o \
 	$O/write_output_SU.o \
@@ -271,6 +292,8 @@
 @COND_PYRE_FALSE@@COND_CUDA_TRUE@XSPECFEM_OBJECTS = $(SOLVER_ARRAY_OBJECTS) $(LIBSPECFEM) $(CUDA_OBJECTS)
 @COND_PYRE_FALSE@@COND_CUDA_FALSE@XSPECFEM_OBJECTS = $(SOLVER_ARRAY_OBJECTS) $(LIBSPECFEM) $(CUDA_STUBS)
 
+#######################################
+
 ####
 #### targets
 ####
@@ -292,6 +315,7 @@
 
 specfem3D: xspecfem3D
 
+#######################################
 
 ####
 #### rules for executables
@@ -343,6 +367,9 @@
         xcreate_movie_shakemap_AVS_DX_GMT xcombine_vol_data xcombine_surf_data \
         xsmooth_vol_data xmodel_update xsum_kernels
 
+
+#######################################
+
 ###
 ### rule for the archive library
 ###
@@ -352,6 +379,8 @@
 	$(AR) $(ARFLAGS) $L/libspecfem.a $(libspecfem_a_OBJECTS)
 	$(RANLIB) $L/libspecfem.a
 
+#######################################
+
 ####
 #### rule to build each .o file below
 ####
@@ -372,6 +401,7 @@
 $O/%.shared.o: ${SHARED}%.F90 $(SHARED)constants.h
 	${FCCOMPILE_CHECK} -c -o $@ $<
 
+#######################################
 
 ###
 ### OpenMP compilation
@@ -380,12 +410,20 @@
 	${FCCOMPILE_CHECK} -c -o $@ $<
 
 
+#######################################
+
 ###
 ### CUDA compilation
 ###
 $O/%.cuda.o: ${CUDAD}%.cu ../../config.h $(CUDAD)mesh_constants_cuda.h $(CUDAD)prepare_constants_cuda.h
 	$(NVCC) -c $< -o $@ $(NVCC_FLAGS)
 
+$O/%.cudacc.o: ${CUDAD}%.c ../../config.h
+	${CC} -c $(CFLAGS) $(MPI_INC) -o $@ ${CUDAD}$< -I../../
+
+
+#######################################
+
 ###
 ### C compilation
 ###
@@ -395,9 +433,8 @@
 $O/%.cc.o: ${SHARED}%.c ../../config.h
 	${CC} -c $(CFLAGS) $(MPI_INC) -o $@ ${SHARED}$< -I../../
 
-$O/%.cudacc.o: ${CUDAD}%.c ../../config.h
-	${CC} -c $(CFLAGS) $(MPI_INC) -o $@ ${CUDAD}$< -I../../
 
+#######################################
 
 ###
 ### MPI compilation without optimization
@@ -409,19 +446,24 @@
 $O/serial.o: $(SHARED)constants.h $(SHARED)serial.f90
 	${FCCOMPILE_CHECK} -c -o $O/serial.o $(SHARED)serial.f90
 
-$O/smooth_vol_data.o: $(SHARED)constants.h $(SHARED)smooth_vol_data.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/smooth_vol_data.o $(SHARED)smooth_vol_data.f90
 
 
+#######################################
 
 
 ##
 ## kernel summation
 ##
 
+$O/smooth_vol_data.o: $(SHARED)constants.h $(SHARED)smooth_vol_data.f90
+	${MPIFCCOMPILE_CHECK} -c -o $O/smooth_vol_data.o $(SHARED)smooth_vol_data.f90
+
 $O/sum_kernels.o: $(SHARED)constants.h $(SHARED)sum_kernels.f90
 	${MPIFCCOMPILE_CHECK} -c -o $O/sum_kernels.o $(SHARED)sum_kernels.f90
 
+
+#######################################
+
 ##
 ## model update
 ##

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -28,11 +28,13 @@
 !---- assemble the contributions between slices using non-blocking MPI
 !----
 
-  subroutine assemble_MPI_vector_ext_mesh(NPROC,NGLOB_AB,array_val, &
+  subroutine assemble_MPI_vector_blocking(NPROC,NGLOB_AB,array_val, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
 
+! assembles vector field in a blocking way; returns only after values have been received and assembled
+
   implicit none
 
   include "constants.h"
@@ -50,15 +52,12 @@
   ! local parameters
 
   ! send/receive temporary buffers
-  !real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
-  !     buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
-  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_send_vector_ext_mesh
-  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_recv_vector_ext_mesh
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_send_vector
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_recv_vector
 
   ! requests
-  !integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
-  integer, dimension(:), allocatable :: request_send_vector_ext_mesh
-  integer, dimension(:), allocatable :: request_recv_vector_ext_mesh
+  integer, dimension(:), allocatable :: request_send_vector
+  integer, dimension(:), allocatable :: request_recv_vector
 
   integer ipoin,iinterface,ier
 
@@ -68,72 +67,68 @@
 ! assemble only if more than one partition
   if(NPROC > 1) then
 
-    allocate(buffer_send_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array buffer_send_vector_ext_mesh'
-    allocate(buffer_recv_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array buffer_recv_vector_ext_mesh'
-    allocate(request_send_vector_ext_mesh(num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array request_send_vector_ext_mesh'
-    allocate(request_recv_vector_ext_mesh(num_interfaces_ext_mesh),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array request_recv_vector_ext_mesh'
+    allocate(buffer_send_vector(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array buffer_send_vector'
+    allocate(buffer_recv_vector(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array buffer_recv_vector'
+    allocate(request_send_vector(num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array request_send_vector'
+    allocate(request_recv_vector(num_interfaces_ext_mesh),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array request_recv_vector'
 
     ! partition border copy into the buffer
     do iinterface = 1, num_interfaces_ext_mesh
       do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
-        buffer_send_vector_ext_mesh(:,ipoin,iinterface) = &
-          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface))
+        buffer_send_vector(:,ipoin,iinterface) = array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface))
       enddo
     enddo
 
     ! send messages
     do iinterface = 1, num_interfaces_ext_mesh
-      call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), &
-           NDIM*nibool_interfaces_ext_mesh(iinterface), &
-           my_neighbours_ext_mesh(iinterface), &
-           itag, &
-           request_send_vector_ext_mesh(iinterface) &
-           )
-      call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), &
-           NDIM*nibool_interfaces_ext_mesh(iinterface), &
-           my_neighbours_ext_mesh(iinterface), &
-           itag, &
-           request_recv_vector_ext_mesh(iinterface) &
-           )
+      call isend_cr(buffer_send_vector(1,1,iinterface), &
+                     NDIM*nibool_interfaces_ext_mesh(iinterface), &
+                     my_neighbours_ext_mesh(iinterface), &
+                     itag, &
+                     request_send_vector(iinterface) )
+      call irecv_cr(buffer_recv_vector(1,1,iinterface), &
+                     NDIM*nibool_interfaces_ext_mesh(iinterface), &
+                     my_neighbours_ext_mesh(iinterface), &
+                     itag, &
+                     request_recv_vector(iinterface) )
     enddo
 
     ! wait for communications completion (recv)
     do iinterface = 1, num_interfaces_ext_mesh
-      call wait_req(request_recv_vector_ext_mesh(iinterface))
+      call wait_req(request_recv_vector(iinterface))
     enddo
 
     ! adding contributions of neighbours
     do iinterface = 1, num_interfaces_ext_mesh
       do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
         array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
-             array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) &
-             + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+             array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector(:,ipoin,iinterface)
       enddo
     enddo
 
     ! wait for communications completion (send)
     do iinterface = 1, num_interfaces_ext_mesh
-      call wait_req(request_send_vector_ext_mesh(iinterface))
+      call wait_req(request_send_vector(iinterface))
     enddo
 
-    deallocate(buffer_send_vector_ext_mesh)
-    deallocate(buffer_recv_vector_ext_mesh)
-    deallocate(request_send_vector_ext_mesh)
-    deallocate(request_recv_vector_ext_mesh)
+    deallocate(buffer_send_vector)
+    deallocate(buffer_recv_vector)
+    deallocate(request_send_vector)
+    deallocate(request_recv_vector)
 
   endif
 
-  end subroutine assemble_MPI_vector_ext_mesh
+  end subroutine assemble_MPI_vector_blocking
 
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,array_val, &
+  subroutine assemble_MPI_vector_async_send(NPROC,NGLOB_AB,array_val, &
                                            buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
                                            num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
@@ -194,19 +189,95 @@
 
     endif
 
-  end subroutine assemble_MPI_vector_ext_mesh_s
+  end subroutine assemble_MPI_vector_async_send
 
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_AB,array_val, &
-            buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-            request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+! unused: the new ordered routine assemble_MPI_vector_async_w_ord below is used instead
+!
+!  subroutine assemble_MPI_vector_async_recv(NPROC,NGLOB_AB,array_val, &
+!                                            buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh, &
+!                                            max_nibool_interfaces_ext_mesh, &
+!                                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+!                                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+!
+!! waits for data to receive and assembles
+!
+!  implicit none
+!
+!  include "constants.h"
+!
+!  integer :: NPROC
+!  integer :: NGLOB_AB
+!
+!! array to assemble
+!  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
+!
+!  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+!
+!  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+!       buffer_recv_vector_ext_mesh
+!
+!  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+!  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+!  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+!
+!  integer ipoin,iinterface
+!
+!! here we have to assemble all the contributions between partitions using MPI
+!
+!! assemble only if more than one partition
+!  if(NPROC > 1) then
+!
+!! wait for communications completion (recv)
+!  do iinterface = 1, num_interfaces_ext_mesh
+!    call wait_req(request_recv_vector_ext_mesh(iinterface))
+!  enddo
+!
+!! adding contributions of neighbours
+!  do iinterface = 1, num_interfaces_ext_mesh
+!    do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+!      array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+!           array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+!    enddo
+!  enddo
+!
+!! wait for communications completion (send)
+!  do iinterface = 1, num_interfaces_ext_mesh
+!    call wait_req(request_send_vector_ext_mesh(iinterface))
+!  enddo
+!
+!  endif
+!
+!  end subroutine assemble_MPI_vector_async_recv
+!
+!
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine assemble_MPI_vector_async_w_ord(NPROC,NGLOB_AB,array_val, &
+                                            buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh, &
+                                            max_nibool_interfaces_ext_mesh, &
+                                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
+                                            my_neighbours_ext_mesh,myrank)
+
 ! waits for data to receive and assembles
 
+! The goal of this version is to avoid different round-off errors in different processors.
+! The contribution of each processor is added following the order of its rank.
+! This guarantees that the sums are done in the same order on all processors.
+!
+! NOTE: this version assumes that the interfaces are ordered by increasing rank of the neighbour.
+! This is currently done in subroutine write_interfaces_database in decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90
+! A safety test could be added here.
+!
+! October 2012 - Surendra Somala and Jean-Paul Ampuero - Caltech Seismolab
+
   implicit none
 
   include "constants.h"
@@ -217,7 +288,7 @@
 ! array to assemble
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
 
-  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,myrank
 
   real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
        buffer_recv_vector_ext_mesh
@@ -225,38 +296,68 @@
   integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
   integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
   integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
 
-  integer ipoin,iinterface
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: mybuffer
+  integer :: ipoin,iinterface,iglob
+  logical :: need_add_my_contrib
 
 ! here we have to assemble all the contributions between partitions using MPI
 
 ! assemble only if more than one partition
-  if(NPROC > 1) then
+  if (NPROC == 1) return
 
+! move interface values of array_val to local buffers
+  do iinterface = 1, num_interfaces_ext_mesh
+    do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+      iglob = ibool_interfaces_ext_mesh(ipoin,iinterface)
+      mybuffer(:,ipoin,iinterface) = array_val(:,iglob)
+     ! set them to zero right away to avoid counting them more than once during assembly:
+     ! buffers of higher rank get zeros on nodes shared with current buffer
+      array_val(:,iglob) = 0._CUSTOM_REAL
+    enddo
+  enddo
+
 ! wait for communications completion (recv)
   do iinterface = 1, num_interfaces_ext_mesh
     call wait_req(request_recv_vector_ext_mesh(iinterface))
   enddo
 
-! adding contributions of neighbours
+! adding all contributions in order of processor rank
+  need_add_my_contrib = .true.
   do iinterface = 1, num_interfaces_ext_mesh
+    if (need_add_my_contrib .and. myrank < my_neighbours_ext_mesh(iinterface)) call add_my_contrib()
     do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
-      array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
-           array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) &
-           + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+      iglob = ibool_interfaces_ext_mesh(ipoin,iinterface)
+      array_val(:,iglob) = array_val(:,iglob) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
     enddo
   enddo
+  if (need_add_my_contrib) call add_my_contrib()
 
 ! wait for communications completion (send)
   do iinterface = 1, num_interfaces_ext_mesh
     call wait_req(request_send_vector_ext_mesh(iinterface))
   enddo
 
-  endif
+  contains
 
-  end subroutine assemble_MPI_vector_ext_mesh_w
+    subroutine add_my_contrib()
 
+    integer :: my_iinterface,my_ipoin
 
+    do my_iinterface = 1, num_interfaces_ext_mesh
+      do my_ipoin = 1, nibool_interfaces_ext_mesh(my_iinterface)
+        iglob = ibool_interfaces_ext_mesh(my_ipoin,my_iinterface)
+        array_val(:,iglob) = array_val(:,iglob) + mybuffer(:,my_ipoin,my_iinterface)
+      enddo
+    enddo
+    need_add_my_contrib = .false.
+
+    end subroutine add_my_contrib
+
+  end subroutine assemble_MPI_vector_async_w_ord
+
+
 !
 !--------------------------------------------------------------------------------------------------
 !
@@ -436,9 +537,6 @@
   integer, dimension(num_interfaces_ext_mesh) :: request_recv_vector_ext_mesh
 
   ! local parameters
-  !integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
-  !integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
-  !integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh
   integer :: iinterface
 
   ! here we have to assemble all the contributions between partitions using MPI
@@ -572,17 +670,15 @@
      ! send messages
      do iinterface = 1, num_interfaces_ext_mesh
         call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), &
-             NDIM*nibool_interfaces_ext_mesh(iinterface), &
-             my_neighbours_ext_mesh(iinterface), &
-             itag, &
-             request_send_vector_ext_mesh(iinterface) &
-             )
+                       NDIM*nibool_interfaces_ext_mesh(iinterface), &
+                       my_neighbours_ext_mesh(iinterface), &
+                       itag, &
+                       request_send_vector_ext_mesh(iinterface) )
         call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), &
-             NDIM*nibool_interfaces_ext_mesh(iinterface), &
-             my_neighbours_ext_mesh(iinterface), &
-             itag, &
-             request_recv_vector_ext_mesh(iinterface) &
-             )
+                       NDIM*nibool_interfaces_ext_mesh(iinterface), &
+                       my_neighbours_ext_mesh(iinterface), &
+                       itag, &
+                       request_recv_vector_ext_mesh(iinterface) )
      enddo
 
   endif
@@ -610,11 +706,11 @@
   integer :: NPROC
   integer :: NGLOB_AB
   integer(kind=8) :: Mesh_pointer
-! array to assemble
+
+  ! array to assemble
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
 
   integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
-
   real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
        buffer_recv_vector_ext_mesh
 
@@ -638,10 +734,11 @@
     enddo
 
     ! adding contributions of neighbours
-    call transfer_asmbl_accel_to_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, &
-                                      num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
-                                      nibool_interfaces_ext_mesh,&
-                                      ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
+    call transfer_asmbl_accel_to_device(Mesh_pointer, array_val, &
+                                        buffer_recv_vector_ext_mesh, &
+                                        num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+                                        nibool_interfaces_ext_mesh,&
+                                        ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
 
     ! This step is done via previous function transfer_and_assemble...
     ! do iinterface = 1, num_interfaces_ext_mesh
@@ -666,11 +763,11 @@
 !
 
   subroutine assemble_MPI_scalar_send_cuda(NPROC, &
-                        buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh, &
-                        my_neighbours_ext_mesh, &
-                        request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
+                                           buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
+                                           num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                           nibool_interfaces_ext_mesh, &
+                                           my_neighbours_ext_mesh, &
+                                           request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
 
 ! non-blocking MPI send
 
@@ -700,21 +797,24 @@
 
     ! send messages
     do iinterface = 1, num_interfaces_ext_mesh
+      ! note: passing arguments:
+      !          **array**(1:nibool_interfaces_ext_mesh(iinterface),iinterface)
+      !       might lead to additional memory-copy operations with certain compilers (which slows down performance);
+      !       to avoid this in Fortran, one can simply pass the pointer to the first array element instead:
+      !          **array**(1,iinterface)
+
       ! non-blocking synchronous send request
       call isend_cr(buffer_send_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
-           nibool_interfaces_ext_mesh(iinterface), &
-           my_neighbours_ext_mesh(iinterface), &
-           itag, &
-           request_send_scalar_ext_mesh(iinterface) &
-           )
+                    nibool_interfaces_ext_mesh(iinterface), &
+                    my_neighbours_ext_mesh(iinterface), &
+                    itag, &
+                    request_send_scalar_ext_mesh(iinterface) )
       ! receive request
       call irecv_cr(buffer_recv_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
-           nibool_interfaces_ext_mesh(iinterface), &
-           my_neighbours_ext_mesh(iinterface), &
-           itag, &
-           request_recv_scalar_ext_mesh(iinterface) &
-           )
-
+                    nibool_interfaces_ext_mesh(iinterface), &
+                    my_neighbours_ext_mesh(iinterface), &
+                    itag, &
+                    request_recv_scalar_ext_mesh(iinterface) )
     enddo
 
   endif
@@ -756,9 +856,9 @@
   integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
   integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
 
-  integer FORWARD_OR_ADJOINT
+  integer :: FORWARD_OR_ADJOINT
 
-  integer iinterface ! ipoin
+  integer :: iinterface ! ipoin
 
   ! assemble only if more than one partition
   if(NPROC > 1) then
@@ -769,9 +869,7 @@
     enddo
 
     ! adding contributions of neighbours
-    call transfer_asmbl_pot_to_device(Mesh_pointer, array_val, buffer_recv_scalar_ext_mesh, &
-                num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, nibool_interfaces_ext_mesh,&
-                ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
+    call transfer_asmbl_pot_to_device(Mesh_pointer,array_val,buffer_recv_scalar_ext_mesh,FORWARD_OR_ADJOINT)
 
     ! note: adding contributions of neighbours has been done just above for cuda
     !do iinterface = 1, num_interfaces_ext_mesh
@@ -791,109 +889,3 @@
 
   end subroutine assemble_MPI_scalar_write_cuda
 
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine assemble_MPI_vector_ext_mesh_w_ordered(NPROC,NGLOB_AB,array_val, &
-            buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-            request_send_vector_ext_mesh,request_recv_vector_ext_mesh,my_neighbours_ext_mesh,myrank)
-
-! waits for data to receive and assembles
-
-! The goal of this version is to avoid different round-off errors in different processors.
-! The contribution of each processor is added following the order of its rank.
-! This guarantees that the sums are done in the same order on all processors.
-!
-! NOTE: this version assumes that the interfaces are ordered by increasing rank of the neighbour.
-! That is currently done so in subroutine write_interfaces_database in decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90
-! A safety test could be added here.
-!
-! October 2012 - Surendra Somala and Jean-Paul Ampuero - Caltech Seismolab
-
-  implicit none
-
-  include "constants.h"
-
-  integer :: NPROC
-  integer :: NGLOB_AB
-
-! array to assemble
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
-
-  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,myrank
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
-       buffer_recv_vector_ext_mesh
-
-  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
-  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: mybuffer
-  integer :: ipoin,iinterface,iglob
-  logical :: need_add_my_contrib
-
-! here we have to assemble all the contributions between partitions using MPI
-
-! assemble only if more than one partition
-  if (NPROC == 1) return
-
-! move interface values of array_val to local buffers
-  do iinterface = 1, num_interfaces_ext_mesh
-    do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
-      iglob = ibool_interfaces_ext_mesh(ipoin,iinterface)
-      mybuffer(:,ipoin,iinterface) = array_val(:,iglob)
-     ! set them to zero right away to avoid counting it more than once during assembly:
-     ! buffers of higher rank get zeros on nodes shared with current buffer
-      array_val(:,iglob) = 0._CUSTOM_REAL
-    enddo
-  enddo
-
-! wait for communications completion (recv)
-  do iinterface = 1, num_interfaces_ext_mesh
-    call wait_req(request_recv_vector_ext_mesh(iinterface))
-  enddo
-
-! adding all contributions in order of processor rank
-  need_add_my_contrib = .true.
-  do iinterface = 1, num_interfaces_ext_mesh
-    if (need_add_my_contrib .and. myrank < my_neighbours_ext_mesh(iinterface)) call add_my_contrib()
-    do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
-      iglob = ibool_interfaces_ext_mesh(ipoin,iinterface)
-      array_val(:,iglob) = array_val(:,iglob) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
-    enddo
-  enddo
-  if (need_add_my_contrib) call add_my_contrib()
-
-! wait for communications completion (send)
-  do iinterface = 1, num_interfaces_ext_mesh
-    call wait_req(request_send_vector_ext_mesh(iinterface))
-  enddo
-
-  contains
-
-    subroutine add_my_contrib()
-
-    integer :: my_iinterface,my_ipoin
-
-    do my_iinterface = 1, num_interfaces_ext_mesh
-      do my_ipoin = 1, nibool_interfaces_ext_mesh(my_iinterface)
-        iglob = ibool_interfaces_ext_mesh(my_ipoin,my_iinterface)
-        array_val(:,iglob) = array_val(:,iglob) + mybuffer(:,my_ipoin,my_iinterface)
-      enddo
-    enddo
-    need_add_my_contrib = .false.
-
-    end subroutine add_my_contrib
-
-  end subroutine assemble_MPI_vector_ext_mesh_w_ordered
-
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-

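The comment block in assemble_MPI_vector_async_w_ord above addresses floating-point non-associativity: if each process adds its neighbour contributions in a different order, the assembled values differ in the last bits and runs are not reproducible across partitions. A tiny standalone C illustration of that effect (the values are made up):

  #include <stdio.h>

  int main(void) {
    /* single-precision addition is not associative, so summation order matters */
    float a = 1.0e8f, b = -1.0e8f, c = 1.0f;
    float left_to_right = (a + b) + c;   /* = 1.0                                       */
    float other_order   = a + (b + c);   /* = 0.0, since b + c rounds back to -1.0e8    */
    printf("%g vs %g\n", left_to_right, other_order);
    return 0;
  }

Adding every contribution in increasing rank order, as the routine above does, makes all processes perform the same sequence of additions and therefore obtain identical assembled values.
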
Added: seismo/3D/SPECFEM3D/trunk/src/specfem3D/check_stability.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/check_stability.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/check_stability.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -0,0 +1,290 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 1
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                             July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+!
+! United States and French Government Sponsorship Acknowledged.
+
+  subroutine check_stability()
+
+! computes the maximum of the norm of the displacement
+! in all the slices using an MPI reduction
+! and outputs a timestamp file to check that the simulation is running fine
+
+  use specfem_par
+  use specfem_par_elastic
+  use specfem_par_poroelastic
+  use specfem_par_acoustic
+
+  implicit none
+
+  double precision :: tCPU,t_remain,t_total
+  integer :: ihours,iminutes,iseconds,int_tCPU, &
+             ihours_remain,iminutes_remain,iseconds_remain,int_t_remain, &
+             ihours_total,iminutes_total,iseconds_total,int_t_total
+
+  ! maximum of the norm of the displacement
+  real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all ! elastic
+  real(kind=CUSTOM_REAL) Usolidnormp,Usolidnormp_all ! acoustic
+  real(kind=CUSTOM_REAL) Usolidnorms,Usolidnorms_all ! solid poroelastic
+  real(kind=CUSTOM_REAL) Usolidnormw,Usolidnormw_all ! fluid (relative to solid) poroelastic
+
+  ! norm of the backward displacement
+  real(kind=CUSTOM_REAL) b_Usolidnorm, b_Usolidnorm_all
+  real(kind=CUSTOM_REAL) b_Usolidnormp, b_Usolidnormp_all
+  real(kind=CUSTOM_REAL) b_Usolidnorms, b_Usolidnorms_all
+  real(kind=CUSTOM_REAL) b_Usolidnormw, b_Usolidnormw_all
+
+  ! initializes
+  Usolidnorm_all = 0.0_CUSTOM_REAL
+  Usolidnormp_all = 0.0_CUSTOM_REAL
+  Usolidnorms_all = 0.0_CUSTOM_REAL
+  Usolidnormw_all = 0.0_CUSTOM_REAL
+
+  ! compute maximum of norm of displacement in each slice
+  if( ELASTIC_SIMULATION ) then
+    if( GPU_MODE) then
+      ! way 2: just get maximum of field from GPU
+      call get_norm_elastic_from_device(Usolidnorm,Mesh_pointer,1)
+    else
+      Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))
+    endif
+
+    ! check stability of the code, exit if unstable
+    ! negative values can occur with some compilers when the unstable value is greater
+    ! than the greatest possible floating-point number of the machine
+    !if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0.0_CUSTOM_REAL) &
+    !  call exit_MPI(myrank,'single forward simulation became unstable and blew up')
+
+    ! compute the maximum of the maxima for all the slices using an MPI reduction
+    call max_all_cr(Usolidnorm,Usolidnorm_all)
+  endif
+
+  if( ACOUSTIC_SIMULATION ) then
+    if(GPU_MODE) then
+      ! way 2: just get maximum of field from GPU
+      call get_norm_acoustic_from_device(Usolidnormp,Mesh_pointer,1)
+    else
+      Usolidnormp = maxval(abs(potential_dot_dot_acoustic(:)))
+    endif
+
+    ! compute the maximum of the maxima for all the slices using an MPI reduction
+    call max_all_cr(Usolidnormp,Usolidnormp_all)
+  endif
+
+  if( POROELASTIC_SIMULATION ) then
+    Usolidnorms = maxval(sqrt(displs_poroelastic(1,:)**2 + displs_poroelastic(2,:)**2 + &
+                             displs_poroelastic(3,:)**2))
+    Usolidnormw = maxval(sqrt(displw_poroelastic(1,:)**2 + displw_poroelastic(2,:)**2 + &
+                             displw_poroelastic(3,:)**2))
+
+    ! compute the maximum of the maxima for all the slices using an MPI reduction
+    call max_all_cr(Usolidnorms,Usolidnorms_all)
+    call max_all_cr(Usolidnormw,Usolidnormw_all)
+  endif
+
+
+  ! adjoint simulations
+  if( SIMULATION_TYPE == 3 ) then
+    ! initializes backward field norms
+    b_Usolidnorm_all = 0.0_CUSTOM_REAL
+    b_Usolidnormp_all = 0.0_CUSTOM_REAL
+    b_Usolidnorms_all = 0.0_CUSTOM_REAL
+    b_Usolidnormw_all = 0.0_CUSTOM_REAL
+
+    if( ELASTIC_SIMULATION ) then
+      ! way 2
+      if(GPU_MODE) then
+        call get_norm_elastic_from_device(b_Usolidnorm,Mesh_pointer,3)
+      else
+        b_Usolidnorm = maxval(sqrt(b_displ(1,:)**2 + b_displ(2,:)**2 + b_displ(3,:)**2))
+      endif
+      ! compute max of all slices
+      call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
+    endif
+    if( ACOUSTIC_SIMULATION ) then
+      ! way 2
+      if(GPU_MODE) then
+        call get_norm_acoustic_from_device(b_Usolidnormp,Mesh_pointer,3)
+      else
+        b_Usolidnormp = maxval(abs(b_potential_dot_dot_acoustic(:)))
+      endif
+      ! compute max of all slices
+      call max_all_cr(b_Usolidnormp,b_Usolidnormp_all)
+    endif
+    if( POROELASTIC_SIMULATION ) then
+      b_Usolidnorms = maxval(sqrt(b_displs_poroelastic(1,:)**2 + b_displs_poroelastic(2,:)**2 + &
+                                  b_displs_poroelastic(3,:)**2))
+      b_Usolidnormw = maxval(sqrt(b_displw_poroelastic(1,:)**2 + b_displw_poroelastic(2,:)**2 + &
+                                  b_displw_poroelastic(3,:)**2))
+      ! compute max of all slices
+      call max_all_cr(b_Usolidnorms,b_Usolidnorms_all)
+      call max_all_cr(b_Usolidnormw,b_Usolidnormw_all)
+    endif
+    ! check stability of the code, exit if unstable
+    ! negative values can occur with some compilers when the unstable value is greater
+    ! than the greatest possible floating-point number of the machine
+    !if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0.0_CUSTOM_REAL) &
+    !  call exit_MPI(myrank,'single backward simulation became unstable and blew up')
+  endif
+
+  ! user output
+  if(myrank == 0) then
+
+    write(IMAIN,*) 'Time step # ',it
+    write(IMAIN,*) 'Time: ',sngl((it-1)*DT-t0),' seconds'
+
+    ! elapsed time since beginning of the simulation
+    tCPU = wtime() - time_start
+    int_tCPU = int(tCPU)
+    ihours = int_tCPU / 3600
+    iminutes = (int_tCPU - 3600*ihours) / 60
+    iseconds = int_tCPU - 3600*ihours - 60*iminutes
+    write(IMAIN,*) 'Elapsed time in seconds = ',tCPU
+    write(IMAIN,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
+    write(IMAIN,*) 'Mean elapsed time per time step in seconds = ',sngl(tCPU/dble(it))
+
+    if( ELASTIC_SIMULATION ) &
+      write(IMAIN,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
+
+    if( ACOUSTIC_SIMULATION ) &
+      write(IMAIN,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all
+
+    if( POROELASTIC_SIMULATION ) then
+      write(IMAIN,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all
+      write(IMAIN,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all
+    endif
+
+    ! adjoint simulations
+    if (SIMULATION_TYPE == 3) then
+      if( ELASTIC_SIMULATION ) &
+        write(IMAIN,*) 'Max norm displacement vector U (backward) in all slices (m) = ',b_Usolidnorm_all
+      if( ACOUSTIC_SIMULATION ) &
+        write(IMAIN,*) 'Max norm pressure P (backward) in all slices (Pa) = ',b_Usolidnormp_all
+      if( POROELASTIC_SIMULATION ) then
+        write(IMAIN,*) 'Max norm displacement vector Us (backward) in all slices (m) = ',b_Usolidnorms_all
+        write(IMAIN,*) 'Max norm displacement vector W (backward) in all slices (m) = ',b_Usolidnormw_all
+      endif
+    endif
+
+    ! compute estimated remaining simulation time
+    t_remain = (NSTEP - it) * (tCPU/dble(it))
+    int_t_remain = int(t_remain)
+    ihours_remain = int_t_remain / 3600
+    iminutes_remain = (int_t_remain - 3600*ihours_remain) / 60
+    iseconds_remain = int_t_remain - 3600*ihours_remain - 60*iminutes_remain
+    write(IMAIN,*) 'Time steps done = ',it,' out of ',NSTEP
+    write(IMAIN,*) 'Time steps remaining = ',NSTEP - it
+    write(IMAIN,*) 'Estimated remaining time in seconds = ',sngl(t_remain)
+    write(IMAIN,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
+             ihours_remain,iminutes_remain,iseconds_remain
+
+    ! compute estimated total simulation time
+    t_total = t_remain + tCPU
+    int_t_total = int(t_total)
+    ihours_total = int_t_total / 3600
+    iminutes_total = (int_t_total - 3600*ihours_total) / 60
+    iseconds_total = int_t_total - 3600*ihours_total - 60*iminutes_total
+    write(IMAIN,*) 'Estimated total run time in seconds = ',sngl(t_total)
+    write(IMAIN,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
+             ihours_total,iminutes_total,iseconds_total
+    write(IMAIN,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that'
+
+    if(it < 100) then
+      write(IMAIN,*) '************************************************************'
+      write(IMAIN,*) '**** BEWARE: the above time estimates are not reliable'
+      write(IMAIN,*) '**** because fewer than 100 iterations have been performed'
+      write(IMAIN,*) '************************************************************'
+    endif
+    write(IMAIN,*)
+
+    ! flushes file buffer for main output file (IMAIN)
+    call flush_IMAIN()
+
+    ! write time stamp file to give information about progression of simulation
+    write(outputname,"('/timestamp',i6.6)") it
+    open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown')
+    write(IOUT,*) 'Time step # ',it
+    write(IOUT,*) 'Time: ',sngl((it-1)*DT-t0),' seconds'
+    write(IOUT,*) 'Elapsed time in seconds = ',tCPU
+    write(IOUT,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
+    write(IOUT,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it)
+
+    if( ELASTIC_SIMULATION ) &
+      write(IOUT,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
+
+    if( ACOUSTIC_SIMULATION ) &
+      write(IOUT,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all
+
+    if( POROELASTIC_SIMULATION ) then
+      write(IOUT,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all
+      write(IOUT,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all
+    endif
+
+    ! adjoint simulations
+    if (SIMULATION_TYPE == 3) then
+      if( ELASTIC_SIMULATION ) &
+        write(IOUT,*) 'Max norm displacement vector U (backward) in all slices (m) = ',b_Usolidnorm_all
+      if( ACOUSTIC_SIMULATION ) &
+        write(IOUT,*) 'Max norm pressure P (backward) in all slices (Pa) = ',b_Usolidnormp_all
+      if( POROELASTIC_SIMULATION ) then
+        write(IOUT,*) 'Max norm displacement vector Us (backward) in all slices (m) = ',b_Usolidnorms_all
+        write(IOUT,*) 'Max norm displacement vector W (backward) in all slices (m) = ',b_Usolidnormw_all
+      endif
+    endif
+
+    ! estimation
+    write(IOUT,*) 'Time steps done = ',it,' out of ',NSTEP
+    write(IOUT,*) 'Time steps remaining = ',NSTEP - it
+    write(IOUT,*) 'Estimated remaining time in seconds = ',t_remain
+    write(IOUT,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
+             ihours_remain,iminutes_remain,iseconds_remain
+    write(IOUT,*) 'Estimated total run time in seconds = ',t_total
+    write(IOUT,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
+             ihours_total,iminutes_total,iseconds_total
+    write(IOUT,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that'
+    close(IOUT)
+
+    ! check stability of the code, exit if unstable
+    ! negative values can occur with some compilers when the unstable value is greater
+    ! than the greatest possible floating-point number of the machine
+    if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0.0_CUSTOM_REAL &
+     .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0.0_CUSTOM_REAL &
+     .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0.0_CUSTOM_REAL &
+     .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0.0_CUSTOM_REAL) &
+        call exit_MPI(myrank,'forward simulation became unstable and blew up')
+
+    ! adjoint simulations
+    if( SIMULATION_TYPE == 3 ) then
+      if( b_Usolidnorm_all > STABILITY_THRESHOLD .or. b_Usolidnorm_all < 0.0_CUSTOM_REAL &
+        .or. b_Usolidnormp_all > STABILITY_THRESHOLD .or. b_Usolidnormp_all < 0.0_CUSTOM_REAL &
+        .or. b_Usolidnorms_all > STABILITY_THRESHOLD .or. b_Usolidnorms_all < 0.0_CUSTOM_REAL &
+        .or. b_Usolidnormw_all > STABILITY_THRESHOLD .or. b_Usolidnormw_all < 0.0_CUSTOM_REAL ) &
+        call exit_MPI(myrank,'backward simulation became unstable and blew up')
+    endif
+
+  endif ! myrank
+
+  end subroutine check_stability
+
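
For illustration only (not part of this patch): a serial sketch of the two checks performed by check_stability() above -- the maximum displacement norm against a blow-up threshold, and the split of an elapsed time into hh:mm:ss. The MPI reduction over slices (max_all_cr), the GPU path and the timestamp file are omitted; the array sizes, the threshold value and the elapsed time below are made up.

  program stability_check_sketch
    implicit none
    integer, parameter :: NDIM = 3, NGLOB = 5
    double precision, parameter :: STABILITY_THRESHOLD = 1.d25   ! assumed value
    double precision :: displ(NDIM,NGLOB)
    double precision :: Usolidnorm, tCPU
    integer :: int_tCPU, ihours, iminutes, iseconds

    call random_number(displ)

    ! maximum of the norm of the displacement over all points of this slice
    Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))

    ! per the comment in check_stability.f90, negative values can occur with some
    ! compilers once the field exceeds the largest representable number
    if (Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0.d0) then
      stop 'simulation became unstable and blew up'
    endif

    ! split an elapsed time in seconds into hours, minutes and seconds
    tCPU = 7385.4d0
    int_tCPU = int(tCPU)
    ihours   = int_tCPU / 3600
    iminutes = (int_tCPU - 3600*ihours) / 60
    iseconds = int_tCPU - 3600*ihours - 60*iminutes
    write(*,"(' elapsed time = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours, iminutes, iseconds
  end program stability_check_sketch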

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_acoustic.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_acoustic.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -265,9 +265,10 @@
                ! reads in **sta**.**net**.**LH**.adj files
                adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
                call compute_arrays_adjoint_source(myrank,adj_source_file, &
-                         xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                         adj_sourcearray, xigll,yigll,zigll, &
-                         it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+                                                  xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                                                  adj_sourcearray, xigll,yigll,zigll, &
+                                                  it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+
                do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
                  adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
                enddo
@@ -416,6 +417,9 @@
 ! adjoint sources in SU format
   integer,parameter :: nheader=240      ! 240 bytes
 
+  ! checks if anything to do
+  if( SIMULATION_TYPE /= 3 ) return
+
 ! plotting source time function
   if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then
     ! initializes total
@@ -451,7 +455,7 @@
 !           thus indexing is NSTEP - it , instead of NSTEP - it - 1
 
 ! adjoint simulations
-  if (SIMULATION_TYPE == 3 .and. nsources_local > 0) then
+  if( nsources_local > 0 ) then
 
      ! adds acoustic sources
      do isource = 1,NSOURCES
@@ -663,8 +667,7 @@
          ! only implements SIMTYPE=1 and NOISE_TOM=0
          ! write(*,*) "fortran dt = ", dt
          ! change dt -> DT
-         call compute_add_sources_ac_cuda(Mesh_pointer, phase_is_inner, &
-              NSOURCES, stf_pre_compute, myrank)
+         call compute_add_sources_ac_cuda(Mesh_pointer,phase_is_inner,NSOURCES,stf_pre_compute)
       endif
   endif
 
@@ -730,9 +733,10 @@
                ! reads in **sta**.**net**.**LH**.adj files
                adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
                call compute_arrays_adjoint_source(myrank,adj_source_file, &
-                         xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                         adj_sourcearray, xigll,yigll,zigll, &
-                         it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+                                                  xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                                                  adj_sourcearray, xigll,yigll,zigll, &
+                                                  it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+
                do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
                  adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
                enddo
@@ -783,7 +787,8 @@
         ! on GPU
         call add_sources_ac_sim_2_or_3_cuda(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
                                            ispec_is_inner,ispec_is_acoustic, &
-                                           ispec_selected_rec,myrank,nrec, &
+                                           ispec_selected_rec, &
+                                           nrec, &
                                            NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
                                            islice_selected_rec,nadj_rec_local, &
                                            NTSTEP_BETWEEN_READ_ADJSRC)
@@ -821,8 +826,7 @@
          enddo
          stf_used_total = stf_used_total + sum(stf_pre_compute(:))
          ! only implements SIMTYPE=3
-         call compute_add_sources_ac_s3_cuda(Mesh_pointer, phase_is_inner, &
-                NSOURCES,stf_pre_compute, myrank)
+         call compute_add_sources_ac_s3_cuda(Mesh_pointer,phase_is_inner,NSOURCES,stf_pre_compute)
       endif
   endif
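
For illustration only (not part of this patch): the guard-clause pattern introduced above for the backward/adjoint source routine -- return immediately when there is nothing to do (here, when the run is not a kernel simulation), so the body no longer repeats the SIMULATION_TYPE test. Names and messages below are hypothetical.

  module add_sources_sketch
    implicit none
  contains
    subroutine add_backward_sources(SIMULATION_TYPE, nsources_local)
      integer, intent(in) :: SIMULATION_TYPE, nsources_local
      ! checks if anything to do: backward sources exist only for kernel simulations
      if (SIMULATION_TYPE /= 3) return
      if (nsources_local > 0) then
        print *, 'adding ', nsources_local, ' backward source(s)'
      endif
    end subroutine add_backward_sources
  end module add_sources_sketch

  program guard_clause_demo
    use add_sources_sketch
    implicit none
    call add_backward_sources(1, 5)   ! forward run: returns immediately
    call add_backward_sources(3, 5)   ! kernel run: sources are added
  end program guard_clause_demo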
 

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_poroelastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_poroelastic.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_poroelastic.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -274,9 +274,10 @@
              ! reads in **sta**.**net**.**LH**.adj files
              adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
              call compute_arrays_adjoint_source(myrank,adj_source_file, &
-                       xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                       adj_sourcearray, xigll,yigll,zigll, &
-                       it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+                                                xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                                                adj_sourcearray, xigll,yigll,zigll, &
+                                                it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+
              do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
                adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
              enddo

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -102,6 +102,9 @@
   !equivalence (i2head,i4head,r4head)    ! share the same 240-byte-memory
   double precision :: hxir(NGLLX),hpxir(NGLLX),hetar(NGLLY),hpetar(NGLLY),hgammar(NGLLZ),hpgammar(NGLLZ)
 
+  ! some old tests (currently unstable; do not remove them though, we might fix this one day)
+  if (OLD_TEST_TO_FIX_ONE_DAY) return
+
 ! plotting source time function
   if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then
      ! initializes total
@@ -250,9 +253,10 @@
               ! reads in **sta**.**net**.**LH**.adj files
               adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
               call compute_arrays_adjoint_source(myrank,adj_source_file, &
-                    xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                    adj_sourcearray, xigll,yigll,zigll, &
-                    it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+                                                 xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                                                 adj_sourcearray, xigll,yigll,zigll, &
+                                                 it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+
               do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
                 adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
               enddo
@@ -355,37 +359,38 @@
   endif
 
   ! for noise simulations
-  ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
-  ! here, we only add those noise sources once, when we are calculating for boudanry points (phase_is_inner==.false.),
-  ! because boundary points are claculated first!
-  if( .not. phase_is_inner ) then
-    if ( NOISE_TOMOGRAPHY == 1 ) then
-       ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
-       ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
-       ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
-       ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
-        call add_source_master_rec_noise(myrank,nrec, &
-               NSTEP,accel,noise_sourcearray, &
-               ibool,islice_selected_rec,ispec_selected_rec, &
-               it,irec_master_noise, &
-               NSPEC_AB,NGLOB_AB)
-    else if ( NOISE_TOMOGRAPHY == 2 ) then
-       ! second step of noise tomography, i.e., read the surface movie saved at every timestep
-       ! use the movie to drive the ensemble forward wavefield
-       call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces,accel, &
-                              normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
-                              ibool,noise_surface_movie,NSTEP-it+1,NSPEC_AB,NGLOB_AB, &
-                              num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
-                              free_surface_jacobian2Dw)
-        ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
-        ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
-        ! note the ensemble forward sources are generally distributed on the surface of the earth
-        ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
-        ! therefore, we must add it here, before applying the inverse of mass matrix
+  if( NOISE_TOMOGRAPHY > 0 ) then
+    ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
+    ! here, we only add those noise sources once, when we are calculating for boundary points (phase_is_inner==.false.),
+    ! because boundary points are calculated first!
+    if( .not. phase_is_inner ) then
+      if ( NOISE_TOMOGRAPHY == 1 ) then
+         ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
+         ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
+         ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
+         ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
+          call add_source_master_rec_noise(myrank,nrec, &
+                 NSTEP,accel,noise_sourcearray, &
+                 ibool,islice_selected_rec,ispec_selected_rec, &
+                 it,irec_master_noise, &
+                 NSPEC_AB,NGLOB_AB)
+      else if ( NOISE_TOMOGRAPHY == 2 ) then
+         ! second step of noise tomography, i.e., read the surface movie saved at every timestep
+         ! use the movie to drive the ensemble forward wavefield
+         call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces,accel, &
+                                normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
+                                ibool,noise_surface_movie,NSTEP-it+1,NSPEC_AB,NGLOB_AB, &
+                                num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
+                                free_surface_jacobian2Dw)
+          ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
+          ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
+          ! note the ensemble forward sources are generally distributed on the surface of the earth
+          ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
+          ! therefore, we must add it here, before applying the inverse of mass matrix
+      endif
     endif
   endif
 
-
   end subroutine compute_add_sources_viscoelastic
 !
 !=====================================================================
@@ -443,6 +448,9 @@
 ! adjoint sources in SU format
   integer,parameter :: nheader=240      ! 240 bytes
 
+  ! some old tests (currently unstable; do not remove them though, we might fix this one day)
+  if (OLD_TEST_TO_FIX_ONE_DAY) return
+
 ! plotting source time function
   if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then
      ! initializes total
@@ -566,24 +574,25 @@
   endif
 
   ! for noise simulations
-  ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
-  ! here, we only add those noise sources once, when we are calculating for boudanry points (phase_is_inner==.false.),
-  ! because boundary points are claculated first!
-  if( .not. phase_is_inner ) then
-    if ( NOISE_TOMOGRAPHY == 3 ) then
-        ! third step of noise tomography, i.e., read the surface movie saved at every timestep
-        ! use the movie to reconstruct the ensemble forward wavefield
-        ! the ensemble adjoint wavefield is done as usual
-        ! note instead of "NSTEP-it+1", now we us "it", since reconstruction is a reversal of reversal
-        call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces,b_accel, &
-                              normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
-                              ibool,noise_surface_movie,it,NSPEC_AB,NGLOB_AB, &
-                              num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
-                              free_surface_jacobian2Dw)
+  if( NOISE_TOMOGRAPHY > 0 ) then
+    ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
+    ! here, we only add those noise sources once, when we are calculating for boundary points (phase_is_inner==.false.),
+    ! because boundary points are calculated first!
+    if( .not. phase_is_inner ) then
+      if ( NOISE_TOMOGRAPHY == 3 ) then
+          ! third step of noise tomography, i.e., read the surface movie saved at every timestep
+          ! use the movie to reconstruct the ensemble forward wavefield
+          ! the ensemble adjoint wavefield is done as usual
+          ! note instead of "NSTEP-it+1", now we use "it", since reconstruction is a reversal of reversal
+          call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces,b_accel, &
+                                normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
+                                ibool,noise_surface_movie,it,NSPEC_AB,NGLOB_AB, &
+                                num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
+                                free_surface_jacobian2Dw)
+      endif
     endif
   endif
 
-
   end subroutine compute_add_sources_viscoelastic_bpwf
 
 !
@@ -596,7 +605,8 @@
                         ispec_is_elastic,SIMULATION_TYPE,NSTEP, &
                         nrec,islice_selected_rec,ispec_selected_rec, &
                         nadj_rec_local,adj_sourcearrays, &
-                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY,Mesh_pointer)
+                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY, &
+                        Mesh_pointer)
 
   use specfem_par,only: PRINT_SOURCE_TIME_FUNCTION,stf_used_total, &
                         xigll,yigll,zigll,xi_receiver,eta_receiver,gamma_receiver,&
@@ -659,6 +669,9 @@
   !equivalence (i2head,i4head,r4head)    ! share the same 240-byte-memory
   double precision :: hxir(NGLLX),hpxir(NGLLX),hetar(NGLLY),hpetar(NGLLY),hgammar(NGLLZ),hpgammar(NGLLZ)
 
+  ! some old tests (currently unstable; do not remove them though, we might fix this one day)
+  if (OLD_TEST_TO_FIX_ONE_DAY) return
+
 ! plotting source time function
   if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then
      ! initializes total
@@ -667,7 +680,6 @@
 
   ! forward simulations
   if (SIMULATION_TYPE == 1 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then
-
     if( NSOURCES > 0 ) then
        do isource = 1,NSOURCES
           ! precomputes source time function factor
@@ -692,8 +704,7 @@
        ! only implements SIMTYPE=1 and NOISE_TOM=0
        ! write(*,*) "fortran dt = ", dt
        ! change dt -> DT
-       call compute_add_sources_el_cuda(Mesh_pointer, phase_is_inner, &
-                                       NSOURCES, stf_pre_compute, myrank)
+       call compute_add_sources_el_cuda(Mesh_pointer,stf_pre_compute,NSOURCES,phase_is_inner)
     endif
   endif ! forward
 
@@ -723,7 +734,6 @@
 
 ! adjoint simulations
   if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
-
     ! adds adjoint source in this partitions
     if( nadj_rec_local > 0 ) then
 
@@ -758,9 +768,10 @@
               ! reads in **sta**.**net**.**LH**.adj files
               adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
               call compute_arrays_adjoint_source(myrank,adj_source_file, &
-                    xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                    adj_sourcearray, xigll,yigll,zigll, &
-                    it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+                                                 xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                                                 adj_sourcearray, xigll,yigll,zigll, &
+                                                 it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+
               do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
                 adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
               enddo
@@ -822,7 +833,8 @@
       if( it < NSTEP ) then
         call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
                                             ispec_is_inner,ispec_is_elastic, &
-                                            ispec_selected_rec,myrank,nrec, &
+                                            ispec_selected_rec, &
+                                            nrec, &
                                             NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
                                             islice_selected_rec,nadj_rec_local, &
                                             NTSTEP_BETWEEN_READ_ADJSRC)
@@ -836,33 +848,30 @@
 
 ! adjoint simulations
   if (SIMULATION_TYPE == 3 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then
-
-     if( NSOURCES > 0 ) then
-        do isource = 1,NSOURCES
-           ! precomputes source time function factors
-           if(USE_FORCE_POINT_SOURCE) then
-              if( USE_RICKER_TIME_FUNCTION ) then
-                 stf_pre_compute(isource) = &
-                      comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
-              else
-                 stf_pre_compute(isource) = &
-                      comp_source_time_function_gauss(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_tiny(isource))
-              endif
-           else
-              if( USE_RICKER_TIME_FUNCTION ) then
-                 stf_pre_compute(isource) = &
-                      comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
-              else
-                 stf_pre_compute(isource) = &
-                      comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
-              endif
-           endif
-        enddo
-        ! only implements SIMTYPE=3
-        call compute_add_sources_el_s3_cuda(Mesh_pointer,stf_pre_compute, &
-             NSOURCES,phase_is_inner,myrank)
-
-     endif
+    if( NSOURCES > 0 ) then
+      do isource = 1,NSOURCES
+         ! precomputes source time function factors
+         if(USE_FORCE_POINT_SOURCE) then
+            if( USE_RICKER_TIME_FUNCTION ) then
+               stf_pre_compute(isource) = &
+                    comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
+            else
+               stf_pre_compute(isource) = &
+                    comp_source_time_function_gauss(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_tiny(isource))
+            endif
+         else
+            if( USE_RICKER_TIME_FUNCTION ) then
+               stf_pre_compute(isource) = &
+                    comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
+            else
+               stf_pre_compute(isource) = &
+                    comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
+            endif
+         endif
+      enddo
+      ! only implements SIMTYPE=3
+      call compute_add_sources_el_s3_cuda(Mesh_pointer,stf_pre_compute,NSOURCES,phase_is_inner)
+    endif
   endif ! adjoint
 
   ! master prints out source time function to file
@@ -873,35 +882,36 @@
   endif
 
   ! for noise simulations
-  ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
-  ! here, we only add those noise sources once, when we are calculating for boudanry points (phase_is_inner==.false.),
-  ! because boundary points are claculated first!
-  if( .not. phase_is_inner ) then
-    if ( NOISE_TOMOGRAPHY == 1 ) then
-       ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
-       ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
-       ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
-       ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
-       call add_source_master_rec_noise_cu(Mesh_pointer, myrank, it, irec_master_noise, islice_selected_rec)
-    else if ( NOISE_TOMOGRAPHY == 2 ) then
-       ! second step of noise tomography, i.e., read the surface movie saved at every timestep
-       ! use the movie to drive the ensemble forward wavefield
-        call noise_read_add_surface_movie_GPU(noise_surface_movie,NSTEP-it+1,num_free_surface_faces, &
-                                              Mesh_pointer,NOISE_TOMOGRAPHY)
-        ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
-        ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
-        ! note the ensemble forward sources are generally distributed on the surface of the earth
-        ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
-        ! therefore, we must add it here, before applying the inverse of mass matrix
-    else if ( NOISE_TOMOGRAPHY == 3 ) then
-        ! third step of noise tomography, i.e., read the surface movie saved at every timestep
-        ! use the movie to reconstruct the ensemble forward wavefield
-        ! the ensemble adjoint wavefield is done as usual
-        ! note instead of "NSTEP-it+1", now we us "it", since reconstruction is a reversal of reversal
-        call noise_read_add_surface_movie_GPU(noise_surface_movie,it,num_free_surface_faces, &
-                                              Mesh_pointer,NOISE_TOMOGRAPHY)
+  if( NOISE_TOMOGRAPHY > 0 ) then
+    ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
+    ! here, we only add those noise sources once, when we are calculating for boundary points (phase_is_inner==.false.),
+    ! because boundary points are calculated first!
+    if( .not. phase_is_inner ) then
+      if ( NOISE_TOMOGRAPHY == 1 ) then
+         ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
+         ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
+         ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
+         ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
+         call add_source_master_rec_noise_cu(Mesh_pointer,it,irec_master_noise,islice_selected_rec)
+      else if ( NOISE_TOMOGRAPHY == 2 ) then
+         ! second step of noise tomography, i.e., read the surface movie saved at every timestep
+         ! use the movie to drive the ensemble forward wavefield
+          call noise_read_add_surface_movie_GPU(noise_surface_movie,NSTEP-it+1,num_free_surface_faces, &
+                                                Mesh_pointer,NOISE_TOMOGRAPHY)
+          ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
+          ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
+          ! note the ensemble forward sources are generally distributed on the surface of the earth
+          ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
+          ! therefore, we must add it here, before applying the inverse of mass matrix
+      else if ( NOISE_TOMOGRAPHY == 3 ) then
+          ! third step of noise tomography, i.e., read the surface movie saved at every timestep
+          ! use the movie to reconstruct the ensemble forward wavefield
+          ! the ensemble adjoint wavefield is done as usual
+          ! note instead of "NSTEP-it+1", now we use "it", since reconstruction is a reversal of reversal
+          call noise_read_add_surface_movie_GPU(noise_surface_movie,it,num_free_surface_faces, &
+                                                Mesh_pointer,NOISE_TOMOGRAPHY)
+      endif
     endif
   endif
 
-
   end subroutine compute_add_sources_viscoelastic_GPU
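
For illustration only (not part of this patch): the movie-frame indexing used by the noise-tomography branches above. In step 2 the stored surface movie is read time-reversed (frame NSTEP-it+1) to drive the ensemble forward wavefield; in step 3 it is read with frame "it", since the reconstruction is a reversal of a reversal. The helper function name below is hypothetical.

  program noise_frame_index_sketch
    implicit none
    integer, parameter :: NSTEP = 10
    integer :: it
    do it = 1, NSTEP
      write(*,'(a,i3,a,i3,a,i3)') ' it =', it, &
            '   step-2 frame =', noise_movie_frame(2, it, NSTEP), &
            '   step-3 frame =', noise_movie_frame(3, it, NSTEP)
    enddo
  contains
    integer function noise_movie_frame(noise_step, it, NSTEP)
      integer, intent(in) :: noise_step, it, NSTEP
      select case (noise_step)
      case (2)
        noise_movie_frame = NSTEP - it + 1   ! ensemble forward source: read backwards
      case (3)
        noise_movie_frame = it               ! reconstruction: reversal of a reversal
      case default
        noise_movie_frame = 0                ! step 1 uses a point-force source, no movie
      end select
    end function noise_movie_frame
  end program noise_frame_index_sketch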

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_arrays_source.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_arrays_source.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_arrays_source.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -106,9 +106,9 @@
 !=============================================================================
 
   subroutine compute_arrays_adjoint_source(myrank, adj_source_file, &
-                    xi_receiver,eta_receiver,gamma_receiver, adj_sourcearray, &
-                    xigll,yigll,zigll, &
-                    it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+                                           xi_receiver,eta_receiver,gamma_receiver, adj_sourcearray, &
+                                           xigll,yigll,zigll, &
+                                           it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
 
   implicit none
 
@@ -132,7 +132,7 @@
   double precision :: hxir(NGLLX), hpxir(NGLLX), hetar(NGLLY), hpetar(NGLLY), &
         hgammar(NGLLZ), hpgammar(NGLLZ)
 
-  real(kind=CUSTOM_REAL) :: adj_src(NTSTEP_BETWEEN_READ_ADJSRC,NDIM)
+  real(kind=CUSTOM_REAL), dimension(NTSTEP_BETWEEN_READ_ADJSRC,NDIM) :: adj_src
 
   integer icomp, itime, i, j, k, ios, it_start, it_end
   double precision :: junk
@@ -149,8 +149,7 @@
   it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
   it_end   = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
 
-  !adj_sourcearray(:,:,:,:,:) = 0.
-  adj_src = 0._CUSTOM_REAL
+  adj_src(:,:) = 0._CUSTOM_REAL
 
   ! loops over components
   do icomp = 1, NDIM
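
For illustration only (not part of this patch): the chunked time window used by compute_arrays_adjoint_source above. Adjoint traces of length NSTEP are read in blocks of NTSTEP_BETWEEN_READ_ADJSRC steps; block number it_sub_adj covers the window [it_start, it_end], counted backwards from the end of the trace. The block sizes below are made up.

  program adjoint_chunk_window_sketch
    implicit none
    integer, parameter :: NSTEP = 1000, NTSTEP_BETWEEN_READ_ADJSRC = 250
    integer :: it_sub_adj, it_start, it_end
    do it_sub_adj = 1, NSTEP / NTSTEP_BETWEEN_READ_ADJSRC
      ! same index arithmetic as in the routine above
      it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
      it_end   = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
      write(*,'(a,i2,a,i5,a,i5)') ' block ', it_sub_adj, ' reads steps ', it_start, ' to ', it_end
    enddo
  end program adjoint_chunk_window_sketch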

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -295,6 +295,9 @@
   ! adjoint locals
   real(kind=CUSTOM_REAL) :: b_force_normal_comp
 
+  ! checks if anything to do
+  if (SIMULATION_TYPE /= 3) return
+
   !   initialize the updates
   updated_dof_ocean_load(:) = .false.
 
@@ -323,18 +326,16 @@
         ! we use the total force which includes the Coriolis term above
 
         ! adjoint simulations
-        if (SIMULATION_TYPE == 3) then
-          b_force_normal_comp = b_accel(1,iglob)*nx / rmassx(iglob) &
-                                + b_accel(2,iglob)*ny / rmassy(iglob) &
-                                + b_accel(3,iglob)*nz / rmassz(iglob)
+        b_force_normal_comp = b_accel(1,iglob)*nx / rmassx(iglob) &
+                              + b_accel(2,iglob)*ny / rmassy(iglob) &
+                              + b_accel(3,iglob)*nz / rmassz(iglob)
 
-          b_accel(1,iglob) = b_accel(1,iglob) &
-            + (rmass_ocean_load(iglob) - rmassx(iglob)) * b_force_normal_comp * nx
-          b_accel(2,iglob) = b_accel(2,iglob) &
-            + (rmass_ocean_load(iglob) - rmassy(iglob)) * b_force_normal_comp * ny
-          b_accel(3,iglob) = b_accel(3,iglob) &
-            + (rmass_ocean_load(iglob) - rmassz(iglob)) * b_force_normal_comp * nz
-        endif !adjoint
+        b_accel(1,iglob) = b_accel(1,iglob) &
+          + (rmass_ocean_load(iglob) - rmassx(iglob)) * b_force_normal_comp * nx
+        b_accel(2,iglob) = b_accel(2,iglob) &
+          + (rmass_ocean_load(iglob) - rmassy(iglob)) * b_force_normal_comp * ny
+        b_accel(3,iglob) = b_accel(3,iglob) &
+          + (rmass_ocean_load(iglob) - rmassz(iglob)) * b_force_normal_comp * nz
 
         ! done with this point
         updated_dof_ocean_load(iglob) = .true.
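
For illustration only (not part of this patch): the per-point ocean-load update applied in the hunk above. The normal component of the acceleration, with each component divided by its own mass matrix, is extracted and then re-added weighted by the difference between the ocean-load mass matrix and the regular one along the surface normal. All values below are made up; the routine's own comment notes that the total force includes the Coriolis term.

  program ocean_load_update_sketch
    implicit none
    double precision :: accel(3), n(3)
    double precision :: rmassx, rmassy, rmassz, rmass_ocean_load
    double precision :: force_normal_comp

    accel = (/ 1.d0, 2.d0, 3.d0 /)
    n     = (/ 0.d0, 0.d0, 1.d0 /)          ! unit normal at the free surface
    rmassx = 2.d0 ; rmassy = 2.d0 ; rmassz = 2.d0
    rmass_ocean_load = 2.5d0

    ! normal component of the acceleration, each component divided by its mass matrix
    force_normal_comp = accel(1)*n(1)/rmassx + accel(2)*n(2)/rmassy + accel(3)*n(3)/rmassz

    ! add the ocean-load correction along the normal direction
    accel(1) = accel(1) + (rmass_ocean_load - rmassx) * force_normal_comp * n(1)
    accel(2) = accel(2) + (rmass_ocean_load - rmassy) * force_normal_comp * n(2)
    accel(3) = accel(3) + (rmass_ocean_load - rmassz) * force_normal_comp * n(3)

    print *, 'updated acceleration = ', accel
  end program ocean_load_update_sketch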

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -186,7 +186,7 @@
     ! assemble all the contributions between slices using MPI
     if( phase_is_inner .eqv. .false. ) then
       ! sends potential_dot_dot_acoustic values to corresponding MPI interface neighbors (non-blocking)
-      call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
+      call assemble_MPI_scalar_async_send(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
                         buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
@@ -195,7 +195,7 @@
     else
 
       ! waits for send/receive requests to be completed and assembles values
-      call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
+      call assemble_MPI_scalar_async_recv(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
                         buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,&
                         max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
@@ -336,12 +336,15 @@
   integer:: iphase
   logical:: phase_is_inner
 
+  ! checks
+  if( SIMULATION_TYPE /= 3 ) &
+    call exit_MPI(myrank,'error calling compute_forces_acoustic_bpwf() with wrong SIMULATION_TYPE')
+
   ! adjoint simulations
-  if( SIMULATION_TYPE == 3 ) &
-    call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT,STACEY_INSTEAD_OF_FREE_SURFACE, &
-                        b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
-                        ibool,free_surface_ijk,free_surface_ispec, &
-                        num_free_surface_faces,ispec_is_acoustic)
+  call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT,STACEY_INSTEAD_OF_FREE_SURFACE, &
+                      b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
+                      ibool,free_surface_ijk,free_surface_ispec, &
+                      num_free_surface_faces,ispec_is_acoustic)
 
   ! distinguishes two runs: for elements on MPI interfaces, and elements within the partitions
   do iphase=1,2
@@ -354,29 +357,27 @@
     endif
 
     ! adjoint simulations
-    if( SIMULATION_TYPE == 3 ) then
-      if(USE_DEVILLE_PRODUCTS) then
-        ! uses Deville (2002) optimizations
-        call compute_forces_acoustic_Dev(iphase,NSPEC_ADJOINT,NGLOB_ADJOINT, &
-                        b_potential_acoustic,b_potential_dot_dot_acoustic, &
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-                        wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D, &
-                        rhostore,jacobian,ibool, &
-                        num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,&
-                        phase_ispec_inner_acoustic)
-      else
-        call compute_forces_acoustic_noDev(iphase,NSPEC_ADJOINT,NGLOB_ADJOINT, &
-                        b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        hprime_xx,hprime_yy,hprime_zz, &
-                        hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                        rhostore,jacobian,ibool,deltat, &
-                        num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,&
-                        phase_ispec_inner_acoustic,ELASTIC_SIMULATION,&
-                        .true.,potential_dot_dot_acoustic_interface)
-      endif
+    if(USE_DEVILLE_PRODUCTS) then
+      ! uses Deville (2002) optimizations
+      call compute_forces_acoustic_Dev(iphase,NSPEC_ADJOINT,NGLOB_ADJOINT, &
+                      b_potential_acoustic,b_potential_dot_dot_acoustic, &
+                      xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                      hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
+                      wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D, &
+                      rhostore,jacobian,ibool, &
+                      num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,&
+                      phase_ispec_inner_acoustic)
+    else
+      call compute_forces_acoustic_noDev(iphase,NSPEC_ADJOINT,NGLOB_ADJOINT, &
+                      b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
+                      xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                      hprime_xx,hprime_yy,hprime_zz, &
+                      hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+                      wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                      rhostore,jacobian,ibool,deltat, &
+                      num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,&
+                      phase_ispec_inner_acoustic,ELASTIC_SIMULATION,&
+                      .true.,potential_dot_dot_acoustic_interface)
     endif
 
     ! ! Stacey absorbing boundary conditions
@@ -394,25 +395,23 @@
     if(ELASTIC_SIMULATION ) then
       if( num_coupling_ac_el_faces > 0 ) then
         ! adjoint/kernel simulations
-        if( SIMULATION_TYPE == 3 ) &
-          call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, &
-                            ibool,b_displ,b_potential_dot_dot_acoustic, &
-                            num_coupling_ac_el_faces, &
-                            coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                            coupling_ac_el_normal, &
-                            coupling_ac_el_jacobian2Dw, &
-                            ispec_is_inner,phase_is_inner,&
-                            PML_CONDITIONS,spec_to_CPML,is_CPML,&
-                            potential_dot_dot_acoustic_interface,veloc,rmemory_coupling_ac_el_displ,&
-                            SIMULATION_TYPE,.true.,accel_interface)
+        call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, &
+                          ibool,b_displ,b_potential_dot_dot_acoustic, &
+                          num_coupling_ac_el_faces, &
+                          coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                          coupling_ac_el_normal, &
+                          coupling_ac_el_jacobian2Dw, &
+                          ispec_is_inner,phase_is_inner,&
+                          PML_CONDITIONS,spec_to_CPML,is_CPML,&
+                          potential_dot_dot_acoustic_interface,veloc,rmemory_coupling_ac_el_displ,&
+                          SIMULATION_TYPE,.true.,accel_interface)
       endif
     endif
 
 ! poroelastic coupling
     if(POROELASTIC_SIMULATION )  then
       if( num_coupling_ac_po_faces > 0 ) then
-        if( SIMULATION_TYPE == 3 ) &
-          stop 'not implemented yet'
+          stop 'coupling acoustic-poroelastic domains not implemented yet...'
       endif
     endif
 
@@ -429,32 +428,27 @@
     if( phase_is_inner .eqv. .false. ) then
       ! sends b_potential_dot_dot_acoustic values to corresponding MPI interface neighbors (non-blocking)
       ! adjoint simulations
-      if( SIMULATION_TYPE == 3 ) then
-        call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
-                        b_buffer_send_scalar_ext_mesh,b_buffer_recv_scalar_ext_mesh, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
-                        my_neighbours_ext_mesh, &
-                        b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh)
-      endif
+      call assemble_MPI_scalar_async_send(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
+                      b_buffer_send_scalar_ext_mesh,b_buffer_recv_scalar_ext_mesh, &
+                      num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                      nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
+                      my_neighbours_ext_mesh, &
+                      b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh)
 
     else
       ! adjoint simulations
-      if( SIMULATION_TYPE == 3 ) then
-        call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
-                        b_buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,&
-                        max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh)
-      endif
+      call assemble_MPI_scalar_async_recv(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
+                      b_buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,&
+                      max_nibool_interfaces_ext_mesh, &
+                      nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                      b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh)
     endif !phase_is_inner
 
   enddo
 
   ! divides pressure with mass matrix
   ! adjoint simulations
-  if (SIMULATION_TYPE == 3) &
-      b_potential_dot_dot_acoustic(:) = b_potential_dot_dot_acoustic(:) * rmass_acoustic(:)
+  b_potential_dot_dot_acoustic(:) = b_potential_dot_dot_acoustic(:) * rmass_acoustic(:)
 
 ! update velocity
 ! note: Newmark finite-difference time scheme with acoustic domains:
@@ -471,18 +465,17 @@
 !
 ! corrector:
 !   updates the chi_dot term which requires chi_dot_dot(t+delta)
+
   ! corrector
   ! adjoint simulations
-  if (SIMULATION_TYPE == 3) &
-      b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) + b_deltatover2*b_potential_dot_dot_acoustic(:)
+  b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) + b_deltatover2*b_potential_dot_dot_acoustic(:)
 
 ! enforces free surface (zeroes potentials at free surface)
   ! adjoint simulations
-  if (SIMULATION_TYPE == 3) &
-    call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT,STACEY_INSTEAD_OF_FREE_SURFACE, &
-                        b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
-                        ibool,free_surface_ijk,free_surface_ispec, &
-                        num_free_surface_faces,ispec_is_acoustic)
+  call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT,STACEY_INSTEAD_OF_FREE_SURFACE, &
+                      b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
+                      ibool,free_surface_ijk,free_surface_ispec, &
+                      num_free_surface_faces,ispec_is_acoustic)
 
 end subroutine compute_forces_acoustic_bpwf
 !
@@ -568,13 +561,9 @@
     ! assemble all the contributions between slices using MPI
     if( phase_is_inner .eqv. .false. ) then
       ! sends potential_dot_dot_acoustic values to corresponding MPI interface neighbors (non-blocking)
-      call transfer_boun_pot_from_device(NGLOB_AB, Mesh_pointer, &
+      call transfer_boun_pot_from_device(Mesh_pointer, &
                                          potential_dot_dot_acoustic, &
                                          buffer_send_scalar_ext_mesh, &
-                                         num_interfaces_ext_mesh, &
-                                         max_nibool_interfaces_ext_mesh, &
-                                         nibool_interfaces_ext_mesh, &
-                                         ibool_interfaces_ext_mesh, &
                                          1) ! <-- 1 == fwd accel
       call assemble_MPI_scalar_send_cuda(NPROC, &
                         buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
@@ -585,13 +574,9 @@
 
       ! adjoint simulations
       if( SIMULATION_TYPE == 3 ) then
-        call transfer_boun_pot_from_device(NGLOB_AB, Mesh_pointer, &
+        call transfer_boun_pot_from_device(Mesh_pointer, &
                                            b_potential_dot_dot_acoustic, &
                                            b_buffer_send_scalar_ext_mesh,&
-                                           num_interfaces_ext_mesh, &
-                                           max_nibool_interfaces_ext_mesh, &
-                                           nibool_interfaces_ext_mesh, &
-                                           ibool_interfaces_ext_mesh, &
                                            3) ! <-- 3 == adjoint b_accel
 
         call assemble_MPI_scalar_send_cuda(NPROC, &
@@ -608,8 +593,8 @@
       ! waits for send/receive requests to be completed and assembles values
       call assemble_MPI_scalar_write_cuda(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
                         Mesh_pointer,&
-                        buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,&
-                        max_nibool_interfaces_ext_mesh, &
+                        buffer_recv_scalar_ext_mesh, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, &
                         1)
@@ -618,10 +603,10 @@
       if( SIMULATION_TYPE == 3 ) then
         call assemble_MPI_scalar_write_cuda(NPROC,NGLOB_AB,b_potential_dot_dot_acoustic, &
                         Mesh_pointer, &
-                        b_buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh, &
-                        max_nibool_interfaces_ext_mesh, &
+                        b_buffer_recv_scalar_ext_mesh, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        b_request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, &
+                        b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh, &
                         3)
       endif
     endif !phase_is_inner
@@ -629,7 +614,7 @@
   enddo
 
  ! divides pressure with mass matrix
-  call kernel_3_a_acoustic_cuda(Mesh_pointer,NGLOB_AB)
+  call kernel_3_a_acoustic_cuda(Mesh_pointer)
 
 ! update velocity
 ! note: Newmark finite-difference time scheme with acoustic domains:
@@ -646,7 +631,7 @@
 !
 ! corrector:
 ! updates the chi_dot term which requires chi_dot_dot(t+delta)
-  call kernel_3_b_acoustic_cuda(Mesh_pointer,NGLOB_AB,deltatover2,b_deltatover2)
+  call kernel_3_b_acoustic_cuda(Mesh_pointer,deltatover2,b_deltatover2)
 
 ! enforces free surface (zeroes potentials at free surface)
   call acoustic_enforce_free_surf_cuda(Mesh_pointer,STACEY_INSTEAD_OF_FREE_SURFACE)

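The acoustic hunks above drop the redundant NGLOB_AB argument from the kernel_3 wrappers; the update they perform on the GPU is the Newmark corrector spelled out in the comments. A minimal standalone sketch of that corrector step for the acoustic potential, assuming only that the chi_dot / chi_dot_dot naming and the deltatover2 constant follow the solver's convention (an illustration, not the CUDA kernel):

  program newmark_acoustic_corrector_demo
    implicit none
    integer, parameter :: CUSTOM_REAL = kind(1.0)
    integer, parameter :: nglob = 5
    real(kind=CUSTOM_REAL) :: deltatover2
    real(kind=CUSTOM_REAL), dimension(nglob) :: potential_dot_acoustic, potential_dot_dot_acoustic

    deltatover2 = 0.5e-3_CUSTOM_REAL
    potential_dot_acoustic(:) = 1.0_CUSTOM_REAL
    potential_dot_dot_acoustic(:) = 2.0_CUSTOM_REAL

    ! corrector: chi_dot(t+delta_t) = chi_dot(t) + delta_t/2 * chi_dot_dot(t+delta_t)
    potential_dot_acoustic(:) = potential_dot_acoustic(:) + deltatover2*potential_dot_dot_acoustic(:)

    print *, potential_dot_acoustic(1)
  end program newmark_acoustic_corrector_demo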
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic_calling_routine.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic_calling_routine.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic_calling_routine.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -124,7 +124,7 @@
 
     else
       ! on GPU
-stop 'GPU for poroelastic simulation not implemented'
+      call exit_MPI(myrank,'GPU for poroelastic simulation not implemented')
     endif ! GPU_MODE
 
 ! adds poroelastic absorbing boundary terms to accelerations (type Stacey conditions)
@@ -260,12 +260,6 @@
                         b_request_send_vector_ext_meshw,b_request_recv_vector_ext_meshw)
       endif !adjoint
 
-    !! DK DK May 2009: removed this because now each slice of a CUBIT + SCOTCH mesh
-    !! DK DK May 2009: has a different number of spectral elements and therefore
-    !! DK DK May 2009: only the general non-blocking MPI routines assemble_MPI_vector_ext_mesh_s
-    !! DK DK May 2009: and assemble_MPI_vector_ext_mesh_w above can be used.
-    !! DK DK May 2009: For adjoint runs below (SIMULATION_TYPE == 3) they should be used as well.
-
     endif
   enddo
 

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -52,14 +52,15 @@
       phase_is_inner = .true.
     endif
 
-    ! elastic term
+! elastic term
     if(USE_DEVILLE_PRODUCTS) then
       ! uses Deville (2002) optimizations
       call compute_forces_viscoelastic_Dev_sim1(iphase)
 
     else
       ! no optimizations used
-      call compute_forces_viscoelastic_noDev(iphase,NSPEC_AB,NGLOB_AB,displ,veloc,accel, &
+      call compute_forces_viscoelastic_noDev(iphase,NSPEC_AB,NGLOB_AB, &
+                        displ,veloc,accel, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
                         hprime_xx,hprime_yy,hprime_zz, &
                         hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
@@ -84,7 +85,8 @@
                         dsdx_top,dsdx_bot, &
                         ispec2D_moho_top,ispec2D_moho_bot, &
                         num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic, &
-                        phase_ispec_inner_elastic,.false.,accel_interface,ACOUSTIC_SIMULATION)
+                        phase_ispec_inner_elastic,.false., &
+                        accel_interface,ACOUSTIC_SIMULATION)
 
     endif
 
@@ -117,7 +119,8 @@
                         coupling_ac_el_jacobian2Dw, &
                         ispec_is_inner,phase_is_inner,&
                         PML_CONDITIONS,is_CPML,potential_dot_dot_acoustic_interface,&
-                        SIMULATION_TYPE,.false.,accel_interface,&
+                        SIMULATION_TYPE,.false., &
+                        accel_interface,&
                         rmemory_coupling_el_ac_potential,spec_to_CPML,&
                         potential_acoustic,potential_dot_acoustic)
 
@@ -133,7 +136,8 @@
                               coupling_ac_el_jacobian2Dw, &
                               ispec_is_inner,phase_is_inner,&
                               PML_CONDITIONS,is_CPML,potential_dot_dot_acoustic_interface,&
-                              SIMULATION_TYPE,.false.,accel_interface,&
+                              SIMULATION_TYPE,.false., &
+                              accel_interface,&
                               rmemory_coupling_el_ac_potential,spec_to_CPML,&
                               potential_acoustic,potential_dot_acoustic)
 
@@ -167,7 +171,7 @@
     endif
 
 ! adds source term (single-force/moment-tensor solution)
-    if (.not. OLD_TEST_TO_FIX_ONE_DAY) call compute_add_sources_viscoelastic( NSPEC_AB,NGLOB_AB,accel, &
+    call compute_add_sources_viscoelastic( NSPEC_AB,NGLOB_AB,accel, &
                         ibool,ispec_is_inner,phase_is_inner, &
                         NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,&
                         hdur,hdur_gaussian,tshift_src,dt,t0,sourcearrays, &
@@ -179,7 +183,7 @@
     ! assemble all the contributions between slices using MPI
     if( phase_is_inner .eqv. .false. ) then
        ! sends accel values to corresponding MPI interface neighbors
-       call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,accel, &
+       call assemble_MPI_vector_async_send(NPROC,NGLOB_AB,accel, &
                buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
                num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
@@ -188,7 +192,7 @@
 
     else
       ! waits for send/receive requests to be completed and assembles values
-      call assemble_MPI_vector_ext_mesh_w_ordered(NPROC,NGLOB_AB,accel, &
+      call assemble_MPI_vector_async_w_ord(NPROC,NGLOB_AB,accel, &
                             buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
                             max_nibool_interfaces_ext_mesh, &
                             nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
@@ -201,7 +205,6 @@
 !Percy , Fault boundary term B*tau is added to the assembled forces
 !        which at this point are stored in the array 'accel'
   if (SIMULATION_TYPE_DYN) call bc_dynflt_set3d_all(accel,veloc,displ)
-
   if (SIMULATION_TYPE_KIN) call bc_kinflt_set_all(accel,veloc,displ)
 
  ! multiplies with inverse of mass matrix (note: rmass has been inverted already)
@@ -229,37 +232,36 @@
   endif
 
   ! C-PML boundary
-    if(PML_CONDITIONS)then
-       do iface=1,num_abs_boundary_faces
-           ispec = abs_boundary_ispec(iface)
+  if( PML_CONDITIONS ) then
+    do iface=1,num_abs_boundary_faces
+      ispec = abs_boundary_ispec(iface)
 !!! It is better to move this into do iphase=1,2 loop
 !!!        if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
-              if( ispec_is_elastic(ispec) .and. is_CPML(ispec)) then
-                 ! reference gll points on boundary face
-                 do igll = 1,NGLLSQUARE
+      if( ispec_is_elastic(ispec) .and. is_CPML(ispec)) then
+        ! reference gll points on boundary face
+        do igll = 1,NGLLSQUARE
+          ! gets local indices for GLL point
+          i = abs_boundary_ijk(1,igll,iface)
+          j = abs_boundary_ijk(2,igll,iface)
+          k = abs_boundary_ijk(3,igll,iface)
 
-                    ! gets local indices for GLL point
-                    i = abs_boundary_ijk(1,igll,iface)
-                    j = abs_boundary_ijk(2,igll,iface)
-                    k = abs_boundary_ijk(3,igll,iface)
+          iglob=ibool(i,j,k,ispec)
 
-                    iglob=ibool(i,j,k,ispec)
+          accel(:,iglob) = 0.0
+          veloc(:,iglob) = 0.0
+          displ(:,iglob) = 0.0
 
-                    accel(:,iglob) = 0.0
-                    veloc(:,iglob) = 0.0
-                    displ(:,iglob) = 0.0
+          if(SIMULATION_TYPE ==3)then
+            if(ACOUSTIC_SIMULATION)then
+              accel_interface(:,iglob) = 0.0
+            endif
+          endif
 
-                   if(SIMULATION_TYPE ==3)then
-                     if(ACOUSTIC_SIMULATION)then
-                       accel_interface(:,iglob) = 0.0
-                     endif
-                   endif
-
-                 enddo
-             endif ! ispec_is_elastic
+        enddo
+      endif ! ispec_is_elastic
 !!!        endif
-        enddo
-      endif
+    enddo
+  endif
 
 ! updates velocities
 ! Newmark finite-difference time scheme with elastic domains:
@@ -279,9 +281,9 @@
 !   updates the velocity term which requires a(t+delta)
   veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
 
-  if(PML_CONDITIONS)then
-    if(SIMULATION_TYPE == 1 .and. SAVE_FORWARD)then
-      if(nglob_interface_PML_elastic > 0)then
+  if( PML_CONDITIONS ) then
+    if( SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
+      if( nglob_interface_PML_elastic > 0 ) then
         call save_field_on_pml_interface(displ,veloc,accel,nglob_interface_PML_elastic,&
                                          b_PML_field,b_reclen_PML_field)
       endif
@@ -289,10 +291,12 @@
   endif
 
 end subroutine compute_forces_viscoelastic
+
 !
 !=====================================================================
+!
 
-! elastic solver for back
+! elastic solver for backward/reconstructed wavefields
 
 subroutine compute_forces_viscoelastic_bpwf()
 
@@ -309,7 +313,11 @@
   integer:: iphase
   logical:: phase_is_inner
 
-! distinguishes two runs: for points on MPI interfaces, and points within the partitions
+  ! checks
+  if( SIMULATION_TYPE /= 3 ) &
+    call exit_MPI(myrank,'error calling compute_forces_viscoelastic_bpwf() with wrong SIMULATION_TYPE')
+
+  ! distinguishes two runs: for points on MPI interfaces, and points within the partitions
   do iphase=1,2
 
     !first for points on MPI interfaces
@@ -322,15 +330,13 @@
 
 ! elastic term
     if(USE_DEVILLE_PRODUCTS) then
-
       ! adjoint simulations: backward/reconstructed wavefield
-      if( SIMULATION_TYPE == 3 ) call compute_forces_viscoelastic_Dev_sim3(iphase)
+      call compute_forces_viscoelastic_Dev_sim3(iphase)
 
     else
       ! no optimizations used
       ! adjoint simulations: backward/reconstructed wavefield
-      if( SIMULATION_TYPE == 3 ) &
-        call compute_forces_viscoelastic_noDev(iphase,NSPEC_AB,NGLOB_AB, &
+      call compute_forces_viscoelastic_noDev(iphase,NSPEC_AB,NGLOB_AB, &
                         b_displ,b_veloc,b_accel, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
                         hprime_xx,hprime_yy,hprime_zz, &
@@ -356,8 +362,8 @@
                         b_dsdx_top,b_dsdx_bot, &
                         ispec2D_moho_top,ispec2D_moho_bot, &
                         num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic, &
-                        phase_ispec_inner_elastic,.true.,accel_interface,ACOUSTIC_SIMULATION)
-
+                        phase_ispec_inner_elastic,.true., &
+                        accel_interface,ACOUSTIC_SIMULATION)
     endif
 
 
@@ -377,8 +383,7 @@
     if( ACOUSTIC_SIMULATION ) then
       if( num_coupling_ac_el_faces > 0 ) then
          ! adjoint simulations
-         if( SIMULATION_TYPE == 3 ) &
-           call compute_coupling_viscoelastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
+          call compute_coupling_viscoelastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
                         ibool,b_accel,b_potential_dot_dot_acoustic, &
                         num_coupling_ac_el_faces, &
                         coupling_ac_el_ispec,coupling_ac_el_ijk, &
@@ -386,7 +391,8 @@
                         coupling_ac_el_jacobian2Dw, &
                         ispec_is_inner,phase_is_inner,&
                         PML_CONDITIONS,is_CPML,potential_dot_dot_acoustic_interface,&
-                        SIMULATION_TYPE,.true.,accel_interface,&
+                        SIMULATION_TYPE,.true., &
+                        accel_interface,&
                         rmemory_coupling_el_ac_potential,spec_to_CPML,&
                         potential_acoustic,potential_dot_acoustic)
 
@@ -400,7 +406,7 @@
     endif
 
 ! adds source term (single-force/moment-tensor solution)
-    if (.not. OLD_TEST_TO_FIX_ONE_DAY) call compute_add_sources_viscoelastic_bpwf( NSPEC_AB,NGLOB_AB, &
+    call compute_add_sources_viscoelastic_bpwf( NSPEC_AB,NGLOB_AB, &
                         ibool,ispec_is_inner,phase_is_inner, &
                         NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,&
                         hdur,hdur_gaussian,tshift_src,dt,t0,sourcearrays, &
@@ -411,45 +417,39 @@
     if( phase_is_inner .eqv. .false. ) then
        ! sends accel values to corresponding MPI interface neighbors
        ! adjoint simulations
-       if( SIMULATION_TYPE == 3 ) then
-          call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_accel, &
+       call assemble_MPI_vector_async_send(NPROC,NGLOB_ADJOINT,b_accel, &
                   b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
                   num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                   nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
                   my_neighbours_ext_mesh, &
                   b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
-       endif !adjoint
-
     else
       ! waits for send/receive requests to be completed and assembles values
       ! adjoint simulations
-      if( SIMULATION_TYPE == 3 ) then
-         call assemble_MPI_vector_ext_mesh_w_ordered(NPROC,NGLOB_ADJOINT,b_accel, &
+      call assemble_MPI_vector_async_w_ord(NPROC,NGLOB_ADJOINT,b_accel, &
                              b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
                              max_nibool_interfaces_ext_mesh, &
                              nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                              b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh, &
                              my_neighbours_ext_mesh,myrank)
-      endif !adjoint
-
     endif
 
   enddo
 
   ! multiplies with inverse of mass matrix (note: rmass has been inverted already)
   ! adjoint simulations
-  if (SIMULATION_TYPE == 3) then
-     b_accel(1,:) = b_accel(1,:)*rmassx(:)
-     b_accel(2,:) = b_accel(2,:)*rmassy(:)
-     b_accel(3,:) = b_accel(3,:)*rmassz(:)
-  endif !adjoint
+  b_accel(1,:) = b_accel(1,:)*rmassx(:)
+  b_accel(2,:) = b_accel(2,:)*rmassy(:)
+  b_accel(3,:) = b_accel(3,:)*rmassz(:)
 
 ! updates acceleration with ocean load term
   if(APPROXIMATE_OCEAN_LOAD) then
     call compute_coupling_ocean_bpwf(NSPEC_AB,NGLOB_AB, &
-                                     ibool,rmassx,rmassy,rmassz,rmass_ocean_load, &
+                                     ibool,rmassx,rmassy,rmassz, &
+                                     rmass_ocean_load, &
                                      free_surface_normal,free_surface_ijk,free_surface_ispec, &
-                                     num_free_surface_faces,SIMULATION_TYPE, &
+                                     num_free_surface_faces, &
+                                     SIMULATION_TYPE, &
                                      NGLOB_ADJOINT,b_accel)
   endif
 
@@ -471,9 +471,10 @@
 ! corrector:
 !   updates the velocity term which requires a(t+delta)
   ! adjoint simulations
-  if (SIMULATION_TYPE == 3) b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
+  b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
 
 end subroutine compute_forces_viscoelastic_bpwf
+
 !
 !-------------------------------------------------------------------------------------------------
 !
@@ -488,9 +489,7 @@
 ! forward simulations
 
   use specfem_par
-  use specfem_par_acoustic
   use specfem_par_elastic
-  use specfem_par_poroelastic
 
   implicit none
 
@@ -583,9 +582,7 @@
 ! uses backward/reconstructed displacement and acceleration arrays
 
   use specfem_par
-  use specfem_par_acoustic
   use specfem_par_elastic
-  use specfem_par_poroelastic
 
   implicit none
 
@@ -650,7 +647,11 @@
   integer:: iphase
   logical:: phase_is_inner
 
-! distinguishes two runs: for points on MPI interfaces, and points within the partitions
+  ! check
+  if( PML_CONDITIONS ) &
+    call exit_MPI(myrank,'PML conditions not yet implemented for routine compute_forces_viscoelastic_GPU()')
+
+  ! distinguishes two runs: for points on MPI interfaces, and points within the partitions
   do iphase=1,2
 
     !first for points on MPI interfaces
@@ -660,21 +661,20 @@
       phase_is_inner = .true.
     endif
 
-! elastic term
-   ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
-   call compute_forces_viscoelastic_cuda(Mesh_pointer, iphase, deltat, &
-                                         nspec_outer_elastic, &
-                                         nspec_inner_elastic, &
-                                         COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
+    ! elastic term
+    ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
+    call compute_forces_viscoelastic_cuda(Mesh_pointer, iphase, deltat, &
+                                          nspec_outer_elastic, &
+                                          nspec_inner_elastic, &
+                                          COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
 
-   if(phase_is_inner .eqv. .true.) then
-      ! while Inner elements compute "Kernel_2", we wait for MPI to
-      ! finish and transfer the boundary terms to the device
-      ! asynchronously
-
+    ! while inner elements compute "Kernel_2", we wait for MPI to
+    ! finish and transfer the boundary terms to the device asynchronously
+    if(phase_is_inner .eqv. .true.) then
       !daniel: todo - this avoids calling the fortran vector send from CUDA routine
       ! wait for asynchronous copy to finish
       call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_vector_ext_mesh)
+
       ! sends mpi buffers
       call assemble_MPI_vector_send_cuda(NPROC, &
                   buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
@@ -687,28 +687,35 @@
       call transfer_boundary_to_device(NPROC,Mesh_pointer,buffer_recv_vector_ext_mesh, &
                   num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                   request_recv_vector_ext_mesh)
-   endif ! inner elements
+    endif ! inner elements
 
-! adds elastic absorbing boundary term to acceleration (Stacey conditions)
+    ! adds elastic absorbing boundary term to acceleration (Stacey conditions)
     if( STACEY_ABSORBING_CONDITIONS ) then
-       call compute_stacey_viscoelastic_GPU(phase_is_inner,num_abs_boundary_faces, &
-                        SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
-                        b_num_abs_boundary_faces,b_reclen_field,b_absorb_field, &
-                        Mesh_pointer,it_dsm,Veloc_dsm_boundary,Tract_dsm_boundary)
+      call compute_stacey_viscoelastic_GPU(phase_is_inner,num_abs_boundary_faces, &
+                                           SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
+                                           b_num_abs_boundary_faces,b_reclen_field,b_absorb_field, &
+                                           Mesh_pointer, &
+                                           it_dsm,Veloc_dsm_boundary,Tract_dsm_boundary)
     endif
 
-
-! acoustic coupling
+    ! acoustic coupling
     if( ACOUSTIC_SIMULATION ) then
       if( num_coupling_ac_el_faces > 0 ) then
         call compute_coupling_el_ac_cuda(Mesh_pointer,phase_is_inner, &
                                          num_coupling_ac_el_faces)
-      endif ! num_coupling_ac_el_faces
+      endif
     endif
 
+    ! poroelastic coupling
+    if( POROELASTIC_SIMULATION ) then
+      ! note:
+      ! these routines are not implemented as CUDA kernels; we just transfer the fields
+      ! from the GPU to the CPU and vice versa
 
-! poroelastic coupling
-    if( POROELASTIC_SIMULATION ) then
+      ! transfers displacement & acceleration to the CPU
+      call transfer_displ_from_device(NDIM*NGLOB_AB,displ, Mesh_pointer)
+      call transfer_accel_from_device(NDIM*NGLOB_AB,accel, Mesh_pointer)
+
       call compute_coupling_viscoelastic_po(NSPEC_AB,NGLOB_AB,ibool,&
                         displs_poroelastic,displw_poroelastic,&
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
@@ -728,94 +735,105 @@
                         coupling_el_po_normal, &
                         coupling_el_po_jacobian2Dw, &
                         ispec_is_inner,phase_is_inner)
+
+      ! transfers acceleration back to GPU
+      call transfer_accel_to_device(NDIM*NGLOB_AB,accel, Mesh_pointer)
     endif
 
-! adds source term (single-force/moment-tensor solution)
-    if (.not. OLD_TEST_TO_FIX_ONE_DAY) call compute_add_sources_viscoelastic_GPU(NSPEC_AB, &
+    ! adds source term (single-force/moment-tensor solution)
+    call compute_add_sources_viscoelastic_GPU(NSPEC_AB, &
                         ispec_is_inner,phase_is_inner,NSOURCES,myrank,it,&
                         hdur,hdur_gaussian,tshift_src,dt,t0, &
                         ispec_is_elastic,SIMULATION_TYPE,NSTEP, &
                         nrec,islice_selected_rec,ispec_selected_rec, &
                         nadj_rec_local,adj_sourcearrays, &
-                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY,Mesh_pointer)
+                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY, &
+                        Mesh_pointer)
 
     ! assemble all the contributions between slices using MPI
     if( phase_is_inner .eqv. .false. ) then
-       ! sends accel values to corresponding MPI interface neighbors
-       ! GPU_MODE == 1
-       ! transfers boundary region to host asynchronously. The
-       ! MPI-send is done from within compute_forces_viscoelastic_cuda,
-       ! once the inner element kernels are launched, and the
-       ! memcpy has finished. see compute_forces_viscoelastic_cuda:1655
-       call transfer_boundary_from_device_a(Mesh_pointer,nspec_outer_elastic)
+      ! sends accel values to corresponding MPI interface neighbors
 
-       ! adjoint simulations
-       if( SIMULATION_TYPE == 3 ) then
-          call transfer_boun_accel_from_device(NGLOB_AB*NDIM, Mesh_pointer, b_accel,&
-                       b_buffer_send_vector_ext_mesh,&
-                       num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh,&
-                       nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh,3) ! <-- 3 == adjoint b_accel
-          call assemble_MPI_vector_send_cuda(NPROC, &
-                      b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
-                      num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                      nibool_interfaces_ext_mesh,&
-                      my_neighbours_ext_mesh, &
-                      b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
-       endif !adjoint
+      ! transfers boundary region to host asynchronously. The
+      ! MPI-send is done from within compute_forces_viscoelastic_cuda,
+      ! once the inner element kernels are launched, and the
+      ! memcpy has finished. see compute_forces_viscoelastic_cuda: ~ line 1655
+      call transfer_boundary_from_device_a(Mesh_pointer,nspec_outer_elastic)
 
+      ! adjoint simulations
+      if( SIMULATION_TYPE == 3 ) then
+        call transfer_boun_accel_from_device(Mesh_pointer, b_accel,&
+                        b_buffer_send_vector_ext_mesh,&
+                        3) ! <-- 3 == adjoint b_accel
+        call assemble_MPI_vector_send_cuda(NPROC, &
+                        b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,&
+                        my_neighbours_ext_mesh, &
+                        b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
+      endif !adjoint
+
     else
       ! waits for send/receive requests to be completed and assembles values
-      ! GPU_MODE == 1
       call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,accel, Mesh_pointer,&
                       buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
                       max_nibool_interfaces_ext_mesh, &
                       nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                      request_send_vector_ext_mesh,request_recv_vector_ext_mesh,1)
+                      request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
+                      1)
       ! adjoint simulations
       if( SIMULATION_TYPE == 3 ) then
-         call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,b_accel, Mesh_pointer,&
+        call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,b_accel, Mesh_pointer,&
                               b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
                               max_nibool_interfaces_ext_mesh, &
                               nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                              b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh,3)
+                              b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh, &
+                              3)
       endif !adjoint
-
     endif
 
- enddo
+  enddo
 
 !Percy , Fault boundary term B*tau is added to the assembled forces
 !        which at this point are stored in the array 'accel'
-  if (SIMULATION_TYPE_DYN) call bc_dynflt_set3d_all(accel,veloc,displ)
+  if (SIMULATION_TYPE_DYN .or. SIMULATION_TYPE_KIN ) then
+    ! transfers wavefields to the CPU
+    call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc,accel, Mesh_pointer)
 
-  if (SIMULATION_TYPE_KIN) call bc_kinflt_set_all(accel,veloc,displ)
+    ! adds fault boundary term (dynamic or kinematic rupture)
+    if (SIMULATION_TYPE_DYN) call bc_dynflt_set3d_all(accel,veloc,displ)
+    if (SIMULATION_TYPE_KIN) call bc_kinflt_set_all(accel,veloc,displ)
 
+    ! transfers acceleration back to GPU
+    call transfer_accel_to_device(NDIM*NGLOB_AB,accel, Mesh_pointer)
+  endif
+
  ! multiplies with inverse of mass matrix (note: rmass has been inverted already)
- call kernel_3_a_cuda(Mesh_pointer, NGLOB_AB, deltatover2,b_deltatover2,APPROXIMATE_OCEAN_LOAD)
+ call kernel_3_a_cuda(Mesh_pointer,deltatover2,b_deltatover2,APPROXIMATE_OCEAN_LOAD)
 
 ! updates acceleration with ocean load term
   if(APPROXIMATE_OCEAN_LOAD) then
     call compute_coupling_ocean_cuda(Mesh_pointer)
+
+    ! updates velocities
+    ! Newmark finite-difference time scheme with elastic domains:
+    ! (see e.g. Hughes, 1987; Chaljub et al., 2003)
+    !
+    ! u(t+delta_t) = u(t) + delta_t  v(t) + 1/2  delta_t**2 a(t)
+    ! v(t+delta_t) = v(t) + 1/2 delta_t a(t) + 1/2 delta_t a(t+delta_t)
+    ! a(t+delta_t) = 1/M_elastic ( -K_elastic u(t+delta) + B_elastic chi_dot_dot(t+delta_t) + f( t+delta_t) )
+    !
+    ! where
+    !   u, v, a are displacement,velocity & acceleration
+    !   M is mass matrix, K stiffness matrix and B boundary term for acoustic/elastic domains
+    !   f denotes a source term (acoustic/elastic)
+    !   chi_dot_dot is acoustic (fluid) potential ( dotted twice with respect to time)
+    !
+    ! corrector:
+    ! updates the velocity term which requires a(t+delta)
+    ! GPU_MODE: this is handled in 'kernel_3' at the same time as accel*rmass
+    call kernel_3_b_cuda(Mesh_pointer,deltatover2,b_deltatover2)
   endif
 
-! updates velocities
-! Newmark finite-difference time scheme with elastic domains:
-! (see e.g. Hughes, 1987; Chaljub et al., 2003)
-!
-! u(t+delta_t) = u(t) + delta_t  v(t) + 1/2  delta_t**2 a(t)
-! v(t+delta_t) = v(t) + 1/2 delta_t a(t) + 1/2 delta_t a(t+delta_t)
-! a(t+delta_t) = 1/M_elastic ( -K_elastic u(t+delta) + B_elastic chi_dot_dot(t+delta_t) + f( t+delta_t) )
-!
-! where
-!   u, v, a are displacement,velocity & acceleration
-!   M is mass matrix, K stiffness matrix and B boundary term for acoustic/elastic domains
-!   f denotes a source term (acoustic/elastic)
-!   chi_dot_dot is acoustic (fluid) potential ( dotted twice with respect to time)
-!
-! corrector:
-! updates the velocity term which requires a(t+delta)
-! GPU_MODE: this is handled in 'kernel_3' at the same time as accel*rmass
-  if( APPROXIMATE_OCEAN_LOAD ) call kernel_3_b_cuda(Mesh_pointer, NGLOB_AB, deltatover2,b_deltatover2)
-
 end subroutine compute_forces_viscoelastic_GPU
 

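The viscoelastic calling routines above keep the same two-phase structure in all variants: contributions of the outer (MPI-interface) elements are computed first so that the renamed assemble_MPI_vector_async_send can post non-blocking sends, inner elements are computed while the messages are in flight, and assemble_MPI_vector_async_w_ord waits for the requests and sums the received values. A minimal ring-exchange sketch of that overlap pattern, independent of the SPECFEM buffers (all names below are illustrative, not the library routines):

  program two_phase_assembly_demo
    use mpi
    implicit none
    integer, parameter :: n = 4
    integer :: ier, myrank, nproc, right, left
    integer :: requests(2), statuses(MPI_STATUS_SIZE,2)
    real :: accel(n), buffer_send(n), buffer_recv(n)

    call MPI_Init(ier)
    call MPI_Comm_rank(MPI_COMM_WORLD, myrank, ier)
    call MPI_Comm_size(MPI_COMM_WORLD, nproc, ier)
    right = mod(myrank + 1, nproc)
    left  = mod(myrank - 1 + nproc, nproc)

    ! phase 1 ("outer" elements): compute boundary contributions, then post
    ! non-blocking communications so they overlap the next phase
    accel(:) = real(myrank)
    buffer_send(:) = accel(:)
    call MPI_Irecv(buffer_recv, n, MPI_REAL, left,  0, MPI_COMM_WORLD, requests(1), ier)
    call MPI_Isend(buffer_send, n, MPI_REAL, right, 0, MPI_COMM_WORLD, requests(2), ier)

    ! phase 2 ("inner" elements): work that needs no neighbor data would go here

    ! wait for the exchange to complete, then assemble the neighbor contributions
    call MPI_Waitall(2, requests, statuses, ier)
    accel(:) = accel(:) + buffer_recv(:)

    if (myrank == 0) print *, 'assembled accel(1) on rank 0: ', accel(1)
    call MPI_Finalize(ier)
  end program two_phase_assembly_demo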
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_acoustic.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_acoustic.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -114,7 +114,7 @@
   enddo ! num_abs_boundary_faces
 
   ! adjoint simulations: stores absorbed wavefield part
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
+  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
     ! writes out absorbing boundary value only when second phase is running
     if( phase_is_inner .eqv. .true. ) then
       ! uses fortran routine
@@ -172,7 +172,7 @@
   if( num_abs_boundary_faces == 0 ) return
 
   ! adjoint simulations:
-  if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
+  if( SIMULATION_TYPE == 3 ) then
     ! reads in absorbing boundary array when first phase is running
     if( phase_is_inner .eqv. .false. ) then
       ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
@@ -252,7 +252,7 @@
   if( num_abs_boundary_faces == 0 ) return
 
   ! adjoint simulations:
-  if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
+  if( SIMULATION_TYPE == 3 ) then
     ! reads in absorbing boundary array when first phase is running
     if( phase_is_inner .eqv. .false. ) then
       ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
@@ -266,12 +266,10 @@
   endif !adjoint
 
   ! absorbs absorbing-boundary surface using Sommerfeld condition (vanishing field in the outer-space)
-  if( num_abs_boundary_faces > 0 ) &
-    call compute_stacey_acoustic_cuda(Mesh_pointer, phase_is_inner, &
-                                      SAVE_FORWARD,b_absorb_potential)
+  call compute_stacey_acoustic_cuda(Mesh_pointer, phase_is_inner,b_absorb_potential)
 
   ! adjoint simulations: stores absorbed wavefield part
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
+  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
     ! writes out absorbing boundary value only when second phase is running
     if( phase_is_inner .eqv. .true. ) then
       ! uses fortran routine

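The hunks above simplify the guards around the absorbing-boundary I/O but keep the reversed record index NSTEP-it+1 used to replay the stored boundary contribution during the backward run. A tiny standalone sketch of that record arithmetic with plain Fortran direct-access I/O (read_abs itself is a C helper according to the comments; none of its internals are reproduced here):

  program reversed_record_demo
    implicit none
    integer, parameter :: NSTEP = 5, n = 3
    integer :: it, reclen
    real :: field(n)

    inquire(iolength=reclen) field
    open(unit=10, file='absorb_demo.bin', access='direct', recl=reclen, &
         form='unformatted', status='replace', action='readwrite')

    ! forward run: store the boundary contribution of each time step
    do it = 1, NSTEP
      field(:) = real(it)
      write(10, rec=it) field
    enddo

    ! backward run: it increases again, but the stored contribution is needed
    ! in reverse time order, hence record NSTEP-it+1
    do it = 1, NSTEP
      read(10, rec=NSTEP-it+1) field
      print *, 'backward step', it, 'uses forward step', int(field(1))
    enddo

    close(10, status='delete')
  end program reversed_record_demo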
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_poroelastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_poroelastic.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_poroelastic.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -88,7 +88,7 @@
   if( num_abs_boundary_faces == 0 ) return
 
 ! adjoint simulations:
-  if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
+  if( SIMULATION_TYPE == 3 ) then
     ! reads in absorbing boundary array when first phase is running
     if( phase_is_inner .eqv. .false. ) then
       ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
@@ -191,7 +191,7 @@
      enddo
 
   ! adjoint simulations: stores absorbed wavefield part
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
+  if( SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
     ! writes out absorbing boundary value only when second phase is running
     if( phase_is_inner .eqv. .true. ) then
       ! uses fortran routine

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -101,20 +101,6 @@
   ! checks if anything to do
   if( num_abs_boundary_faces == 0 ) return
 
-! adjoint simulations:
-  if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
-    ! reads in absorbing boundary array when first phase is running
-    if( phase_is_inner .eqv. .false. ) then
-      ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
-      ! uses fortran routine
-      !read(IOABS,rec=NSTEP-it+1) reclen1,b_absorb_field,reclen2
-      !if (reclen1 /= b_reclen_field .or. reclen1 /= reclen2) &
-      !  call exit_mpi(0,'Error reading absorbing contribution b_absorb_field')
-      ! uses c routine for faster reading
-      call read_abs(0,b_absorb_field,b_reclen_field,NSTEP-it+1)
-    endif
-  endif !adjoint
-
   ! absorbs absorbing-boundary surface using Stacey condition (Clayton & Enquist)
   do iface=1,num_abs_boundary_faces
 
@@ -182,7 +168,7 @@
   enddo
 
   ! adjoint simulations: stores absorbed wavefield part
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
+  if( SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
     ! writes out absorbing boundary value only when second phase is running
     if( phase_is_inner .eqv. .true. ) then
       ! uses fortran routine
@@ -214,6 +200,8 @@
                         NSTEP,it,NGLOB_ADJOINT,b_accel, &
                         b_num_abs_boundary_faces,b_reclen_field,b_absorb_field)
 
+  use specfem_par,only: myrank
+
   implicit none
 
   include "constants.h"
@@ -244,51 +232,48 @@
 ! local parameters
   integer :: ispec,iglob,i,j,k,iface,igll
 
+  ! checks
+  if (SIMULATION_TYPE /= 3 ) &
+    call exit_MPI(myrank,'error calling routine compute_stacey_viscoelastic_bpwf() with wrong SIMULATION_TYPE')
+
   ! checks if anything to do
   if( num_abs_boundary_faces == 0 ) return
 
-! adjoint simulations:
-  if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
-    ! reads in absorbing boundary array when first phase is running
-    if( phase_is_inner .eqv. .false. ) then
-      ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
-      ! uses fortran routine
-      !read(IOABS,rec=NSTEP-it+1) reclen1,b_absorb_field,reclen2
-      !if (reclen1 /= b_reclen_field .or. reclen1 /= reclen2) &
-      !  call exit_mpi(0,'Error reading absorbing contribution b_absorb_field')
-      ! uses c routine for faster reading
-      call read_abs(0,b_absorb_field,b_reclen_field,NSTEP-it+1)
-    endif
-  endif !adjoint
+  ! adjoint simulations:
+  ! reads in absorbing boundary array when first phase is running
+  if( phase_is_inner .eqv. .false. ) then
+    ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
+    ! uses fortran routine
+    !read(IOABS,rec=NSTEP-it+1) reclen1,b_absorb_field,reclen2
+    !if (reclen1 /= b_reclen_field .or. reclen1 /= reclen2) &
+    !  call exit_mpi(0,'Error reading absorbing contribution b_absorb_field')
+    ! uses c routine for faster reading
+    call read_abs(0,b_absorb_field,b_reclen_field,NSTEP-it+1)
+  endif
 
+  ! absorbs absorbing-boundary surface using Stacey condition (Clayton & Engquist)
   do iface=1,num_abs_boundary_faces
 
-     ispec = abs_boundary_ispec(iface)
+    ispec = abs_boundary_ispec(iface)
 
-     if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+    if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
 
-        if( ispec_is_elastic(ispec) ) then
+      if( ispec_is_elastic(ispec) ) then
+        ! reference gll points on boundary face
+        do igll = 1,NGLLSQUARE
+          ! gets local indices for GLL point
+          i = abs_boundary_ijk(1,igll,iface)
+          j = abs_boundary_ijk(2,igll,iface)
+          k = abs_boundary_ijk(3,igll,iface)
 
-           ! reference gll points on boundary face
-           do igll = 1,NGLLSQUARE
+          ! gets global index of GLL point
+          iglob=ibool(i,j,k,ispec)
 
-              ! gets local indices for GLL point
-              i = abs_boundary_ijk(1,igll,iface)
-              j = abs_boundary_ijk(2,igll,iface)
-              k = abs_boundary_ijk(3,igll,iface)
-
-              ! gets velocity
-              iglob=ibool(i,j,k,ispec)
-
-              ! adjoint simulations
-              if (SIMULATION_TYPE == 3) then
-                 b_accel(:,iglob) = b_accel(:,iglob) - b_absorb_field(:,igll,iface)
-              endif !adjoint
-
-           enddo
-        endif ! ispec_is_elastic
-     endif ! ispec_is_inner
+          ! adjoint simulations
+          b_accel(:,iglob) = b_accel(:,iglob) - b_absorb_field(:,igll,iface)
+        enddo
+      endif ! ispec_is_elastic
+    endif ! ispec_is_inner
   enddo
 
   end subroutine compute_stacey_viscoelastic_bpwf
@@ -385,7 +370,7 @@
   if( num_abs_boundary_faces == 0 ) return
 
 ! adjoint simulations:
-  if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
+  if( SIMULATION_TYPE == 3 ) then
     ! reads in absorbing boundary array when first phase is running
     if( phase_is_inner .eqv. .false. ) then
       ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
@@ -398,12 +383,10 @@
     endif
   endif !adjoint
 
-  if( num_abs_boundary_faces > 0 ) &
-    call compute_stacey_viscoelastic_cuda(Mesh_pointer,phase_is_inner, &
-                                          SAVE_FORWARD,b_absorb_field)
+  call compute_stacey_viscoelastic_cuda(Mesh_pointer,phase_is_inner,b_absorb_field)
 
   ! adjoint simulations: stores absorbed wavefield part
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
+  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
     ! writes out absorbing boundary value only when second phase is running
     if( phase_is_inner .eqv. .true. ) then
       ! uses fortran routine

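For reference, the forward Stacey routine that these hunks clean up applies a first-order paraxial (Clayton-Engquist) absorbing traction at each boundary GLL point and subtracts its weighted value from the assembled acceleration. The sketch below illustrates that condition for a single point; the impedance values, normal and surface weight are placeholders, and the arithmetic is a generic reconstruction rather than code quoted from the diff:

  program stacey_point_demo
    implicit none
    real :: vx, vy, vz, nx, ny, nz, vn
    real :: rho_vp, rho_vs, jacobianw
    real :: tx, ty, tz, accel(3)

    ! velocity, outward normal, impedances and surface weight at one boundary GLL point
    vx = 1.0;  vy = 0.5;  vz = 0.0
    nx = 1.0;  ny = 0.0;  nz = 0.0
    rho_vp = 2500.0*5000.0
    rho_vs = 2500.0*3000.0
    jacobianw = 1.0e-2
    accel(:) = 0.0

    ! absorbing traction t = rho*vp*(v.n)*n + rho*vs*(v - (v.n)*n)
    vn = vx*nx + vy*ny + vz*nz
    tx = rho_vp*vn*nx + rho_vs*(vx - vn*nx)
    ty = rho_vp*vn*ny + rho_vs*(vy - vn*ny)
    tz = rho_vp*vn*nz + rho_vs*(vz - vn*nz)

    ! the weighted traction is subtracted from the assembled acceleration
    accel(1) = accel(1) - tx*jacobianw
    accel(2) = accel(2) - ty*jacobianw
    accel(3) = accel(3) - tz*jacobianw

    print *, accel
  end program stacey_point_demo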
Added: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_total_energy.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_total_energy.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_total_energy.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -0,0 +1,325 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 1
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                             July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+
+  subroutine compute_total_energy()
+
+! computes kinetic, potential and total energy
+! in all the slices using an MPI reduction
+! and output that to an energy file
+
+  use specfem_par
+  use specfem_par_elastic
+  use specfem_par_acoustic
+  use pml_par
+
+  implicit none
+
+! local variables
+  integer :: i,j,k,l,ispec,iglob
+
+  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) :: duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) :: duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) :: duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) :: epsilon_xx,epsilon_yy,epsilon_zz,epsilon_xy,epsilon_xz,epsilon_yz,epsilon_yx,epsilon_zx,epsilon_zy
+  real(kind=CUSTOM_REAL) :: sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+  real(kind=CUSTOM_REAL) :: vx,vy,vz,pressure
+
+  real(kind=CUSTOM_REAL) :: hp1,hp2,hp3
+
+  real(kind=CUSTOM_REAL) :: lambdal,mul,lambdalplus2mul,rhol,cpl
+  real(kind=CUSTOM_REAL) :: kappal
+
+  real(kind=CUSTOM_REAL) :: integration_weight
+  double precision :: kinetic_energy,potential_energy
+  double precision :: kinetic_energy_glob,potential_energy_glob,total_energy_glob
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  kinetic_energy = 0.d0
+  potential_energy = 0.d0
+
+  if(ANISOTROPY .or. ATTENUATION) &
+    call exit_MPI(myrank,'calculation of total energy currently implemented only for media with no anisotropy and no attenuation')
+
+! loop over spectral elements
+  do ispec = 1,NSPEC_AB
+
+! if element is a CPML then do not compute energy in it, since it is non physical;
+! thus, we compute energy in the main domain only, without absorbing elements
+    if(PML_CONDITIONS) then
+      ! do not merge this second line with the first using an ".and." statement
+      ! because array is_CPML() is unallocated when PML_CONDITIONS is false
+      if(is_CPML(ispec)) cycle
+    endif
+
+    !---
+    !--- elastic spectral element
+    !---
+    if(ispec_is_elastic(ispec)) then
+
+     do k=1,NGLLZ
+       do j=1,NGLLY
+         do i=1,NGLLX
+           iglob = ibool(i,j,k,ispec)
+           dummyx_loc(i,j,k) = displ(1,iglob)
+           dummyy_loc(i,j,k) = displ(2,iglob)
+           dummyz_loc(i,j,k) = displ(3,iglob)
+         enddo
+       enddo
+     enddo
+
+     do k=1,NGLLZ
+        do j=1,NGLLY
+          do i=1,NGLLX
+
+          iglob = ibool(i,j,k,ispec)
+
+          tempx1(i,j,k) = 0._CUSTOM_REAL
+          tempx2(i,j,k) = 0._CUSTOM_REAL
+          tempx3(i,j,k) = 0._CUSTOM_REAL
+
+          tempy1(i,j,k) = 0._CUSTOM_REAL
+          tempy2(i,j,k) = 0._CUSTOM_REAL
+          tempy3(i,j,k) = 0._CUSTOM_REAL
+
+          tempz1(i,j,k) = 0._CUSTOM_REAL
+          tempz2(i,j,k) = 0._CUSTOM_REAL
+          tempz3(i,j,k) = 0._CUSTOM_REAL
+
+          do l=1,NGLLX
+            hp1 = hprime_xx(i,l)
+            tempx1(i,j,k) = tempx1(i,j,k) + dummyx_loc(l,j,k)*hp1
+            tempy1(i,j,k) = tempy1(i,j,k) + dummyy_loc(l,j,k)*hp1
+            tempz1(i,j,k) = tempz1(i,j,k) + dummyz_loc(l,j,k)*hp1
+
+            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+            hp2 = hprime_yy(j,l)
+            tempx2(i,j,k) = tempx2(i,j,k) + dummyx_loc(i,l,k)*hp2
+            tempy2(i,j,k) = tempy2(i,j,k) + dummyy_loc(i,l,k)*hp2
+            tempz2(i,j,k) = tempz2(i,j,k) + dummyz_loc(i,l,k)*hp2
+
+            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+            hp3 = hprime_zz(k,l)
+            tempx3(i,j,k) = tempx3(i,j,k) + dummyx_loc(i,j,l)*hp3
+            tempy3(i,j,k) = tempy3(i,j,k) + dummyy_loc(i,j,l)*hp3
+            tempz3(i,j,k) = tempz3(i,j,k) + dummyz_loc(i,j,l)*hp3
+          enddo
+
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              ! compute the strain
+              epsilon_xx = duxdxl
+              epsilon_yy = duydyl
+              epsilon_zz = duzdzl
+              epsilon_xy = 0.5 * duxdyl_plus_duydxl
+              epsilon_xz = 0.5 * duzdxl_plus_duxdzl
+              epsilon_yz = 0.5 * duzdyl_plus_duydzl
+
+              ! define symmetric components of epsilon
+              epsilon_yx = epsilon_xy
+              epsilon_zx = epsilon_xz
+              epsilon_zy = epsilon_yz
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+              rhol = rhostore(i,j,k,ispec)
+
+              ! isotropic case
+              lambdalplus2mul = kappal + FOUR_THIRDS * mul
+              lambdal = lambdalplus2mul - 2.*mul
+
+              ! compute stress sigma
+              sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+              sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+              sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+              sigma_xy = mul*duxdyl_plus_duydxl
+              sigma_xz = mul*duzdxl_plus_duxdzl
+              sigma_yz = mul*duzdyl_plus_duydzl
+
+              ! define symmetric components of sigma
+              sigma_yx = sigma_xy
+              sigma_zx = sigma_xz
+              sigma_zy = sigma_yz
+
+              integration_weight = wxgll(i)*wygll(j)*wzgll(k)*jacobianl
+
+              ! compute kinetic energy  1/2 rho ||v||^2
+              kinetic_energy = kinetic_energy + integration_weight * rhol*(veloc(1,iglob)**2 + &
+                                   veloc(2,iglob)**2 + veloc(3,iglob)**2) / 2.
+
+              ! compute potential energy 1/2 sigma_ij epsilon_ij
+              potential_energy = potential_energy + integration_weight * &
+                (sigma_xx*epsilon_xx + sigma_xy*epsilon_xy + sigma_xz*epsilon_xz + &
+                 sigma_yx*epsilon_yx + sigma_yy*epsilon_yy + sigma_yz*epsilon_yz + &
+                 sigma_zx*epsilon_zx + sigma_zy*epsilon_zy + sigma_zz*epsilon_zz) / 2.
+
+          enddo
+        enddo
+     enddo
+
+    !---
+    !--- acoustic spectral element
+    !---
+    else if(ispec_is_acoustic(ispec)) then
+
+      ! for the definition of potential energy in an acoustic fluid, see for instance
+      ! equation (23) of M. Maess et al., Journal of Sound and Vibration 296 (2006) 264-276
+
+      ! in case of an acoustic medium, a potential Chi of (density * displacement) is used as in Chaljub and Valette,
+      ! Geophysical Journal International, vol. 158, p. 131-141 (2004) and *NOT* a velocity potential
+      ! as in Komatitsch and Tromp, Geophysical Journal International, vol. 150, p. 303-318 (2002).
+      ! This permits acoustic-elastic coupling based on a non-iterative time scheme.
+      ! Displacement is then: u = grad(Chi) / rho
+      ! Velocity is then: v = grad(Chi_dot) / rho (Chi_dot being the time derivative of Chi)
+      ! and pressure is: p = - Chi_dot_dot  (Chi_dot_dot being the time second derivative of Chi).
+
+     do k=1,NGLLZ
+       do j=1,NGLLY
+         do i=1,NGLLX
+           iglob = ibool(i,j,k,ispec)
+           dummyx_loc(i,j,k) = potential_dot_acoustic(iglob)
+         enddo
+       enddo
+     enddo
+
+     do k=1,NGLLZ
+        do j=1,NGLLY
+          do i=1,NGLLX
+
+          iglob = ibool(i,j,k,ispec)
+
+          tempx1(i,j,k) = 0._CUSTOM_REAL
+          tempx2(i,j,k) = 0._CUSTOM_REAL
+          tempx3(i,j,k) = 0._CUSTOM_REAL
+
+          do l=1,NGLLX
+            hp1 = hprime_xx(i,l)
+            tempx1(i,j,k) = tempx1(i,j,k) + dummyx_loc(l,j,k)*hp1
+
+            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+            hp2 = hprime_yy(j,l)
+            tempx2(i,j,k) = tempx2(i,j,k) + dummyx_loc(i,l,k)*hp2
+
+            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+            hp3 = hprime_zz(k,l)
+            tempx3(i,j,k) = tempx3(i,j,k) + dummyx_loc(i,j,l)*hp3
+          enddo
+
+              ! get derivatives of the potential Chi_dot with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              rhol = rhostore(i,j,k,ispec)
+              kappal = kappastore(i,j,k,ispec)
+              cpl = sqrt(kappal / rhol)
+
+              ! Velocity is v = grad(Chi_dot) / rho (Chi_dot being the time derivative of Chi)
+              vx = duxdxl / rhol
+              vy = duxdyl / rhol
+              vz = duxdzl / rhol
+
+              ! pressure is p = - Chi_dot_dot  (Chi_dot_dot being the time second derivative of Chi)
+              pressure = - potential_dot_dot_acoustic(iglob)
+
+              integration_weight = wxgll(i)*wygll(j)*wzgll(k)*jacobianl
+
+              ! compute kinetic energy  1/2 rho ||v||^2
+              kinetic_energy = kinetic_energy + integration_weight * rhol*(vx**2 + vy**2 + vz**2) / 2.
+
+              ! compute potential energy p**2 / (2 rho cp**2) in the fluid
+              potential_energy = potential_energy + integration_weight * pressure**2 / (2. * rhol * cpl**2)
+
+          enddo
+        enddo
+     enddo
+
+    else
+
+      call exit_MPI(myrank,'calculation of total energy implemented for acoustic and (visco)elastic elements only for now')
+
+    endif
+
+  enddo
+
+! compute the total using a reduction between all the processors
+  call sum_all_dp(kinetic_energy,kinetic_energy_glob)
+  call sum_all_dp(potential_energy,potential_energy_glob)
+  total_energy_glob = kinetic_energy_glob + potential_energy_glob
+
+! write the total to disk from the master
+  if(myrank == 0) write(IOUT_ENERGY,*) it,sngl(kinetic_energy_glob),sngl(potential_energy_glob),sngl(total_energy_glob)
+
+  end subroutine compute_total_energy

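The new compute_total_energy() accumulates per-slice kinetic and potential energies and combines them with sum_all_dp, which is presumably a thin wrapper around an MPI sum reduction onto the master. A minimal sketch of that reduction and the master-only write (placeholder partial sums, generic MPI calls rather than the SPECFEM wrappers):

  program energy_reduction_demo
    use mpi
    implicit none
    integer :: ier, myrank
    double precision :: kinetic_energy, potential_energy
    double precision :: kinetic_energy_glob, potential_energy_glob, total_energy_glob

    call MPI_Init(ier)
    call MPI_Comm_rank(MPI_COMM_WORLD, myrank, ier)

    ! per-slice partial sums (placeholder values)
    kinetic_energy   = 1.0d0*(myrank + 1)
    potential_energy = 2.0d0*(myrank + 1)

    ! sum the partial energies onto the master process
    call MPI_Reduce(kinetic_energy, kinetic_energy_glob, 1, MPI_DOUBLE_PRECISION, &
                    MPI_SUM, 0, MPI_COMM_WORLD, ier)
    call MPI_Reduce(potential_energy, potential_energy_glob, 1, MPI_DOUBLE_PRECISION, &
                    MPI_SUM, 0, MPI_COMM_WORLD, ier)

    ! only the master writes the energy curve
    if (myrank == 0) then
      total_energy_glob = kinetic_energy_glob + potential_energy_glob
      print *, kinetic_energy_glob, potential_energy_glob, total_energy_glob
    endif

    call MPI_Finalize(ier)
  end program energy_reduction_demo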
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/fault_solver_common.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/fault_solver_common.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/fault_solver_common.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -150,19 +150,19 @@
     tmp_vec = 0._CUSTOM_REAL
     if (bc%nspec>0) tmp_vec(1,bc%ibulk1) = bc%B
     ! assembles with other MPI processes
-    call assemble_MPI_vector_ext_mesh(NPROC,NGLOB_AB,tmp_vec, &
-       num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-       nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-       my_neighbours_ext_mesh)
+    call assemble_MPI_vector_blocking(NPROC,NGLOB_AB,tmp_vec, &
+                                     num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                     nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                     my_neighbours_ext_mesh)
     if (bc%nspec>0) bc%B = tmp_vec(1,bc%ibulk1)
 
     tmp_vec = 0._CUSTOM_REAL
     if (bc%nspec>0) tmp_vec(:,bc%ibulk1) = nxyz
     ! assembles with other MPI processes
-    call assemble_MPI_vector_ext_mesh(NPROC,NGLOB_AB,tmp_vec, &
-       num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-       nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-       my_neighbours_ext_mesh)
+    call assemble_MPI_vector_blocking(NPROC,NGLOB_AB,tmp_vec, &
+                                     num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                     nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                     my_neighbours_ext_mesh)
     if (bc%nspec>0) nxyz = tmp_vec(:,bc%ibulk1)
 
   endif

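The switch to assemble_MPI_vector_blocking in fault_solver_common.f90 keeps the same scatter/assemble/gather idiom: fault-local values are copied into a global-sized work array at their bulk node indices, assembled across slices, and read back. A serial sketch of just that indexing (the MPI assembly call is indicated by a comment; names are illustrative):

  program fault_assembly_pattern_demo
    implicit none
    integer, parameter :: nglob = 10, nfault = 3
    integer :: ibulk1(nfault)
    real :: B(nfault), tmp_vec(3,nglob)

    ! fault-local quantity and the global node indices it lives on
    B = (/ 1.0, 2.0, 3.0 /)
    ibulk1 = (/ 2, 5, 7 /)

    ! scatter the fault values into a global-sized work array ...
    tmp_vec = 0.0
    tmp_vec(1,ibulk1) = B

    ! ... here the solver calls assemble_MPI_vector_blocking() so that nodes
    ! shared between slices receive the contributions of all slices ...

    ! ... then gather the assembled values back onto the fault
    B = tmp_vec(1,ibulk1)

    print *, B
  end program fault_assembly_pattern_demo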
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/finalize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/finalize_simulation.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/finalize_simulation.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -38,6 +38,7 @@
   implicit none
 
   integer :: irec_local
+  integer :: ier
 
   ! write gravity perturbations
   if (GRAVITY_SIMULATION) call gravity_output()
@@ -45,46 +46,51 @@
   ! save last frame
 
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-     open(unit=27,file=prname(1:len_trim(prname))//'save_forward_arrays.bin',&
-          status='unknown',form='unformatted')
+    open(unit=IOUT,file=prname(1:len_trim(prname))//'save_forward_arrays.bin',&
+          status='unknown',form='unformatted',iostat=ier)
+    if( ier /= 0 ) then
+      print*,'error: opening save_forward_arrays.bin'
+      print*,'path: ',prname(1:len_trim(prname))//'save_forward_arrays.bin'
+      call exit_mpi(myrank,'error opening file save_forward_arrays.bin')
+    endif
 
     if( ACOUSTIC_SIMULATION ) then
-      write(27) potential_acoustic
-      write(27) potential_dot_acoustic
-      write(27) potential_dot_dot_acoustic
+      write(IOUT) potential_acoustic
+      write(IOUT) potential_dot_acoustic
+      write(IOUT) potential_dot_dot_acoustic
     endif
 
     if( ELASTIC_SIMULATION ) then
-      write(27) displ
-      write(27) veloc
-      write(27) accel
+      write(IOUT) displ
+      write(IOUT) veloc
+      write(IOUT) accel
 
       if (ATTENUATION) then
-        if(FULL_ATTENUATION_SOLID) write(27) R_trace  !ZN
-        write(27) R_xx
-        write(27) R_yy
-        write(27) R_xy
-        write(27) R_xz
-        write(27) R_yz
-        if(FULL_ATTENUATION_SOLID) write(27) epsilondev_trace !ZN
-        write(27) epsilondev_xx
-        write(27) epsilondev_yy
-        write(27) epsilondev_xy
-        write(27) epsilondev_xz
-        write(27) epsilondev_yz
+        if(FULL_ATTENUATION_SOLID) write(IOUT) R_trace  !ZN
+        write(IOUT) R_xx
+        write(IOUT) R_yy
+        write(IOUT) R_xy
+        write(IOUT) R_xz
+        write(IOUT) R_yz
+        if(FULL_ATTENUATION_SOLID) write(IOUT) epsilondev_trace !ZN
+        write(IOUT) epsilondev_xx
+        write(IOUT) epsilondev_yy
+        write(IOUT) epsilondev_xy
+        write(IOUT) epsilondev_xz
+        write(IOUT) epsilondev_yz
       endif
     endif
 
     if( POROELASTIC_SIMULATION ) then
-      write(27) displs_poroelastic
-      write(27) velocs_poroelastic
-      write(27) accels_poroelastic
-      write(27) displw_poroelastic
-      write(27) velocw_poroelastic
-      write(27) accelw_poroelastic
+      write(IOUT) displs_poroelastic
+      write(IOUT) velocs_poroelastic
+      write(IOUT) accels_poroelastic
+      write(IOUT) displw_poroelastic
+      write(IOUT) velocw_poroelastic
+      write(IOUT) accelw_poroelastic
     endif
 
-    close(27)
+    close(IOUT)
 
 ! adjoint simulations
   else if (SIMULATION_TYPE == 3) then
@@ -123,7 +129,11 @@
       do irec_local = 1, nrec_local
         write(outputname,'(a,i5.5)') OUTPUT_FILES_PATH(1:len_trim(OUTPUT_FILES_PATH)) // &
             '/src_frechet.',number_receiver_global(irec_local)
-        open(unit=27,file=trim(outputname),status='unknown')
+        open(unit=IOUT,file=trim(outputname),status='unknown',iostat=ier)
+        if( ier /= 0 ) then
+          print*,'error opening file: ',trim(outputname)
+          call exit_mpi(myrank,'error opening file src_frechet.**')
+        endif
         !
         ! r -> z, theta -> -y, phi -> x
         !
@@ -133,16 +143,16 @@
         !  Mrt = -Myz
         !  Mrp =  Mxz
         !  Mtp = -Mxy
-        write(27,*) Mzz_der(irec_local)
-        write(27,*) Myy_der(irec_local)
-        write(27,*) Mxx_der(irec_local)
-        write(27,*) -Myz_der(irec_local)
-        write(27,*) Mxz_der(irec_local)
-        write(27,*) -Mxy_der(irec_local)
-        write(27,*) sloc_der(1,irec_local)
-        write(27,*) sloc_der(2,irec_local)
-        write(27,*) sloc_der(3,irec_local)
-        close(27)
+        write(IOUT,*) Mzz_der(irec_local)
+        write(IOUT,*) Myy_der(irec_local)
+        write(IOUT,*) Mxx_der(irec_local)
+        write(IOUT,*) -Myz_der(irec_local)
+        write(IOUT,*) Mxz_der(irec_local)
+        write(IOUT,*) -Mxy_der(irec_local)
+        write(IOUT,*) sloc_der(1,irec_local)
+        write(IOUT,*) sloc_der(2,irec_local)
+        write(IOUT,*) sloc_der(3,irec_local)
+        close(IOUT)
       enddo
     endif
   endif
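
The recurring pattern in this hunk, replacing the hard-coded unit 27 by the IIN/IOUT constants and guarding every open() with an iostat check followed by exit_mpi, is sketched below on a hypothetical file. This is a minimal sketch only: the unit value is merely assumed to match constants.h, the file name is made up, and exit_mpi is replaced by a plain stop so the fragment compiles on its own.

  ! sketch only, not part of the commit: the open/iostat/abort pattern applied above
  program open_pattern_demo
    implicit none
    integer, parameter :: IOUT = 41          ! assumed I/O unit constant, as in constants.h
    integer :: ier
    character(len=*), parameter :: fname = 'example_arrays.bin'   ! hypothetical file name

    open(unit=IOUT,file=fname,status='unknown',form='unformatted',action='write',iostat=ier)
    if( ier /= 0 ) then
      print*,'error: opening ',fname
      stop 'error opening file'              ! the solver calls exit_mpi(myrank,...) here instead
    endif

    write(IOUT) 1.0, 2.0, 3.0                ! stands in for the wavefield arrays
    close(IOUT)
  end program open_pattern_demo
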

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -137,16 +137,16 @@
   ! reads in numbers of spectral elements and points for the part of the mesh handled by this process
   call create_name_database(prname,myrank,LOCAL_PATH)
   if (OLD_TEST_TO_FIX_ONE_DAY) call create_name_database(dsmname,myrank,TRAC_PATH)  !! VM VM
-  open(unit=27,file=prname(1:len_trim(prname))//'external_mesh.bin',status='old',&
+  open(unit=IIN,file=prname(1:len_trim(prname))//'external_mesh.bin',status='old',&
         action='read',form='unformatted',iostat=ier)
   if( ier /= 0 ) then
     print*,'error: could not open database '
     print*,'path: ',prname(1:len_trim(prname))//'external_mesh.bin'
     call exit_mpi(myrank,'error opening database')
   endif
-  read(27) NSPEC_AB
-  read(27) NGLOB_AB
-  close(27)
+  read(IIN) NSPEC_AB
+  read(IIN) NGLOB_AB
+  close(IIN)
 
   ! attenuation arrays size
   if( ATTENUATION ) then
@@ -403,6 +403,14 @@
     NSPEC_BOUN = 1
   endif
 
+  ! transversely isotropic kernel flags
+  if( SIMULATION_TYPE == 3 ) then
+    if( SAVE_TRANSVERSE_KL .eqv. .true. .and. ANISOTROPIC_KL .eqv. .false. ) then
+      call exit_mpi(myrank, &
+        'for kernel simulations with SAVE_TRANSVERSE_KL set to .true., please also set ANISOTROPIC_KL to .true. in constants.h')
+    endif
+  endif
+
   end subroutine initialize_simulation_adjoint
 
 !

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.F90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.F90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -37,8 +37,8 @@
 
   implicit none
 
-!----  create a Gnuplot script to display the energy curve in log scale
-  if(output_energy .and. myrank == 0) then
+  !----  create a Gnuplot script to display the energy curve in log scale
+  if( OUTPUT_ENERGY .and. myrank == 0) then
     open(unit=IOUT_ENERGY,file=trim(OUTPUT_FILES)//'plot_energy.gnu',status='unknown',action='write')
     write(IOUT_ENERGY,*) 'set term wxt'
     write(IOUT_ENERGY,*) '#set term postscript landscape color solid "Helvetica" 22'
@@ -58,8 +58,9 @@
     close(IOUT_ENERGY)
   endif
 
-! open the file in which we will store the energy curve
-  if(output_energy .and. myrank == 0) open(unit=IOUT_ENERGY,file=trim(OUTPUT_FILES)//'energy.dat',status='unknown',action='write')
+  ! open the file in which we will store the energy curve
+  if( OUTPUT_ENERGY .and. myrank == 0 ) &
+    open(unit=IOUT_ENERGY,file=trim(OUTPUT_FILES)//'energy.dat',status='unknown',action='write')
 
 !
 !   s t a r t   t i m e   i t e r a t i o n s
@@ -94,65 +95,85 @@
   do it = 1,NSTEP
 
     ! simulation status output and stability check
-    if(mod(it,NTSTEP_BETWEEN_OUTPUT_INFO) == 0 .or. it == 5 .or. it == NSTEP) call it_check_stability()
+    if( mod(it,NTSTEP_BETWEEN_OUTPUT_INFO) == 0 .or. it == 5 .or. it == NSTEP ) &
+      call check_stability()
 
     ! total energy computation and output
-    if(output_energy .and. (mod(it,NTSTEP_BETWEEN_OUTPUT_ENERGY) == 0 .or. it == 5 .or. it == NSTEP)) &
-            call it_compute_total_energy()
+    if( OUTPUT_ENERGY ) then
+      if( mod(it,NTSTEP_BETWEEN_OUTPUT_ENERGY) == 0 .or. it == 5 .or. it == NSTEP ) &
+        call compute_total_energy()
+    endif
 
-    ! update displacement using Newmark time scheme
-    call it_update_displacement_scheme()
+    ! updates wavefields using Newmark time scheme
+    call update_displacement_scheme()
 
-    if(.not. GPU_MODE)then
-       if(SIMULATION_TYPE == 3)then
-          if(ELASTIC_SIMULATION .and. ACOUSTIC_SIMULATION)then
-            if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic()
-            if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic()
-          else
-            if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic()
-            if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic()
-          endif
-       else
+    ! calculates stiffness term
+    if( .not. GPU_MODE )then
+      ! wavefields on CPU
+
+      ! note: the order of the computations for acoustic and elastic domains is crucial for coupled simulations
+      if( SIMULATION_TYPE == 3 ) then
+        ! kernel/adjoint simulations
+
+        ! adjoint wavefields
+        if( ELASTIC_SIMULATION .and. ACOUSTIC_SIMULATION )then
+          ! coupled acoustic-elastic simulations
+          ! 1. elastic domain w/ adjoint wavefields
+          call compute_forces_viscoelastic()
+          ! 2. acoustic domain w/ adjoint wavefields
+          call compute_forces_acoustic()
+        else
+          ! non-coupled simulations
+          ! (purely acoustic or elastic)
           if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic()
           if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic()
-       endif
+        endif
 
-       if(SIMULATION_TYPE == 3)then
-          ! acoustic solver
-          ! (needs to be done after elastic one)
-          if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic_bpwf()
-          ! elastic solver
-          ! (needs to be done first, before poroelastic one)
-          if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic_bpwf()
-       endif
+        ! backward/reconstructed wavefields
+        ! acoustic solver
+        ! (needs to be done after elastic one)
+        if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic_bpwf()
+        ! elastic solver
+        ! (needs to be done first, before poroelastic one)
+        if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic_bpwf()
+
+      else
+        ! forward simulations
+
+        ! 1. acoustic domain
+        if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic()
+        ! 2. elastic domain
+        if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic()
+      endif
+
+      ! poroelastic solver
+      if( POROELASTIC_SIMULATION ) call compute_forces_poroelastic()
+
     else
-       ! acoustic solver
-       ! (needs to be done after elastic one)
-       if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic_GPU()
-       ! elastic solver
-       ! (needs to be done first, before poroelastic one)
-       if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic_GPU()
+      ! wavefields on GPU
+      ! acoustic solver
+      if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic_GPU()
+      ! elastic solver
+      ! (needs to be done first, before poroelastic one)
+      if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic_GPU()
     endif
 
-    ! poroelastic solver
-    if( POROELASTIC_SIMULATION ) call compute_forces_poroelastic()
-
     ! restores last time snapshot saved for backward/reconstruction of wavefields
     ! note: this must be read in after the Newmark time scheme
     if( SIMULATION_TYPE == 3 .and. it == 1 ) then
-     call it_read_forward_arrays()
+      call it_read_forward_arrays()
     endif
 
     ! write the seismograms with time shift (GPU_MODE transfer included)
-    if (nrec_local > 0 .or. ( WRITE_SEISMOGRAMS_BY_MASTER .and. myrank == 0 ) ) then
+    if( nrec_local > 0 .or. ( WRITE_SEISMOGRAMS_BY_MASTER .and. myrank == 0 ) ) then
       call write_seismograms()
     endif
 
     ! calculating gravity field at current timestep
-    if (GRAVITY_SIMULATION) call gravity_timeseries()
+    if( GRAVITY_SIMULATION ) call gravity_timeseries()
 
     ! resetting d/v/a/R/eps for the backward reconstruction with attenuation
-    if (ATTENUATION ) then
+    if( ATTENUATION ) then
       call it_store_attenuation_arrays()
     endif
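
For orientation, the predictor stage performed by update_displacement_scheme() before any stiffness, boundary or source term is added amounts to the update below, shown for a single elastic degree of freedom. A minimal sketch with illustrative values only; in the solver, deltatsqover2 and deltatover2 are deltat**2/2 and deltat/2.

  ! sketch only: Newmark predictor for one elastic degree of freedom,
  ! mirroring the displ/veloc/accel updates done in update_displacement_scheme()
  program newmark_predictor_demo
    implicit none
    double precision, parameter :: deltat = 1.d-3
    double precision :: u, v, a

    u = 0.d0; v = 1.d0; a = 2.d0            ! arbitrary initial values

    ! predictor: u(t+dt) = u + dt v + dt**2/2 a ; v(t+dt) = v + dt/2 a ; a is reset
    u = u + deltat*v + 0.5d0*deltat**2*a
    v = v + 0.5d0*deltat*a
    a = 0.d0                                ! stiffness, boundary and source terms fill this next

    print*,'predicted u,v = ',u,v
  end program newmark_predictor_demo
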
 
@@ -167,7 +188,7 @@
     endif
 
     ! first step of noise tomography, i.e., save a surface movie at every time step
-    if ( NOISE_TOMOGRAPHY == 1) then
+    if( NOISE_TOMOGRAPHY == 1 ) then
       if( num_free_surface_faces > 0) then
         call noise_save_surface_movie(displ,ibool, &
                             noise_surface_movie,it, &
@@ -185,691 +206,17 @@
   call it_print_elapsed_time()
 
   ! Transfer fields from GPU card to host for further analysis
-  if(GPU_MODE) call it_transfer_from_GPU()
+  if( GPU_MODE ) call it_transfer_from_GPU()
 
 !----  close energy file
-  if(output_energy .and. myrank == 0) close(IOUT_ENERGY)
+  if( OUTPUT_ENERGY .and. myrank == 0 ) close(IOUT_ENERGY)
 
   end subroutine iterate_time
 
 
 !=====================================================================
 
-  subroutine it_check_stability()
 
-! computes the maximum of the norm of the displacement
-! in all the slices using an MPI reduction
-! and output timestamp file to check that simulation is running fine
-
-  use specfem_par
-  use specfem_par_elastic
-  use specfem_par_poroelastic
-  use specfem_par_acoustic
-  implicit none
-
-  double precision :: tCPU,t_remain,t_total
-  integer :: ihours,iminutes,iseconds,int_tCPU, &
-             ihours_remain,iminutes_remain,iseconds_remain,int_t_remain, &
-             ihours_total,iminutes_total,iseconds_total,int_t_total
-
-  ! maximum of the norm of the displacement
-  real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all ! elastic
-  real(kind=CUSTOM_REAL) Usolidnormp,Usolidnormp_all ! acoustic
-  real(kind=CUSTOM_REAL) Usolidnorms,Usolidnorms_all ! solid poroelastic
-  real(kind=CUSTOM_REAL) Usolidnormw,Usolidnormw_all ! fluid (w.r.t.s) poroelastic
-
-  ! norm of the backward displacement
-  real(kind=CUSTOM_REAL) b_Usolidnorm, b_Usolidnorm_all
-
-  ! initializes
-  Usolidnorm_all = 0.0_CUSTOM_REAL
-  Usolidnormp_all = 0.0_CUSTOM_REAL
-  Usolidnorms_all = 0.0_CUSTOM_REAL
-  Usolidnormw_all = 0.0_CUSTOM_REAL
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-!chris: Rewrite to get norm for each material when coupled simulations
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-! compute maximum of norm of displacement in each slice
-  if( ELASTIC_SIMULATION ) then
-    if( GPU_MODE) then
-      ! way 2: just get maximum of field from GPU
-      call get_norm_elastic_from_device(Usolidnorm,Mesh_pointer,1)
-    else
-      Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))
-    endif
-
-    ! check stability of the code, exit if unstable
-    ! negative values can occur with some compilers when the unstable value is greater
-    ! than the greatest possible floating-point number of the machine
-    !if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0.0_CUSTOM_REAL) &
-    !  call exit_MPI(myrank,'single forward simulation became unstable and blew up')
-
-    ! compute the maximum of the maxima for all the slices using an MPI reduction
-    call max_all_cr(Usolidnorm,Usolidnorm_all)
-  endif
-
-  if( ACOUSTIC_SIMULATION ) then
-    if(GPU_MODE) then
-      ! way 2: just get maximum of field from GPU
-      call get_norm_acoustic_from_device(Usolidnormp,Mesh_pointer,1)
-    else
-      Usolidnormp = maxval(abs(potential_dot_dot_acoustic(:)))
-    endif
-
-    ! compute the maximum of the maxima for all the slices using an MPI reduction
-    call max_all_cr(Usolidnormp,Usolidnormp_all)
-  endif
-
-  if( POROELASTIC_SIMULATION ) then
-    Usolidnorms = maxval(sqrt(displs_poroelastic(1,:)**2 + displs_poroelastic(2,:)**2 + &
-                             displs_poroelastic(3,:)**2))
-    Usolidnormw = maxval(sqrt(displw_poroelastic(1,:)**2 + displw_poroelastic(2,:)**2 + &
-                             displw_poroelastic(3,:)**2))
-
-    ! compute the maximum of the maxima for all the slices using an MPI reduction
-    call max_all_cr(Usolidnorms,Usolidnorms_all)
-    call max_all_cr(Usolidnormw,Usolidnormw_all)
-  endif
-
-
-  ! adjoint simulations
-  if( SIMULATION_TYPE == 3 ) then
-    if( ELASTIC_SIMULATION ) then
-      ! way 2
-      if(GPU_MODE) then
-        call get_norm_elastic_from_device(b_Usolidnorm,Mesh_pointer,3)
-      else
-        b_Usolidnorm = maxval(sqrt(b_displ(1,:)**2 + b_displ(2,:)**2 + b_displ(3,:)**2))
-      endif
-    endif
-    if( ACOUSTIC_SIMULATION ) then
-        ! way 2
-        if(GPU_MODE) then
-          call get_norm_acoustic_from_device(b_Usolidnorm,Mesh_pointer,3)
-        else
-          b_Usolidnorm = maxval(abs(b_potential_dot_dot_acoustic(:)))
-        endif
-    endif
-    if( POROELASTIC_SIMULATION ) then
-        b_Usolidnorm = maxval(sqrt(b_displs_poroelastic(1,:)**2 + b_displs_poroelastic(2,:)**2 + &
-                                 b_displs_poroelastic(3,:)**2))
-    endif
-    ! check stability of the code, exit if unstable
-    ! negative values can occur with some compilers when the unstable value is greater
-    ! than the greatest possible floating-point number of the machine
-    !if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0.0_CUSTOM_REAL) &
-    !  call exit_MPI(myrank,'single backward simulation became unstable and blew up')
-
-    ! compute max of all slices
-    call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
-  endif
-
-  ! user output
-  if(myrank == 0) then
-
-    write(IMAIN,*) 'Time step # ',it
-    write(IMAIN,*) 'Time: ',sngl((it-1)*DT-t0),' seconds'
-
-    ! elapsed time since beginning of the simulation
-    tCPU = wtime() - time_start
-    int_tCPU = int(tCPU)
-    ihours = int_tCPU / 3600
-    iminutes = (int_tCPU - 3600*ihours) / 60
-    iseconds = int_tCPU - 3600*ihours - 60*iminutes
-    write(IMAIN,*) 'Elapsed time in seconds = ',tCPU
-    write(IMAIN,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
-    write(IMAIN,*) 'Mean elapsed time per time step in seconds = ',sngl(tCPU/dble(it))
-    if( ELASTIC_SIMULATION ) then
-      write(IMAIN,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
-    endif
-    if( ACOUSTIC_SIMULATION ) then
-        write(IMAIN,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all
-    endif
-    if( POROELASTIC_SIMULATION ) then
-        write(IMAIN,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all
-        write(IMAIN,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all
-    endif
-    ! adjoint simulations
-    if (SIMULATION_TYPE == 3) write(IMAIN,*) &
-           'Max norm U (backward) in all slices = ',b_Usolidnorm_all
-
-    ! compute estimated remaining simulation time
-    t_remain = (NSTEP - it) * (tCPU/dble(it))
-    int_t_remain = int(t_remain)
-    ihours_remain = int_t_remain / 3600
-    iminutes_remain = (int_t_remain - 3600*ihours_remain) / 60
-    iseconds_remain = int_t_remain - 3600*ihours_remain - 60*iminutes_remain
-    write(IMAIN,*) 'Time steps done = ',it,' out of ',NSTEP
-    write(IMAIN,*) 'Time steps remaining = ',NSTEP - it
-    write(IMAIN,*) 'Estimated remaining time in seconds = ',sngl(t_remain)
-    write(IMAIN,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
-             ihours_remain,iminutes_remain,iseconds_remain
-
-    ! compute estimated total simulation time
-    t_total = t_remain + tCPU
-    int_t_total = int(t_total)
-    ihours_total = int_t_total / 3600
-    iminutes_total = (int_t_total - 3600*ihours_total) / 60
-    iseconds_total = int_t_total - 3600*ihours_total - 60*iminutes_total
-    write(IMAIN,*) 'Estimated total run time in seconds = ',sngl(t_total)
-    write(IMAIN,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
-             ihours_total,iminutes_total,iseconds_total
-    write(IMAIN,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that'
-
-    if(it < 100) then
-      write(IMAIN,*) '************************************************************'
-      write(IMAIN,*) '**** BEWARE: the above time estimates are not reliable'
-      write(IMAIN,*) '**** because fewer than 100 iterations have been performed'
-      write(IMAIN,*) '************************************************************'
-    endif
-    write(IMAIN,*)
-
-    ! flushes file buffer for main output file (IMAIN)
-    call flush_IMAIN()
-
-    ! write time stamp file to give information about progression of simulation
-    write(outputname,"('/timestamp',i6.6)") it
-    open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown')
-    write(IOUT,*) 'Time step # ',it
-    write(IOUT,*) 'Time: ',sngl((it-1)*DT-t0),' seconds'
-    write(IOUT,*) 'Elapsed time in seconds = ',tCPU
-    write(IOUT,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
-    write(IOUT,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it)
-    if( ELASTIC_SIMULATION ) then
-      write(IOUT,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
-    endif
-    if( ACOUSTIC_SIMULATION ) then
-        write(IOUT,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all
-    endif
-    if( POROELASTIC_SIMULATION ) then
-        write(IOUT,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all
-        write(IOUT,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all
-    endif
-    ! adjoint simulations
-    if (SIMULATION_TYPE == 3) write(IOUT,*) &
-           'Max norm U (backward) in all slices = ',b_Usolidnorm_all
-    ! estimation
-    write(IOUT,*) 'Time steps done = ',it,' out of ',NSTEP
-    write(IOUT,*) 'Time steps remaining = ',NSTEP - it
-    write(IOUT,*) 'Estimated remaining time in seconds = ',t_remain
-    write(IOUT,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
-             ihours_remain,iminutes_remain,iseconds_remain
-    write(IOUT,*) 'Estimated total run time in seconds = ',t_total
-    write(IOUT,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
-             ihours_total,iminutes_total,iseconds_total
-    write(IOUT,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that'
-    close(IOUT)
-
-    ! check stability of the code, exit if unstable
-    ! negative values can occur with some compilers when the unstable value is greater
-    ! than the greatest possible floating-point number of the machine
-    if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0.0_CUSTOM_REAL &
-     .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0.0_CUSTOM_REAL &
-     .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0.0_CUSTOM_REAL &
-     .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0.0_CUSTOM_REAL) &
-        call exit_MPI(myrank,'forward simulation became unstable and blew up')
-    ! adjoint simulations
-    if(SIMULATION_TYPE == 3 .and. (b_Usolidnorm_all > STABILITY_THRESHOLD &
-      .or. b_Usolidnorm_all < 0.0)) &
-        call exit_MPI(myrank,'backward simulation became unstable and blew up')
-
-  endif ! myrank
-
-  end subroutine it_check_stability
-
-!=====================================================================
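
The max_all_cr() calls above are presumably thin wrappers around MPI_REDUCE with the MPI_MAX operation. The self-contained sketch below reproduces the same global stability check with plain MPI; the array size and the threshold value are illustrative only.

  ! sketch only: global maximum of a per-slice displacement norm via MPI_REDUCE,
  ! as done (through the max_all_cr wrapper) in the stability check above
  program max_norm_demo
    use mpi
    implicit none
    integer :: myrank, ier
    real :: Usolidnorm, Usolidnorm_all
    real, dimension(3,5) :: displ

    call MPI_INIT(ier)
    call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ier)

    call random_number(displ)               ! stands in for the local wavefield
    Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))

    ! global maximum over all slices, returned on rank 0
    call MPI_REDUCE(Usolidnorm,Usolidnorm_all,1,MPI_REAL,MPI_MAX,0,MPI_COMM_WORLD,ier)

    if( myrank == 0 ) then
      if( Usolidnorm_all > 1.e25 .or. Usolidnorm_all < 0.0 ) stop 'simulation became unstable'
      print*,'max norm of displacement in all slices = ',Usolidnorm_all
    endif

    call MPI_FINALIZE(ier)
  end program max_norm_demo
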
-
-  subroutine it_compute_total_energy()
-
-! computes kinetic, potential and total energy
-! in all the slices using an MPI reduction
-! and output that to an energy file
-
-  use specfem_par
-  use specfem_par_elastic
-  use specfem_par_acoustic
-  use pml_par
-
-  implicit none
-
-! local variables
-  integer :: i,j,k,l,ispec,iglob
-
-  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) :: duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) :: duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) :: duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) :: epsilon_xx,epsilon_yy,epsilon_zz,epsilon_xy,epsilon_xz,epsilon_yz,epsilon_yx,epsilon_zx,epsilon_zy
-  real(kind=CUSTOM_REAL) :: sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-  real(kind=CUSTOM_REAL) :: vx,vy,vz,pressure
-
-  real(kind=CUSTOM_REAL) :: hp1,hp2,hp3
-
-  real(kind=CUSTOM_REAL) :: lambdal,mul,lambdalplus2mul,rhol,cpl
-  real(kind=CUSTOM_REAL) :: kappal
-
-  real(kind=CUSTOM_REAL) :: integration_weight
-  double precision :: kinetic_energy,potential_energy
-  double precision :: kinetic_energy_glob,potential_energy_glob,total_energy_glob
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  kinetic_energy = 0.d0
-  potential_energy = 0.d0
-
-  if(ANISOTROPY .or. ATTENUATION) &
-    call exit_MPI(myrank,'calculation of total energy currently implemented only for media with no anisotropy and no attenuation')
-
-! loop over spectral elements
-  do ispec = 1,NSPEC_AB
-
-! if element is a CPML then do not compute energy in it, since it is non physical;
-! thus, we compute energy in the main domain only, without absorbing elements
-    if(PML_CONDITIONS) then
-      ! do not merge this second line with the first using an ".and." statement
-      ! because array is_CPML() is unallocated when PML_CONDITIONS is false
-      if(is_CPML(ispec)) cycle
-    endif
-
-    !---
-    !--- elastic spectral element
-    !---
-    if(ispec_is_elastic(ispec)) then
-
-     do k=1,NGLLZ
-       do j=1,NGLLY
-         do i=1,NGLLX
-           iglob = ibool(i,j,k,ispec)
-           dummyx_loc(i,j,k) = displ(1,iglob)
-           dummyy_loc(i,j,k) = displ(2,iglob)
-           dummyz_loc(i,j,k) = displ(3,iglob)
-         enddo
-       enddo
-     enddo
-
-     do k=1,NGLLZ
-        do j=1,NGLLY
-          do i=1,NGLLX
-
-          iglob = ibool(i,j,k,ispec)
-
-          tempx1(i,j,k) = 0._CUSTOM_REAL
-          tempx2(i,j,k) = 0._CUSTOM_REAL
-          tempx3(i,j,k) = 0._CUSTOM_REAL
-
-          tempy1(i,j,k) = 0._CUSTOM_REAL
-          tempy2(i,j,k) = 0._CUSTOM_REAL
-          tempy3(i,j,k) = 0._CUSTOM_REAL
-
-          tempz1(i,j,k) = 0._CUSTOM_REAL
-          tempz2(i,j,k) = 0._CUSTOM_REAL
-          tempz3(i,j,k) = 0._CUSTOM_REAL
-
-          do l=1,NGLLX
-            hp1 = hprime_xx(i,l)
-            tempx1(i,j,k) = tempx1(i,j,k) + dummyx_loc(l,j,k)*hp1
-            tempy1(i,j,k) = tempy1(i,j,k) + dummyy_loc(l,j,k)*hp1
-            tempz1(i,j,k) = tempz1(i,j,k) + dummyz_loc(l,j,k)*hp1
-
-            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
-            hp2 = hprime_yy(j,l)
-            tempx2(i,j,k) = tempx2(i,j,k) + dummyx_loc(i,l,k)*hp2
-            tempy2(i,j,k) = tempy2(i,j,k) + dummyy_loc(i,l,k)*hp2
-            tempz2(i,j,k) = tempz2(i,j,k) + dummyz_loc(i,l,k)*hp2
-
-            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
-            hp3 = hprime_zz(k,l)
-            tempx3(i,j,k) = tempx3(i,j,k) + dummyx_loc(i,j,l)*hp3
-            tempy3(i,j,k) = tempy3(i,j,k) + dummyy_loc(i,j,l)*hp3
-            tempz3(i,j,k) = tempz3(i,j,k) + dummyz_loc(i,j,l)*hp3
-          enddo
-
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              ! compute the strain
-              epsilon_xx = duxdxl
-              epsilon_yy = duydyl
-              epsilon_zz = duzdzl
-              epsilon_xy = 0.5 * duxdyl_plus_duydxl
-              epsilon_xz = 0.5 * duzdxl_plus_duxdzl
-              epsilon_yz = 0.5 * duzdyl_plus_duydzl
-
-              ! define symmetric components of epsilon
-              epsilon_yx = epsilon_xy
-              epsilon_zx = epsilon_xz
-              epsilon_zy = epsilon_yz
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-              rhol = rhostore(i,j,k,ispec)
-
-              ! isotropic case
-              lambdalplus2mul = kappal + FOUR_THIRDS * mul
-              lambdal = lambdalplus2mul - 2.*mul
-
-              ! compute stress sigma
-              sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-              sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-              sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-              sigma_xy = mul*duxdyl_plus_duydxl
-              sigma_xz = mul*duzdxl_plus_duxdzl
-              sigma_yz = mul*duzdyl_plus_duydzl
-
-              ! define symmetric components of sigma
-              sigma_yx = sigma_xy
-              sigma_zx = sigma_xz
-              sigma_zy = sigma_yz
-
-              integration_weight = wxgll(i)*wygll(j)*wzgll(k)*jacobianl
-
-              ! compute kinetic energy  1/2 rho ||v||^2
-              kinetic_energy = kinetic_energy + integration_weight * rhol*(veloc(1,iglob)**2 + &
-                                   veloc(2,iglob)**2 + veloc(3,iglob)**2) / 2.
-
-              ! compute potential energy 1/2 sigma_ij epsilon_ij
-              potential_energy = potential_energy + integration_weight * &
-                (sigma_xx*epsilon_xx + sigma_xy*epsilon_xy + sigma_xz*epsilon_xz + &
-                 sigma_yx*epsilon_yx + sigma_yy*epsilon_yy + sigma_yz*epsilon_yz + &
-                 sigma_zx*epsilon_zx + sigma_zy*epsilon_zy + sigma_zz*epsilon_zz) / 2.
-
-          enddo
-        enddo
-     enddo
-
-    !---
-    !--- acoustic spectral element
-    !---
-    else if(ispec_is_acoustic(ispec)) then
-
-      ! for the definition of potential energy in an acoustic fluid, see for instance
-      ! equation (23) of M. Maess et al., Journal of Sound and Vibration 296 (2006) 264-276
-
-      ! in case of an acoustic medium, a potential Chi of (density * displacement) is used as in Chaljub and Valette,
-      ! Geophysical Journal International, vol. 158, p. 131-141 (2004) and *NOT* a velocity potential
-      ! as in Komatitsch and Tromp, Geophysical Journal International, vol. 150, p. 303-318 (2002).
-      ! This permits acoustic-elastic coupling based on a non-iterative time scheme.
-      ! Displacement is then: u = grad(Chi) / rho
-      ! Velocity is then: v = grad(Chi_dot) / rho (Chi_dot being the time derivative of Chi)
-      ! and pressure is: p = - Chi_dot_dot  (Chi_dot_dot being the time second derivative of Chi).
-
-     do k=1,NGLLZ
-       do j=1,NGLLY
-         do i=1,NGLLX
-           iglob = ibool(i,j,k,ispec)
-           dummyx_loc(i,j,k) = potential_dot_acoustic(iglob)
-         enddo
-       enddo
-     enddo
-
-     do k=1,NGLLZ
-        do j=1,NGLLY
-          do i=1,NGLLX
-
-          iglob = ibool(i,j,k,ispec)
-
-          tempx1(i,j,k) = 0._CUSTOM_REAL
-          tempx2(i,j,k) = 0._CUSTOM_REAL
-          tempx3(i,j,k) = 0._CUSTOM_REAL
-
-          do l=1,NGLLX
-            hp1 = hprime_xx(i,l)
-            tempx1(i,j,k) = tempx1(i,j,k) + dummyx_loc(l,j,k)*hp1
-
-            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
-            hp2 = hprime_yy(j,l)
-            tempx2(i,j,k) = tempx2(i,j,k) + dummyx_loc(i,l,k)*hp2
-
-            !!! can merge these loops because NGLLX = NGLLY = NGLLZ
-            hp3 = hprime_zz(k,l)
-            tempx3(i,j,k) = tempx3(i,j,k) + dummyx_loc(i,j,l)*hp3
-          enddo
-
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              rhol = rhostore(i,j,k,ispec)
-              kappal = kappastore(i,j,k,ispec)
-              cpl = sqrt(kappal / rhol)
-
-              ! Velocity is v = grad(Chi_dot) / rho (Chi_dot being the time derivative of Chi)
-              vx = duxdxl / rhol
-              vy = duxdyl / rhol
-              vz = duxdzl / rhol
-
-              ! pressure is p = - Chi_dot_dot  (Chi_dot_dot being the time second derivative of Chi)
-              pressure = - potential_dot_dot_acoustic(iglob)
-
-              integration_weight = wxgll(i)*wygll(j)*wzgll(k)*jacobianl
-
-              ! compute kinetic energy  1/2 rho ||v||^2
-              kinetic_energy = kinetic_energy + integration_weight * rhol*(vx**2 + vy**2 + vz**2) / 2.
-
-              ! compute potential energy 1/2 sigma_ij epsilon_ij
-              potential_energy = potential_energy + integration_weight * pressure**2 / (2. * rhol * cpl**2)
-
-          enddo
-        enddo
-     enddo
-
-    else
-
-      call exit_MPI(myrank,'calculation of total energy implemented for acoustic and (visco)elastic elements only for now')
-
-    endif
-
-  enddo
-
-! compute the total using a reduction between all the processors
-  call sum_all_dp(kinetic_energy,kinetic_energy_glob)
-  call sum_all_dp(potential_energy,potential_energy_glob)
-  total_energy_glob = kinetic_energy_glob + potential_energy_glob
-
-! write the total to disk from the master
-  if(myrank == 0) write(IOUT_ENERGY,*) it,sngl(kinetic_energy_glob),sngl(potential_energy_glob),sngl(total_energy_glob)
-
-  end subroutine it_compute_total_energy
-
-!=====================================================================
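
Written out, the quantities accumulated by the subroutine above are, for elastic elements,

  E_kin = 1/2 \int_\Omega \rho \|v\|^2 \, d\Omega ,        E_pot = 1/2 \int_\Omega \sigma_{ij} \varepsilon_{ij} \, d\Omega ,

and for acoustic elements, where the potential \chi of (density x displacement) gives v = \nabla\dot\chi / \rho and p = -\ddot\chi,

  E_kin = 1/2 \int_\Omega \rho \|\nabla\dot\chi / \rho\|^2 \, d\Omega ,   E_pot = \int_\Omega p^2 / (2 \rho c_p^2) \, d\Omega ;

in the discrete sums, d\Omega is replaced by the GLL quadrature weights times the Jacobian.
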
-
-  subroutine it_update_displacement_scheme()
-
-! explicit Newmark time scheme with acoustic & elastic domains:
-! (see e.g. Hughes, 1987; Chaljub et al., 2003)
-!
-! chi(t+delta_t) = chi(t) + delta_t chi_dot(t) + 1/2 delta_t**2 chi_dot_dot(t)
-! chi_dot(t+delta_t) = chi_dot(t) + 1/2 delta_t chi_dot_dot(t) + 1/2 delta_t chi_dot_dot(t+delta_t)
-! chi_dot_dot(t+delta_t) = 1/M_acoustic( -K_acoustic chi(t+delta) + B_acoustic u(t+delta_t) + f(t+delta_t) )
-!
-! u(t+delta_t) = u(t) + delta_t  v(t) + 1/2  delta_t**2 a(t)
-! v(t+delta_t) = v(t) + 1/2 delta_t a(t) + 1/2 delta_t a(t+delta_t)
-! a(t+delta_t) = 1/M_elastic ( -K_elastic u(t+delta) + B_elastic chi_dot_dot(t+delta_t) + f( t+delta_t) )
-!
-! where
-!   chi, chi_dot, chi_dot_dot are acoustic (fluid) potentials ( dotted with respect to time)
-!   u, v, a are displacement,velocity & acceleration
-!   M is mass matrix, K stiffness matrix and B boundary term for acoustic/elastic domains
-!   f denotes a source term (acoustic/elastic)
-!
-! note that this stage calculates the predictor terms
-!
-!   for
-!   potential chi_dot(t+delta) requires + 1/2 delta_t chi_dot_dot(t+delta_t)
-!                                   at a later stage (corrector) once where chi_dot_dot(t+delta) is calculated
-!   and similar,
-!   velocity v(t+delta_t) requires  + 1/2 delta_t a(t+delta_t)
-!                                   at a later stage once where a(t+delta) is calculated
-! also:
-!   boundary term B_elastic requires chi_dot_dot(t+delta)
-!                                   thus chi_dot_dot has to be updated first before the elastic boundary term is considered
-
-  use specfem_par
-  use specfem_par_acoustic
-  use specfem_par_elastic
-  use specfem_par_poroelastic
-  use pml_par
-
-  implicit none
-
-! updates acoustic potentials
-  if( ACOUSTIC_SIMULATION ) then
-
-    if(.NOT. GPU_MODE) then
-      ! on CPU
-      potential_acoustic(:) = potential_acoustic(:) &
-                            + deltat * potential_dot_acoustic(:) &
-                            + deltatsqover2 * potential_dot_dot_acoustic(:)
-      potential_dot_acoustic(:) = potential_dot_acoustic(:) &
-                                + deltatover2 * potential_dot_dot_acoustic(:)
-      potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL
-    else
-      ! on GPU
-      call it_update_displacement_ac_cuda(Mesh_pointer, NGLOB_AB, &
-                                          deltat, deltatsqover2, deltatover2, &
-                                          b_deltat, b_deltatsqover2, b_deltatover2)
-    endif
-
-  endif ! ACOUSTIC_SIMULATION
-
-! updates elastic displacement and velocity
-  if( ELASTIC_SIMULATION ) then
-
-    if(.NOT. GPU_MODE) then
-      ! on CPU
-      displ(:,:) = displ(:,:) + deltat*veloc(:,:) + deltatsqover2*accel(:,:)
-      veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
-      if( SIMULATION_TYPE /= 1 ) accel_adj_coupling(:,:) = accel(:,:)
-      accel(:,:) = 0._CUSTOM_REAL
-    else
-      ! on GPU
-      ! Includes SIM_TYPE 1 & 3 (for noise tomography)
-      call it_update_displacement_cuda(Mesh_pointer, size(displ), deltat, deltatsqover2,&
-                                       deltatover2, b_deltat, b_deltatsqover2, b_deltatover2)
-    endif
-  endif
-
-! updates poroelastic displacements and velocities
-  if( POROELASTIC_SIMULATION ) then
-    ! solid phase
-    displs_poroelastic(:,:) = displs_poroelastic(:,:) + deltat*velocs_poroelastic(:,:) + &
-                              deltatsqover2*accels_poroelastic(:,:)
-    velocs_poroelastic(:,:) = velocs_poroelastic(:,:) + deltatover2*accels_poroelastic(:,:)
-    accels_poroelastic(:,:) = 0._CUSTOM_REAL
-
-    ! fluid phase
-    displw_poroelastic(:,:) = displw_poroelastic(:,:) + deltat*velocw_poroelastic(:,:) + &
-                              deltatsqover2*accelw_poroelastic(:,:)
-    velocw_poroelastic(:,:) = velocw_poroelastic(:,:) + deltatover2*accelw_poroelastic(:,:)
-    accelw_poroelastic(:,:) = 0._CUSTOM_REAL
-  endif
-
-! adjoint simulations
-  if (SIMULATION_TYPE == 3 .and. .NOT. GPU_MODE) then
-    ! acoustic backward fields
-    if( ACOUSTIC_SIMULATION ) then
-      if(PML_CONDITIONS)then
-        if(nglob_interface_PML_acoustic > 0)then
-          call read_potential_on_pml_interface(b_potential_dot_dot_acoustic,b_potential_dot_acoustic,b_potential_acoustic,&
-                                               nglob_interface_PML_acoustic,b_PML_potential,b_reclen_PML_potential)
-        endif
-      endif
-      b_potential_acoustic(:) = b_potential_acoustic(:) &
-                              + b_deltat * b_potential_dot_acoustic(:) &
-                              + b_deltatsqover2 * b_potential_dot_dot_acoustic(:)
-      b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) &
-                                  + b_deltatover2 * b_potential_dot_dot_acoustic(:)
-      b_potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL
-    endif
-
-    ! elastic backward fields
-    if( ELASTIC_SIMULATION ) then
-      if(PML_CONDITIONS)then
-        if(nglob_interface_PML_elastic > 0)then
-          call read_field_on_pml_interface(b_accel,b_veloc,b_displ,nglob_interface_PML_elastic,&
-                                           b_PML_field,b_reclen_PML_field)
-        endif
-      endif
-      b_displ(:,:) = b_displ(:,:) + b_deltat*b_veloc(:,:) + b_deltatsqover2*b_accel(:,:)
-      b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
-      b_accel(:,:) = 0._CUSTOM_REAL
-    endif
-    ! poroelastic backward fields
-    if( POROELASTIC_SIMULATION ) then
-    ! solid phase
-    b_displs_poroelastic(:,:) = b_displs_poroelastic(:,:) + b_deltat*b_velocs_poroelastic(:,:) + &
-                              b_deltatsqover2*b_accels_poroelastic(:,:)
-    b_velocs_poroelastic(:,:) = b_velocs_poroelastic(:,:) + b_deltatover2*b_accels_poroelastic(:,:)
-    b_accels_poroelastic(:,:) = 0._CUSTOM_REAL
-
-    ! fluid phase
-    b_displw_poroelastic(:,:) = b_displw_poroelastic(:,:) + b_deltat*b_velocw_poroelastic(:,:) + &
-                              b_deltatsqover2*b_accelw_poroelastic(:,:)
-    b_velocw_poroelastic(:,:) = b_velocw_poroelastic(:,:) + b_deltatover2*b_accelw_poroelastic(:,:)
-    b_accelw_poroelastic(:,:) = 0._CUSTOM_REAL
-    endif
-  endif
-
-! adjoint simulations: moho kernel
-  if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
-    ispec2D_moho_top = 0
-    ispec2D_moho_bot = 0
-  endif
-
-
-  end subroutine it_update_displacement_scheme
-
-!=====================================================================
-
   subroutine it_read_forward_arrays()
 
   use specfem_par
@@ -886,7 +233,7 @@
 !          that is, index it for adjoint sources will match index NSTEP - 1 for backward/reconstructed wavefields
 
   ! reads in wavefields
-  open(unit=27,file=trim(prname)//'save_forward_arrays.bin',status='old',&
+  open(unit=IIN,file=trim(prname)//'save_forward_arrays.bin',status='old',&
         action='read',form='unformatted',iostat=ier)
   if( ier /= 0 ) then
     print*,'error: opening save_forward_arrays'
@@ -895,65 +242,65 @@
   endif
 
   if( ACOUSTIC_SIMULATION ) then
-    read(27) b_potential_acoustic
-    read(27) b_potential_dot_acoustic
-    read(27) b_potential_dot_dot_acoustic
+    read(IIN) b_potential_acoustic
+    read(IIN) b_potential_dot_acoustic
+    read(IIN) b_potential_dot_dot_acoustic
 
     ! transfers fields onto GPU
-    if(GPU_MODE) &
+    if(GPU_MODE) then
       call transfer_b_fields_ac_to_device(NGLOB_AB,b_potential_acoustic, &
-                          b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
+                                          b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
+    endif
   endif
 
   ! elastic wavefields
   if( ELASTIC_SIMULATION ) then
-    read(27) b_displ
-    read(27) b_veloc
-    read(27) b_accel
+    read(IIN) b_displ
+    read(IIN) b_veloc
+    read(IIN) b_accel
 
     ! puts elastic wavefield to GPU
-    if(GPU_MODE) &
+    if(GPU_MODE) then
       call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer)
+    endif
 
     ! memory variables if attenuation
     if( ATTENUATION ) then
-      if(FULL_ATTENUATION_SOLID) read(27) b_R_trace
-      read(27) b_R_xx
-      read(27) b_R_yy
-      read(27) b_R_xy
-      read(27) b_R_xz
-      read(27) b_R_yz
-      if(FULL_ATTENUATION_SOLID) read(27) b_epsilondev_trace
-      read(27) b_epsilondev_xx
-      read(27) b_epsilondev_yy
-      read(27) b_epsilondev_xy
-      read(27) b_epsilondev_xz
-      read(27) b_epsilondev_yz
+      if(FULL_ATTENUATION_SOLID) read(IIN) b_R_trace
+      read(IIN) b_R_xx
+      read(IIN) b_R_yy
+      read(IIN) b_R_xy
+      read(IIN) b_R_xz
+      read(IIN) b_R_yz
+      if(FULL_ATTENUATION_SOLID) read(IIN) b_epsilondev_trace
+      read(IIN) b_epsilondev_xx
+      read(IIN) b_epsilondev_yy
+      read(IIN) b_epsilondev_xy
+      read(IIN) b_epsilondev_xz
+      read(IIN) b_epsilondev_yz
 
       ! puts elastic attenuation arrays to GPU
-      if(GPU_MODE) &
+      if(GPU_MODE) then
           call transfer_b_fields_att_to_device(Mesh_pointer, &
-                    b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), &
-!!!                 b_R_trace,b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), &  ! please change the above line with this
+                    b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
+                    size(b_R_xx), &
                     b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
-!!!                 b_epsilondev_trace,b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
-!!!                 ! please change the above line with this
                     size(b_epsilondev_xx))
-    endif
-
+      endif
+    endif ! ATTENUATION
   endif
 
   ! poroelastic wavefields
   if( POROELASTIC_SIMULATION ) then
-    read(27) b_displs_poroelastic
-    read(27) b_velocs_poroelastic
-    read(27) b_accels_poroelastic
-    read(27) b_displw_poroelastic
-    read(27) b_velocw_poroelastic
-    read(27) b_accelw_poroelastic
+    read(IIN) b_displs_poroelastic
+    read(IIN) b_velocs_poroelastic
+    read(IIN) b_accels_poroelastic
+    read(IIN) b_displw_poroelastic
+    read(IIN) b_velocw_poroelastic
+    read(IIN) b_accelw_poroelastic
   endif
 
-  close(27)
+  close(IIN)
 
   end subroutine it_read_forward_arrays
 
@@ -969,6 +316,8 @@
 
   implicit none
 
+  integer :: ier
+
   if( it > 1 .and. it < NSTEP) then
     ! adjoint simulations
 
@@ -977,35 +326,43 @@
!       reconstructed wavefield b_displ() at it corresponds to time (NSTEP-it-1)*DT - t0
 !       we read in the reconstructed wavefield at the end of the time iteration loop, i.e. after the Newmark scheme,
 !       thus, indexing is NSTEP-it (rather than something like NSTEP-(it-1) )
+
     if (SIMULATION_TYPE == 3 .and. mod(NSTEP-it,NSTEP_Q_SAVE) == 0) then
       ! reads files content
       write(outputname,"('save_Q_arrays_',i6.6,'.bin')") NSTEP-it
-      open(unit=27,file=trim(prname_Q)//trim(outputname),status='old',&
-            action='read',form='unformatted')
+      open(unit=IIN,file=trim(prname_Q)//trim(outputname),status='old',&
+            action='read',form='unformatted',iostat=ier)
+      if( ier /= 0 ) then
+        print*,'error: opening save_Q_arrays'
+        print*,'path: ',trim(prname_Q)//trim(outputname)
+        call exit_mpi(myrank,'error opening file save_Q_arrays_***.bin for reading')
+      endif
+
       if( ELASTIC_SIMULATION ) then
         ! reads arrays from disk files
-        read(27) b_displ
-        read(27) b_veloc
-        read(27) b_accel
+        read(IIN) b_displ
+        read(IIN) b_veloc
+        read(IIN) b_accel
 
         ! puts elastic fields onto GPU
         if(GPU_MODE) then
           ! wavefields
           call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
         endif
-        if(FULL_ATTENUATION_SOLID) read(27) b_R_trace
-        read(27) b_R_xx
-        read(27) b_R_yy
-        read(27) b_R_xy
-        read(27) b_R_xz
-        read(27) b_R_yz
-        if(FULL_ATTENUATION_SOLID) read(27) b_epsilondev_trace
-        read(27) b_epsilondev_xx
-        read(27) b_epsilondev_yy
-        read(27) b_epsilondev_xy
-        read(27) b_epsilondev_xz
-        read(27) b_epsilondev_yz
 
+        if(FULL_ATTENUATION_SOLID) read(IIN) b_R_trace
+        read(IIN) b_R_xx
+        read(IIN) b_R_yy
+        read(IIN) b_R_xy
+        read(IIN) b_R_xz
+        read(IIN) b_R_yz
+        if(FULL_ATTENUATION_SOLID) read(IIN) b_epsilondev_trace
+        read(IIN) b_epsilondev_xx
+        read(IIN) b_epsilondev_yy
+        read(IIN) b_epsilondev_xy
+        read(IIN) b_epsilondev_xz
+        read(IIN) b_epsilondev_yz
+
         ! puts elastic fields onto GPU
         if(GPU_MODE) then
           ! attenuation arrays
@@ -1020,9 +377,9 @@
 
       if( ACOUSTIC_SIMULATION ) then
         ! reads arrays from disk files
-        read(27) b_potential_acoustic
-        read(27) b_potential_dot_acoustic
-        read(27) b_potential_dot_dot_acoustic
+        read(IIN) b_potential_acoustic
+        read(IIN) b_potential_dot_acoustic
+        read(IIN) b_potential_dot_dot_acoustic
 
         ! puts acoustic fields onto GPU
         if(GPU_MODE) &
@@ -1030,12 +387,19 @@
                               b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
 
       endif
-      close(27)
+      close(IIN)
+
     else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. mod(it,NSTEP_Q_SAVE) == 0) then
       ! stores files content
       write(outputname,"('save_Q_arrays_',i6.6,'.bin')") it
-      open(unit=27,file=trim(prname_Q)//trim(outputname),status='unknown',&
-           action='write',form='unformatted')
+      open(unit=IOUT,file=trim(prname_Q)//trim(outputname),status='unknown',&
+           action='write',form='unformatted',iostat=ier)
+      if( ier /= 0 ) then
+        print*,'error: opening save_Q_arrays'
+        print*,'path: ',trim(prname_Q)//trim(outputname)
+        call exit_mpi(myrank,'error opening file save_Q_arrays_***.bin for writing')
+      endif
+
       if( ELASTIC_SIMULATION ) then
         ! gets elastic fields from GPU onto CPU
         if(GPU_MODE) then
@@ -1043,9 +407,9 @@
         endif
 
         ! writes to disk file
-        write(27) displ
-        write(27) veloc
-        write(27) accel
+        write(IOUT) displ
+        write(IOUT) veloc
+        write(IOUT) accel
 
         if(GPU_MODE) then
           ! attenuation arrays
@@ -1057,31 +421,34 @@
                      size(epsilondev_xx))
         endif
 
-        if(FULL_ATTENUATION_SOLID) write(27) R_trace
-        write(27) R_xx
-        write(27) R_yy
-        write(27) R_xy
-        write(27) R_xz
-        write(27) R_yz
-        if(FULL_ATTENUATION_SOLID) write(27) epsilondev_trace
-        write(27) epsilondev_xx
-        write(27) epsilondev_yy
-        write(27) epsilondev_xy
-        write(27) epsilondev_xz
-        write(27) epsilondev_yz
+        if(FULL_ATTENUATION_SOLID) write(IOUT) R_trace
+        write(IOUT) R_xx
+        write(IOUT) R_yy
+        write(IOUT) R_xy
+        write(IOUT) R_xz
+        write(IOUT) R_yz
+        if(FULL_ATTENUATION_SOLID) write(IOUT) epsilondev_trace
+        write(IOUT) epsilondev_xx
+        write(IOUT) epsilondev_yy
+        write(IOUT) epsilondev_xy
+        write(IOUT) epsilondev_xz
+        write(IOUT) epsilondev_yz
       endif
+
       if( ACOUSTIC_SIMULATION ) then
-       ! gets acoustic fields from GPU onto CPU
-        if(GPU_MODE) &
+        ! gets acoustic fields from GPU onto CPU
+        if(GPU_MODE) then
           call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
-                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+                                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+        endif
 
         ! writes to disk file
-        write(27) potential_acoustic
-        write(27) potential_dot_acoustic
-        write(27) potential_dot_dot_acoustic
+        write(IOUT) potential_acoustic
+        write(IOUT) potential_dot_acoustic
+        write(IOUT) potential_dot_dot_acoustic
       endif
-      close(27)
+      close(IOUT)
+
     endif ! SIMULATION_TYPE
   endif ! it
 
@@ -1091,27 +458,28 @@
 
   subroutine it_print_elapsed_time()
 
-    use specfem_par
-    use specfem_par_elastic
-    use specfem_par_acoustic
-    implicit none
+  use specfem_par
+  use specfem_par_elastic
+  use specfem_par_acoustic
 
-    ! local parameters
-    double precision :: tCPU
-    integer :: ihours,iminutes,iseconds,int_tCPU
+  implicit none
 
-    if(myrank == 0) then
-       ! elapsed time since beginning of the simulation
-       tCPU = wtime() - time_start
-       int_tCPU = int(tCPU)
-       ihours = int_tCPU / 3600
-       iminutes = (int_tCPU - 3600*ihours) / 60
-       iseconds = int_tCPU - 3600*ihours - 60*iminutes
-       write(IMAIN,*) 'Time-Loop Complete. Timing info:'
-       write(IMAIN,*) 'Total elapsed time in seconds = ',tCPU
-       write(IMAIN,"(' Total elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
-    endif
+  ! local parameters
+  double precision :: tCPU
+  integer :: ihours,iminutes,iseconds,int_tCPU
 
+  if( myrank == 0 ) then
+    ! elapsed time since beginning of the simulation
+    tCPU = wtime() - time_start
+    int_tCPU = int(tCPU)
+    ihours = int_tCPU / 3600
+    iminutes = (int_tCPU - 3600*ihours) / 60
+    iseconds = int_tCPU - 3600*ihours - 60*iminutes
+    write(IMAIN,*) 'Time-Loop Complete. Timing info:'
+    write(IMAIN,*) 'Total elapsed time in seconds = ',tCPU
+    write(IMAIN,"(' Total elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
+  endif
+
   end subroutine it_print_elapsed_time
 
 !=====================================================================
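
The hh:mm:ss split used in the timing output is plain integer division; a short self-check with an illustrative value:

  ! sketch only: 3723 s -> 1 h 02 m 03 s, as in it_print_elapsed_time()
  program hhmmss_demo
    implicit none
    integer :: int_tCPU, ihours, iminutes, iseconds
    int_tCPU = 3723
    ihours   = int_tCPU / 3600
    iminutes = (int_tCPU - 3600*ihours) / 60
    iseconds = int_tCPU - 3600*ihours - 60*iminutes
    write(*,"(i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds   ! prints "   1 h 02 m 03 s"
  end program hhmmss_demo
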
@@ -1127,16 +495,17 @@
   implicit none
 
   ! to store forward wave fields
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+  if( SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
 
     ! acoustic potentials
     if( ACOUSTIC_SIMULATION ) &
       call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
-                            potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+                                          potential_dot_acoustic, potential_dot_dot_acoustic, &
+                                          Mesh_pointer)
 
     ! elastic wavefield
     if( ELASTIC_SIMULATION ) then
-      call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
+      call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc,accel,Mesh_pointer)
 
       if (ATTENUATION) &
         call transfer_fields_att_from_device(Mesh_pointer, &
@@ -1147,7 +516,7 @@
                     size(epsilondev_xx))
 
     endif
-  else if (SIMULATION_TYPE == 3) then
+  else if( SIMULATION_TYPE == 3 ) then
 
     ! to store kernels
     ! acoustic domains
@@ -1166,7 +535,7 @@
       !call transfer_b_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
 
       ! elastic kernels
-      call transfer_kernels_el_to_host(Mesh_pointer,rho_kl,mu_kl,kappa_kl,NSPEC_AB)
+      call transfer_kernels_el_to_host(Mesh_pointer,rho_kl,mu_kl,kappa_kl,cijkl_kl,NSPEC_AB)
     endif
 
     ! specific noise strength kernel
@@ -1175,7 +544,7 @@
     endif
 
     ! approximative hessian for preconditioning kernels
-    if ( APPROXIMATE_HESS_KL ) then
+    if( APPROXIMATE_HESS_KL ) then
       if( ELASTIC_SIMULATION ) call transfer_kernels_hess_el_tohost(Mesh_pointer,hess_kl,NSPEC_AB)
       if( ACOUSTIC_SIMULATION ) call transfer_kernels_hess_ac_tohost(Mesh_pointer,hess_ac_kl,NSPEC_AB)
     endif
@@ -1184,7 +553,6 @@
 
   ! frees allocated memory on GPU
   call prepare_cleanup_device(Mesh_pointer, &
-                              SAVE_FORWARD, &
                               ACOUSTIC_SIMULATION,ELASTIC_SIMULATION, &
                               STACEY_ABSORBING_CONDITIONS,NOISE_TOMOGRAPHY,COMPUTE_AND_STORE_STRAIN, &
                               ATTENUATION,ANISOTROPY,APPROXIMATE_OCEAN_LOAD, &

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -250,12 +250,12 @@
       deallocate(rmassz_acoustic)
     endif
 
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_acoustic,&
+    call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_acoustic,&
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
                         my_neighbours_ext_mesh)
 
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_acoustic_interface, &
+    call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_acoustic_interface, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
                         my_neighbours_ext_mesh)
@@ -284,15 +284,15 @@
     deallocate(rmass)
 
     ! assemble mass matrix
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmassx, &
+    call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmassx, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmassy, &
+    call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmassy, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmassz, &
+    call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmassz, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
@@ -307,7 +307,7 @@
 
     if(PML_CONDITIONS)then
       if(ACOUSTIC_SIMULATION)then
-        call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_elastic_interface, &
+        call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_elastic_interface, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
@@ -318,7 +318,7 @@
 
     ! ocean load
     if(APPROXIMATE_OCEAN_LOAD ) then
-      call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_ocean_load, &
+      call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_ocean_load, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
@@ -328,12 +328,12 @@
  endif
 
   if(POROELASTIC_SIMULATION) then
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_solid_poroelastic, &
+    call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_solid_poroelastic, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
 
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_fluid_poroelastic, &
+    call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_fluid_poroelastic, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)
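
assemble_MPI_scalar_blocking, the renamed assemble_MPI_scalar_ext_mesh, sums the mass-matrix contributions that partitions share along their interfaces. Schematically, and not the actual implementation, the blocking variant packs the shared points, exchanges them with each neighbour and adds the received values back in. A minimal two-rank sketch with made-up interface data:

  ! sketch only: blocking assembly of a shared scalar (e.g. a mass matrix) across
  ! one MPI interface; the real routine loops over num_interfaces_ext_mesh neighbours
  ! and uses the nibool/ibool interface tables built by the mesher
  program assemble_scalar_demo
    use mpi
    implicit none
    integer, parameter :: nglob = 10, nibool = 3
    integer, dimension(nibool), parameter :: ibool_interface = (/ 2, 5, 7 /)  ! made-up shared points
    real, dimension(nglob) :: rmass
    real, dimension(nibool) :: buffer_send, buffer_recv
    integer :: myrank, nproc, ineighbour, i, ier

    call MPI_INIT(ier)
    call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ier)
    call MPI_COMM_SIZE(MPI_COMM_WORLD,nproc,ier)
    if( nproc /= 2 ) then
      if( myrank == 0 ) print*,'run this sketch on exactly 2 MPI processes'
      call MPI_FINALIZE(ier); stop
    endif
    ineighbour = 1 - myrank                 ! the other rank plays the neighbour

    rmass(:) = real(myrank + 1)             ! stands in for the local mass-matrix contributions

    ! pack the points lying on the shared interface
    do i = 1,nibool
      buffer_send(i) = rmass(ibool_interface(i))
    enddo

    ! blocking exchange with the neighbour
    call MPI_SENDRECV(buffer_send,nibool,MPI_REAL,ineighbour,0, &
                      buffer_recv,nibool,MPI_REAL,ineighbour,0, &
                      MPI_COMM_WORLD,MPI_STATUS_IGNORE,ier)

    ! add the neighbour's contributions to the shared points
    do i = 1,nibool
      rmass(ibool_interface(i)) = rmass(ibool_interface(i)) + buffer_recv(i)
    enddo

    if( myrank == 0 ) print*,'assembled value at shared point 5 = ',rmass(5)   ! 1.0 + 2.0 = 3.0
    call MPI_FINALIZE(ier)
  end program assemble_scalar_demo
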
@@ -852,12 +852,17 @@
     ! elastic domain
     if( ELASTIC_SIMULATION ) then
       rho_kl(:,:,:,:)   = 0._CUSTOM_REAL
-      mu_kl(:,:,:,:)    = 0._CUSTOM_REAL
-      kappa_kl(:,:,:,:) = 0._CUSTOM_REAL
-      cijkl_kl(:,:,:,:,:) = 0._CUSTOM_REAL
 
-      if ( APPROXIMATE_HESS_KL ) &
+      if (ANISOTROPIC_KL) then
+        cijkl_kl(:,:,:,:,:) = 0._CUSTOM_REAL
+      else
+        mu_kl(:,:,:,:)    = 0._CUSTOM_REAL
+        kappa_kl(:,:,:,:) = 0._CUSTOM_REAL
+      endif
+
+      if ( APPROXIMATE_HESS_KL ) then
         hess_kl(:,:,:,:)   = 0._CUSTOM_REAL
+      endif
 
       ! reconstructed/backward elastic wavefields
       b_displ = 0._CUSTOM_REAL
@@ -961,7 +966,7 @@
 
         ! total file size
         filesize = b_reclen_field
-        filesize = filesize*NSTEP
+        filesize = filesize * NSTEP
 
         if (SIMULATION_TYPE == 3) then
           ! opens existing files
@@ -1108,8 +1113,9 @@
         endif
       endif
     else
+      ! num_abs_boundary_faces is zero
       ! needs dummy array
-      b_num_abs_boundary_faces = 1
+      b_num_abs_boundary_faces = 0
       if( ELASTIC_SIMULATION ) then
         allocate(b_absorb_field(NDIM,NGLLSQUARE,b_num_abs_boundary_faces),stat=ier)
         if( ier /= 0 ) stop 'error allocating array b_absorb_field'
@@ -1128,7 +1134,7 @@
     endif
   else ! STACEY_ABSORBING_CONDITIONS
     ! needs dummy array
-    b_num_abs_boundary_faces = 1
+    b_num_abs_boundary_faces = 0
     if( ELASTIC_SIMULATION ) then
       allocate(b_absorb_field(NDIM,NGLLSQUARE,b_num_abs_boundary_faces),stat=ier)
       if( ier /= 0 ) stop 'error allocating array b_absorb_field'
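(Side note on the dummy-size change above: a zero-sized allocatable is legal Fortran and occupies no storage, so shrinking the dummy absorbing-boundary buffers from extent 1 to extent 0 is safe as long as nothing indexes into them, which should hold here since any loop over b_num_abs_boundary_faces then runs zero times. A minimal, purely illustrative sketch; the literal extents 3 and 25 merely stand in for NDIM and NGLLSQUARE:)

  program zero_size_dummy
    implicit none
    real, allocatable :: b_absorb_dummy(:,:,:)   ! stands in for b_absorb_field
    integer :: ier
    ! zero boundary faces -> a zero-sized, but perfectly valid, allocation
    allocate(b_absorb_dummy(3,25,0),stat=ier)
    print *, 'stat =', ier, '  size =', size(b_absorb_dummy)   ! prints: stat = 0, size = 0
    deallocate(b_absorb_dummy)
  end program zero_size_dummy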
@@ -1246,100 +1252,94 @@
   ! prepares general fields on GPU
  !! JC JC here we will need to add GPU support for the new C-PML routines
   call prepare_constants_device(Mesh_pointer, &
-                                  NGLLX, NSPEC_AB, NGLOB_AB, &
-                                  xix, xiy, xiz, etax,etay,etaz, gammax, gammay, gammaz, &
-                                  kappastore, mustore,ibool, &
-                                  num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
-                                  nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
-                                  hprime_xx, &
-                                  hprimewgll_xx, &
-                                  wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
-                                  STACEY_ABSORBING_CONDITIONS, &
-                                  abs_boundary_ispec, abs_boundary_ijk, &
-                                  abs_boundary_normal, &
-                                  abs_boundary_jacobian2Dw, &
-                                  num_abs_boundary_faces, &
-                                  ispec_is_inner, &
-                                  NSOURCES, nsources_local, &
-                                  sourcearrays, islice_selected_source, ispec_selected_source, &
-                                  number_receiver_global, ispec_selected_rec, &
-                                  nrec, nrec_local, &
-                                  SIMULATION_TYPE, &
-                                  USE_MESH_COLORING_GPU, &
-                                  nspec_acoustic,nspec_elastic,&
-                                  my_neighbours_ext_mesh,&
-                                  request_send_vector_ext_mesh,&
-                                  request_recv_vector_ext_mesh,&
-                                  buffer_recv_vector_ext_mesh)
+                                NGLLX, NSPEC_AB, NGLOB_AB, &
+                                xix, xiy, xiz, etax,etay,etaz, gammax, gammay, gammaz, &
+                                kappastore, mustore, &
+                                ibool, &
+                                num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+                                nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
+                                hprime_xx,hprimewgll_xx, &
+                                wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
+                                STACEY_ABSORBING_CONDITIONS, &
+                                abs_boundary_ispec, abs_boundary_ijk, &
+                                abs_boundary_normal, &
+                                abs_boundary_jacobian2Dw, &
+                                num_abs_boundary_faces, &
+                                ispec_is_inner, &
+                                NSOURCES, nsources_local, &
+                                sourcearrays, islice_selected_source, ispec_selected_source, &
+                                number_receiver_global, ispec_selected_rec, &
+                                nrec, nrec_local, &
+                                SIMULATION_TYPE, &
+                                USE_MESH_COLORING_GPU, &
+                                nspec_acoustic,nspec_elastic,&
+                                myrank,SAVE_FORWARD)
 
 
   ! prepares fields on GPU for acoustic simulations
   if( ACOUSTIC_SIMULATION ) then
-    call prepare_fields_acoustic_device(Mesh_pointer,rmass_acoustic,rhostore,kappastore, &
-                                  num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
-                                  ispec_is_acoustic, &
-                                  NOISE_TOMOGRAPHY,num_free_surface_faces, &
-                                  free_surface_ispec,free_surface_ijk, &
-                                  b_reclen_potential,b_absorb_potential, &
-                                  ELASTIC_SIMULATION, num_coupling_ac_el_faces, &
-                                  coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                                  coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
-                                  num_colors_outer_acoustic,num_colors_inner_acoustic, &
-                                  num_elem_colors_acoustic)
+    call prepare_fields_acoustic_device(Mesh_pointer, &
+                                rmass_acoustic,rhostore,kappastore, &
+                                num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+                                ispec_is_acoustic, &
+                                NOISE_TOMOGRAPHY,num_free_surface_faces, &
+                                free_surface_ispec,free_surface_ijk, &
+                                b_reclen_potential,b_absorb_potential, &
+                                ELASTIC_SIMULATION, num_coupling_ac_el_faces, &
+                                coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                                coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
+                                num_colors_outer_acoustic,num_colors_inner_acoustic, &
+                                num_elem_colors_acoustic)
 
     if( SIMULATION_TYPE == 3 ) &
       call prepare_fields_acoustic_adj_dev(Mesh_pointer, &
-                                           APPROXIMATE_HESS_KL)
+                                APPROXIMATE_HESS_KL)
 
   endif
 
   ! prepares fields on GPU for elastic simulations
  !! JC JC here we will need to add GPU support for the new C-PML routines
   if( ELASTIC_SIMULATION ) then
-    call prepare_fields_elastic_device(Mesh_pointer, NDIM*NGLOB_AB, &
-                                  rmassx,rmassy,rmassz, &
-                                  rho_vp,rho_vs, &
-                                  num_phase_ispec_elastic,phase_ispec_inner_elastic, &
-                                  ispec_is_elastic, &
-                                  b_absorb_field,b_reclen_field, &
-                                  SAVE_FORWARD, &
-                                  COMPUTE_AND_STORE_STRAIN, &
-                                  epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-!!!                               epsilondev_trace,epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                  epsilondev_xz,epsilondev_yz, &
-                                  ATTENUATION, &
-                                  size(R_xx), &
-                                  R_xx,R_yy,R_xy,R_xz,R_yz, &
-!!!                               R_trace,R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                  one_minus_sum_beta,factor_common, &
-!!!                                  one_minus_sum_beta_kappa,factor_commonkappa, &
-                                  alphaval,betaval,gammaval, &
-                                  APPROXIMATE_OCEAN_LOAD,rmass_ocean_load, &
-                                  NOISE_TOMOGRAPHY, &
-                                  free_surface_normal,free_surface_ispec,free_surface_ijk, &
-                                  num_free_surface_faces, &
-                                  ACOUSTIC_SIMULATION, &
-                                  num_colors_outer_elastic,num_colors_inner_elastic, &
-                                  num_elem_colors_elastic, &
-                                  ANISOTROPY, &
-                                  c11store,c12store,c13store,c14store,c15store,c16store, &
-                                  c22store,c23store,c24store,c25store,c26store, &
-                                  c33store,c34store,c35store,c36store, &
-                                  c44store,c45store,c46store,c55store,c56store,c66store)
+    call prepare_fields_elastic_device(Mesh_pointer, &
+                                rmassx,rmassy,rmassz, &
+                                rho_vp,rho_vs, &
+                                num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+                                ispec_is_elastic, &
+                                b_absorb_field,b_reclen_field, &
+                                COMPUTE_AND_STORE_STRAIN, &
+                                epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                epsilondev_xz,epsilondev_yz, &
+                                ATTENUATION, &
+                                size(R_xx), &
+                                R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                one_minus_sum_beta,factor_common, &
+                                alphaval,betaval,gammaval, &
+                                APPROXIMATE_OCEAN_LOAD,rmass_ocean_load, &
+                                NOISE_TOMOGRAPHY, &
+                                free_surface_normal,free_surface_ispec,free_surface_ijk, &
+                                num_free_surface_faces, &
+                                ACOUSTIC_SIMULATION, &
+                                num_colors_outer_elastic,num_colors_inner_elastic, &
+                                num_elem_colors_elastic, &
+                                ANISOTROPY, &
+                                c11store,c12store,c13store,c14store,c15store,c16store, &
+                                c22store,c23store,c24store,c25store,c26store, &
+                                c33store,c34store,c35store,c36store, &
+                                c44store,c45store,c46store,c55store,c56store,c66store)
 
     if( SIMULATION_TYPE == 3 ) &
-      call prepare_fields_elastic_adj_dev(Mesh_pointer, NDIM*NGLOB_AB, &
-                                  COMPUTE_AND_STORE_STRAIN, &
-                                  epsilon_trace_over_3, &
-                                  b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
-!!!                               b_epsilondev_trace,b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
-                                  b_epsilondev_xz,b_epsilondev_yz, &
-                                  b_epsilon_trace_over_3, &
-                                  ATTENUATION,size(R_xx), &
-                                  b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
-!!!                               b_R_trace,b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
-                                  b_alphaval,b_betaval,b_gammaval, &
-                                  APPROXIMATE_HESS_KL)
+      call prepare_fields_elastic_adj_dev(Mesh_pointer, &
+                                NDIM*NGLOB_AB, &
+                                COMPUTE_AND_STORE_STRAIN, &
+                                epsilon_trace_over_3, &
+                                b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
+                                b_epsilondev_xz,b_epsilondev_yz, &
+                                b_epsilon_trace_over_3, &
+                                ATTENUATION,size(R_xx), &
+                                b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
+                                b_alphaval,b_betaval,b_gammaval, &
+                                ANISOTROPIC_KL, &
+                                APPROXIMATE_HESS_KL)
 
   endif
 
@@ -1348,44 +1348,51 @@
     stop 'todo poroelastic simulations on GPU'
   endif
 
+  ! synchronizes processes
+  !call sync_all()
+
   ! prepares needed receiver array for adjoint runs
   if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) &
     call prepare_sim2_or_3_const_device(Mesh_pointer, &
-                                       islice_selected_rec,size(islice_selected_rec), &
-                                       nadj_rec_local,nrec,myrank)
+                                islice_selected_rec,size(islice_selected_rec), &
+                                nadj_rec_local,nrec)
 
   ! prepares fields on GPU for noise simulations
   if ( NOISE_TOMOGRAPHY > 0 ) then
     ! note: noise tomography is only supported for elastic domains so far.
 
     ! copies noise  arrays to GPU
-    call prepare_fields_noise_device(Mesh_pointer, NSPEC_AB, NGLOB_AB, &
-                                  free_surface_ispec, &
-                                  free_surface_ijk, &
-                                  num_free_surface_faces, &
-                                  NOISE_TOMOGRAPHY, &
-                                  NSTEP,noise_sourcearray, &
-                                  normal_x_noise,normal_y_noise,normal_z_noise, &
-                                  mask_noise,free_surface_jacobian2Dw)
+    call prepare_fields_noise_device(Mesh_pointer, &
+                                NSPEC_AB, NGLOB_AB, &
+                                free_surface_ispec, &
+                                free_surface_ijk, &
+                                num_free_surface_faces, &
+                                NOISE_TOMOGRAPHY, &
+                                NSTEP,noise_sourcearray, &
+                                normal_x_noise,normal_y_noise,normal_z_noise, &
+                                mask_noise,free_surface_jacobian2Dw)
 
   endif ! NOISE_TOMOGRAPHY
 
   ! prepares gravity arrays
   if( GRAVITY ) then
     call prepare_fields_gravity_device(Mesh_pointer,GRAVITY, &
-                                    minus_deriv_gravity,minus_g,wgll_cube,&
-                                    ACOUSTIC_SIMULATION,rhostore)
+                                minus_deriv_gravity,minus_g,wgll_cube,&
+                                ACOUSTIC_SIMULATION,rhostore)
   endif
 
+  ! synchronizes processes
+  call sync_all()
+
   ! sends initial data to device
 
   ! puts acoustic initial fields onto GPU
   if( ACOUSTIC_SIMULATION ) then
     call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, &
-                          potential_dot_acoustic,potential_dot_dot_acoustic,Mesh_pointer)
+                                      potential_dot_acoustic,potential_dot_dot_acoustic,Mesh_pointer)
     if( SIMULATION_TYPE == 3 ) &
       call transfer_b_fields_ac_to_device(NGLOB_AB,b_potential_acoustic, &
-                          b_potential_dot_acoustic,b_potential_dot_dot_acoustic,Mesh_pointer)
+                                          b_potential_dot_acoustic,b_potential_dot_dot_acoustic,Mesh_pointer)
   endif
 
   ! puts elastic initial fields onto GPU
@@ -1396,6 +1403,9 @@
       call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer)
   endif
 
+  ! synchronizes processes
+  call sync_all()
+
   ! outputs GPU usage to files for all processes
   call output_free_device_memory(myrank)
 

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -436,6 +436,12 @@
 
   ! absorbing boundary surface
   read(27) num_abs_boundary_faces
+
+  ! checks
+  if( num_abs_boundary_faces < 0 ) then
+    print*,'read_mesh_databases: reading in negative num_abs_boundary_faces ',num_abs_boundary_faces,'...resetting to zero'
+    num_abs_boundary_faces = 0
+  endif
   allocate(abs_boundary_ispec(num_abs_boundary_faces), &
           abs_boundary_ijk(3,NGLLSQUARE,num_abs_boundary_faces), &
           abs_boundary_jacobian2Dw(NGLLSQUARE,num_abs_boundary_faces), &
@@ -696,23 +702,34 @@
   !endif
 
   ! MPI communications
-  allocate(buffer_send_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
-    buffer_recv_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
-    buffer_send_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
-    buffer_recv_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
-    request_send_vector_ext_mesh(num_interfaces_ext_mesh), &
-    request_recv_vector_ext_mesh(num_interfaces_ext_mesh), &
-    request_send_scalar_ext_mesh(num_interfaces_ext_mesh), &
-    request_recv_scalar_ext_mesh(num_interfaces_ext_mesh), &
-    buffer_send_vector_ext_mesh_s(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
-    buffer_recv_vector_ext_mesh_s(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
-    buffer_send_vector_ext_mesh_w(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
-    buffer_recv_vector_ext_mesh_w(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
-    request_send_vector_ext_mesh_s(num_interfaces_ext_mesh), &
-    request_recv_vector_ext_mesh_s(num_interfaces_ext_mesh), &
-    request_send_vector_ext_mesh_w(num_interfaces_ext_mesh), &
-    request_recv_vector_ext_mesh_w(num_interfaces_ext_mesh),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array buffer_send_vector_ext_mesh etc.'
+  if( ACOUSTIC_SIMULATION ) then
+    allocate(buffer_send_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+             buffer_recv_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+             request_send_scalar_ext_mesh(num_interfaces_ext_mesh), &
+             request_recv_scalar_ext_mesh(num_interfaces_ext_mesh), &
+             stat=ier)
+    if( ier /= 0 ) stop 'error allocating array buffer_send_scalar_ext_mesh,.. for acoustic simulations'
+  endif
+  if( ELASTIC_SIMULATION ) then
+    allocate(buffer_send_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+             buffer_recv_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+             request_send_vector_ext_mesh(num_interfaces_ext_mesh), &
+             request_recv_vector_ext_mesh(num_interfaces_ext_mesh), &
+             stat=ier)
+    if( ier /= 0 ) stop 'error allocating array buffer_send_vector_ext_mesh,.. for elastic simulations'
+  endif
+  if( POROELASTIC_SIMULATION ) then
+    allocate(buffer_send_vector_ext_mesh_s(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+             buffer_recv_vector_ext_mesh_s(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+             buffer_send_vector_ext_mesh_w(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+             buffer_recv_vector_ext_mesh_w(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+             request_send_vector_ext_mesh_s(num_interfaces_ext_mesh), &
+             request_recv_vector_ext_mesh_s(num_interfaces_ext_mesh), &
+             request_send_vector_ext_mesh_w(num_interfaces_ext_mesh), &
+             request_recv_vector_ext_mesh_w(num_interfaces_ext_mesh), &
+             stat=ier)
+    if( ier /= 0 ) stop 'error allocating array buffer_send_vector_ext_mesh_s,.. for poroelastic simulations'
+  endif
 
   ! gets model dimensions
   minl = minval( xstore )
@@ -800,31 +817,25 @@
     ! density kernel
     allocate(rho_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
     if( ier /= 0 ) stop 'error allocating array rho_kl'
-    ! shear modulus kernel
-    allocate(mu_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array mu_kl'
-    ! compressional modulus kernel
-    allocate(kappa_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array kappa_kl'
 
-    ! anisotropic kernels
-!! DK DK commented this out for now; must be made optional
-!   allocate(cijkl_kl(21,NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
-!   if( ier /= 0 ) stop 'error allocating array cijkl_kl'
-!! DK DK added this for now
-    allocate(cijkl_kl(1,1,1,1,1),stat=ier)
+    if (ANISOTROPIC_KL) then
+      ! anisotropic kernels
+      allocate(cijkl_kl(21,NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+      if( ier /= 0 ) stop 'error allocating array cijkl_kl'
+      !dummy
+      allocate(mu_kl(1,1,1,1))
+      allocate(kappa_kl(1,1,1,1))
+    else
+      ! shear modulus kernel
+      allocate(mu_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+      if( ier /= 0 ) stop 'error allocating array mu_kl'
+      ! compressional modulus kernel
+      allocate(kappa_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+      if( ier /= 0 ) stop 'error allocating array kappa_kl'
+      !dummy
+      allocate(cijkl_kl(1,1,1,1,1))
+    endif
 
-    ! derived kernels
-    ! density prime kernel
-    allocate(rhop_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array rhop_kl'
-    ! vp kernel
-    allocate(alpha_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array alpha_kl'
-    ! vs kernel
-    allocate(beta_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array beta_kl'
-
     ! noise source strength kernel
     if (NOISE_TOMOGRAPHY == 3) then
       allocate(sigma_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
@@ -843,17 +854,17 @@
 
     ! MPI handling
     allocate(b_request_send_vector_ext_mesh(num_interfaces_ext_mesh), &
-      b_request_recv_vector_ext_mesh(num_interfaces_ext_mesh), &
-      b_buffer_send_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
-      b_buffer_recv_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+             b_request_recv_vector_ext_mesh(num_interfaces_ext_mesh), &
+             b_buffer_send_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+             b_buffer_recv_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
     if( ier /= 0 ) stop 'error allocating array b_request_send_vector_ext_mesh etc.'
 
     ! allocates attenuation solids
     allocate(b_R_xx(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
-            b_R_yy(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
-            b_R_xy(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
-            b_R_xz(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
-            b_R_yz(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS),stat=ier)
+             b_R_yy(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
+             b_R_xy(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
+             b_R_xz(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
+             b_R_yz(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS),stat=ier)
     if( ier /= 0 ) stop 'error allocating array b_R_xx etc.'
 
     ! note: these arrays are needed for attenuation and/or kernel computations

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -68,10 +68,10 @@
           enddo ! j
         enddo ! k
     enddo ! ispec
-    open(unit=27,file=prname(1:len_trim(prname))//'weights_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+    open(unit=IOUT,file=prname(1:len_trim(prname))//'weights_kernel.bin',status='unknown',form='unformatted',iostat=ier)
     if( ier /= 0 ) stop 'error opening file weights_kernel.bin'
-    write(27) weights_kernel
-    close(27)
+    write(IOUT) weights_kernel
+    close(IOUT)
   endif
 
   ! for noise simulations --- noise strength kernel
@@ -123,25 +123,25 @@
   enddo
 
   ! save kernels to binary files
-  open(unit=27,file=prname(1:len_trim(prname))//'rho_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'rho_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file rho_acoustic_kernel.bin'
-  write(27) rho_ac_kl
-  close(27)
+  write(IOUT) rho_ac_kl
+  close(IOUT)
 
-  open(unit=27,file=prname(1:len_trim(prname))//'kappa_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'kappa_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file kappa_acoustic_kernel.bin'
-  write(27) kappa_ac_kl
-  close(27)
+  write(IOUT) kappa_ac_kl
+  close(IOUT)
 
-  open(unit=27,file=prname(1:len_trim(prname))//'rhop_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'rhop_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file rhop_acoustic_kernel.bin'
-  write(27) rhop_ac_kl
-  close(27)
+  write(IOUT) rhop_ac_kl
+  close(IOUT)
 
-  open(unit=27,file=prname(1:len_trim(prname))//'alpha_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'alpha_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file alpha_acoustic_kernel.bin'
-  write(27) alpha_ac_kl
-  close(27)
+  write(IOUT) alpha_ac_kl
+  close(IOUT)
 
   end subroutine save_kernels_acoustic
 
@@ -159,15 +159,48 @@
   ! local parameters
   integer:: ispec,i,j,k,iglob,ier
   real(kind=CUSTOM_REAL) :: rhol,mul,kappal
-  real(kind=CUSTOM_REAL),dimension(21) ::  cijkl_kl_local
 
  ! Transverse isotropic parameters
   real(kind=CUSTOM_REAL) :: A,N,C,L,F,eta
-  real(kind=CUSTOM_REAL), dimension(21) :: an_kl
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT):: &
-    alphav_kl,alphah_kl,betav_kl,betah_kl, &
-    eta_kl
+  real(kind=CUSTOM_REAL), dimension(21) :: cijkl_kl_local
+  real(kind=CUSTOM_REAL), dimension(5) :: an_kl
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:),allocatable :: alphav_kl,alphah_kl,betav_kl,betah_kl,eta_kl
 
+  ! temporary isotropic kernels
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:),allocatable :: rhop_kl,alpha_kl,beta_kl
+
+  ! allocates temporary transversely isotropic kernels
+  if( ANISOTROPIC_KL ) then
+    if( SAVE_TRANSVERSE_KL ) then
+      allocate(alphav_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT), &
+               alphah_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT), &
+               betav_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT), &
+               betah_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT), &
+               eta_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT), &
+               stat=ier)
+      if( ier /=0 ) stop 'error allocating arrays alphav_kl,...'
+
+      ! derived kernels
+      ! vp kernel
+      allocate(alpha_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+      if( ier /= 0 ) stop 'error allocating array alpha_kl'
+      ! vs kernel
+      allocate(beta_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+      if( ier /= 0 ) stop 'error allocating array beta_kl'
+    endif
+  else
+    ! derived kernels
+    ! vp kernel
+    allocate(alpha_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array alpha_kl'
+    ! vs kernel
+    allocate(beta_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array beta_kl'
+    ! density prime kernel
+    allocate(rhop_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array rhop_kl'
+  endif
+
   ! finalizes calculation of rhop, beta, alpha kernels
   do ispec = 1, NSPEC_AB
 
@@ -180,71 +213,68 @@
             iglob = ibool(i,j,k,ispec)
 
             ! Store local material values
-              rhol = rho_vs(i,j,k,ispec)*rho_vs(i,j,k,ispec) / mustore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-              kappal = kappastore(i,j,k,ispec)
+            rhol = rho_vs(i,j,k,ispec)*rho_vs(i,j,k,ispec) / mustore(i,j,k,ispec)
+            mul = mustore(i,j,k,ispec)
+            kappal = kappastore(i,j,k,ispec)
 
-            if (ANISOTROPIC_KL) then
+            if( ANISOTROPIC_KL ) then
+              if( SAVE_TRANSVERSE_KL ) then
+                cijkl_kl_local(:) = - cijkl_kl(:,i,j,k,ispec)
 
-               cijkl_kl_local = - cijkl_kl(:,i,j,k,ispec)
+                ! Computes parameters for an isotropic model
+                A = kappal + FOUR_THIRDS * mul
+                C = A
+                L = mul
+                N = mul
+                F = kappal - 2._CUSTOM_REAL/3._CUSTOM_REAL * mul
+                eta = 1._CUSTOM_REAL
 
-               if (SAVE_TRANSVERSE_KL) then
+                ! note: cijkl_kl_local() holds the fully anisotropic C_ij kernel components (non-dimensionalized)
+                !          for the GLL point at (i,j,k,ispec)
 
-                 ! Computes parameters for an isotropic model
-                 A = kappal + FOUR_THIRDS * mul
-                 C = A
-                 L = mul
-                 N = mul
-                 F = kappal - 2._CUSTOM_REAL/3._CUSTOM_REAL * mul
-                 eta = 1._CUSTOM_REAL
+                ! Purpose : compute the kernels for the An coeffs (an_kl)
+                ! from the kernels for Cij (cijkl_kl_local)
 
-                 ! note: cijkl_kl_local() is fully anisotropic C_ij kernel components (non-dimensionalized)
-                 !          for GLL point at (i,j,k,ispec)
+                ! Definition of the input array cij_kl :
+                ! cij_kl(1) = C11 ; cij_kl(2) = C12 ; cij_kl(3) = C13
+                ! cij_kl(4) = C14 ; cij_kl(5) = C15 ; cij_kl(6) = C16
+                ! cij_kl(7) = C22 ; cij_kl(8) = C23 ; cij_kl(9) = C24
+                ! cij_kl(10) = C25 ; cij_kl(11) = C26 ; cij_kl(12) = C33
+                ! cij_kl(13) = C34 ; cij_kl(14) = C35 ; cij_kl(15) = C36
+                ! cij_kl(16) = C44 ; cij_kl(17) = C45 ; cij_kl(18) = C46
+                ! cij_kl(19) = C55 ; cij_kl(20) = C56 ; cij_kl(21) = C66
+                ! where the Cij (Voigt's notation) are defined as functions of
+                ! the components of the elastic tensor in spherical coordinates
+                ! by eq. (A.1) of Chen & Tromp, GJI 168 (2007)
 
-                 ! Purpose : compute the kernels for the An coeffs (an_kl)
-                 ! from the kernels for Cij (cijkl_kl_local)
+                ! From the relations giving the Cij as functions of the An
+                ! Checked with Min Chen's results (routine build_cij)
 
-                 ! Definition of the input array cij_kl :
-                 ! cij_kl(1) = C11 ; cij_kl(2) = C12 ; cij_kl(3) = C13
-                 ! cij_kl(4) = C14 ; cij_kl(5) = C15 ; cij_kl(6) = C16
-                 ! cij_kl(7) = C22 ; cij_kl(8) = C23 ; cij_kl(9) = C24
-                 ! cij_kl(10) = C25 ; cij_kl(11) = C26 ; cij_kl(12) = C33
-                 ! cij_kl(13) = C34 ; cij_kl(14) = C35 ; cij_kl(15) = C36
-                 ! cij_kl(16) = C44 ; cij_kl(17) = C45 ; cij_kl(18) = C46
-                 ! cij_kl(19) = C55 ; cij_kl(20) = C56 ; cij_kl(21) = C66
-                 ! where the Cij (Voigt's notation) are defined as function of
-                 ! the components of the elastic tensor in spherical coordinates
-                 ! by eq. (A.1) of Chen & Tromp, GJI 168 (2007)
+                an_kl(1) = cijkl_kl_local(1)+cijkl_kl_local(2)+cijkl_kl_local(7)  !A
+                an_kl(2) = cijkl_kl_local(12)                                     !C
+                an_kl(3) = -2*cijkl_kl_local(2)+cijkl_kl_local(21)                !N
+                an_kl(4) = cijkl_kl_local(16)+cijkl_kl_local(19)                  !L
+                an_kl(5) = cijkl_kl_local(3)+cijkl_kl_local(8)                    !F
 
-                 ! From the relations giving Cij in function of An
-                 ! Checked with Min Chen's results (routine build_cij)
+                ! for parameterization: ( alpha_v, alpha_h, beta_v, beta_h, eta, rho )
+                ! K_alpha_v
+                alphav_kl(i,j,k,ispec) = 2.0 * C * an_kl(2)
+                ! K_alpha_h
+                alphah_kl(i,j,k,ispec) = 2.0 * A * an_kl(1) + 2.0 * A * eta * an_kl(5)
+                ! K_beta_v
+                betav_kl(i,j,k,ispec) = 2.0 * L * an_kl(4) - 4.0 * L * eta * an_kl(5)
+                ! K_beta_h
+                betah_kl(i,j,k,ispec) = 2.0 * N * an_kl(3)
+                ! K_eta
+                eta_kl(i,j,k,ispec) = F * an_kl(5)
 
-                 an_kl(1) = cijkl_kl_local(1)+cijkl_kl_local(2)+cijkl_kl_local(7)  !A
-                 an_kl(2) = cijkl_kl_local(12)                                     !C
-                 an_kl(3) = -2*cijkl_kl_local(2)+cijkl_kl_local(21)                !N
-                 an_kl(4) = cijkl_kl_local(16)+cijkl_kl_local(19)                  !L
-                 an_kl(5) = cijkl_kl_local(3)+cijkl_kl_local(8)                    !F
-
-                 ! for parameterization: ( alpha_v, alpha_h, beta_v, beta_h, eta, rho )
-                 ! K_alpha_v
-                 alphav_kl(i,j,k,ispec) = 2*C*an_kl(2)
-                 ! K_alpha_h
-                 alphah_kl(i,j,k,ispec) = 2*A*an_kl(1) + 2*A*eta*an_kl(5)
-                 ! K_beta_v
-                 betav_kl(i,j,k,ispec) = 2*L*an_kl(4) - 4*L*eta*an_kl(5)
-                 ! K_beta_h
-                 betah_kl(i,j,k,ispec) = 2*N*an_kl(3)
-                 ! K_eta
-                 eta_kl(i,j,k,ispec) = F*an_kl(5)
-
-                 ! to check: isotropic kernels from transverse isotropic ones
-                 alpha_kl(i,j,k,ispec) = alphav_kl(i,j,k,ispec) &
+                ! to check: isotropic kernels from transverse isotropic ones
+                alpha_kl(i,j,k,ispec) = alphav_kl(i,j,k,ispec) &
                                                   + alphah_kl(i,j,k,ispec)
-                 beta_kl(i,j,k,ispec) = betav_kl(i,j,k,ispec) &
+                beta_kl(i,j,k,ispec) = betav_kl(i,j,k,ispec) &
                                                   + betah_kl(i,j,k,ispec)
+              endif ! SAVE_TRANSVERSE_KL
 
-                endif ! SAVE_TRANSVERSE_KL
-
             else
 
               ! isotropic kernels
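For reference, the relations implemented in the SAVE_TRANSVERSE_KL branch above can be written compactly (K_{Cij} denoting the corresponding component of cijkl_kl_local):

    K_A = K_{C11} + K_{C12} + K_{C22},   K_C = K_{C33},   K_N = K_{C66} - 2 K_{C12},
    K_L = K_{C44} + K_{C55},             K_F = K_{C13} + K_{C23}

and, for the (alpha_v, alpha_h, beta_v, beta_h, eta, rho) parameterization,

    K_{alpha_v} = 2 C K_C
    K_{alpha_h} = 2 A K_A + 2 A eta K_F
    K_{beta_v}  = 2 L K_L - 4 L eta K_F
    K_{beta_h}  = 2 N K_N
    K_{eta}     = F K_F

with A = kappa + 4/3 mu, C = A, L = N = mu, F = kappa - 2/3 mu and eta = 1 evaluated on the isotropic reference model, exactly as the code does.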
@@ -283,91 +313,96 @@
 
   enddo
 
-  if (ANISOTROPIC_KL) then
+  if( ANISOTROPIC_KL ) then
+    ! outputs transverse isotropic kernels only
+    if( SAVE_TRANSVERSE_KL ) then
+      ! transverse isotropic kernels
+      ! (alpha_v, alpha_h, beta_v, beta_h, eta, rho ) parameterization
+      open(unit=IOUT,file=trim(prname)//'alphav_kernel.bin',status='unknown',form='unformatted',action='write')
+      write(IOUT) alphav_kl
+      close(IOUT)
+      open(unit=IOUT,file=trim(prname)//'alphah_kernel.bin',status='unknown',form='unformatted',action='write')
+      write(IOUT) alphah_kl
+      close(IOUT)
+      open(unit=IOUT,file=trim(prname)//'betav_kernel.bin',status='unknown',form='unformatted',action='write')
+      write(IOUT) betav_kl
+      close(IOUT)
+      open(unit=IOUT,file=trim(prname)//'betah_kernel.bin',status='unknown',form='unformatted',action='write')
+      write(IOUT) betah_kl
+      close(IOUT)
+      open(unit=IOUT,file=trim(prname)//'eta_kernel.bin',status='unknown',form='unformatted',action='write')
+      write(IOUT) eta_kl
+      close(IOUT)
 
-     ! outputs transverse isotropic kernels only
-     if (SAVE_TRANSVERSE_KL) then
-       ! transverse isotropic kernels
-       ! (alpha_v, alpha_h, beta_v, beta_h, eta, rho ) parameterization
-       open(unit=27,file=trim(prname)//'alphav_kernel.bin',status='unknown',form='unformatted',action='write')
-       write(27) alphav_kl
-       close(27)
-       open(unit=27,file=trim(prname)//'alphah_kernel.bin',status='unknown',form='unformatted',action='write')
-       write(27) alphah_kl
-       close(27)
-       open(unit=27,file=trim(prname)//'betav_kernel.bin',status='unknown',form='unformatted',action='write')
-       write(27) betav_kl
-       close(27)
-       open(unit=27,file=trim(prname)//'betah_kernel.bin',status='unknown',form='unformatted',action='write')
-       write(27) betah_kl
-       close(27)
-       open(unit=27,file=trim(prname)//'eta_kernel.bin',status='unknown',form='unformatted',action='write')
-       write(27) eta_kl
-       close(27)
-
-       ! transverse isotropic test kernels
-       open(unit=27,file=trim(prname)//'alpha_kernel.bin',status='unknown',form='unformatted',action='write')
-       write(27)  alpha_kl
-       close(27)
-       open(unit=27,file=trim(prname)//'beta_kernel.bin',status='unknown',form='unformatted',action='write')
-       write(27)  beta_kl
-       close(27)
-
-     else
-       ! fully anisotropic kernels
-       ! note: the C_ij and density kernels are not for relative perturbations (delta ln( m_i) = delta m_i / m_i),
-       !          but absolute perturbations (delta m_i = m_i - m_0).
-       ! Kappa and mu are for absolute perturbations, can be used to check with purely isotropic versions.
-       open(unit=27,file=trim(prname)//'rho_kernel.bin',status='unknown',form='unformatted',action='write')
-       write(27)  - rho_kl
-       close(27)
-       open(unit=27,file=trim(prname)//'cijkl_kernel.bin',status='unknown',form='unformatted',action='write')
-       write(27) - cijkl_kl
-       close(27)
-
-      endif
-
+      ! transverse isotropic test kernels
+      open(unit=IOUT,file=trim(prname)//'alpha_kernel.bin',status='unknown',form='unformatted',action='write')
+      write(IOUT) alpha_kl
+      close(IOUT)
+      open(unit=IOUT,file=trim(prname)//'beta_kernel.bin',status='unknown',form='unformatted',action='write')
+      write(IOUT) beta_kl
+      close(IOUT)
+    else
+      ! fully anisotropic kernels
+      ! note: the C_ij and density kernels are not for relative perturbations (delta ln( m_i) = delta m_i / m_i),
+      !          but absolute perturbations (delta m_i = m_i - m_0).
+      ! Kappa and mu kernels are for absolute perturbations and can be used to check against purely isotropic versions.
+      open(unit=IOUT,file=trim(prname)//'rho_kernel.bin',status='unknown',form='unformatted',action='write')
+      write(IOUT) - rho_kl
+      close(IOUT)
+      open(unit=IOUT,file=trim(prname)//'cijkl_kernel.bin',status='unknown',form='unformatted',action='write')
+      write(IOUT) - cijkl_kl
+      close(IOUT)
+    endif
   else
-
     ! save kernels to binary files
-    open(unit=27,file=prname(1:len_trim(prname))//'rho_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+    open(unit=IOUT,file=prname(1:len_trim(prname))//'rho_kernel.bin',status='unknown',form='unformatted',iostat=ier)
     if( ier /= 0 ) stop 'error opening file rho_kernel.bin'
-    write(27) rho_kl
-    close(27)
+    write(IOUT) rho_kl
+    close(IOUT)
 
-    open(unit=27,file=prname(1:len_trim(prname))//'mu_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+    open(unit=IOUT,file=prname(1:len_trim(prname))//'mu_kernel.bin',status='unknown',form='unformatted',iostat=ier)
     if( ier /= 0 ) stop 'error opening file mu_kernel.bin'
-    write(27) mu_kl
-    close(27)
+    write(IOUT) mu_kl
+    close(IOUT)
 
-    open(unit=27,file=prname(1:len_trim(prname))//'kappa_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+    open(unit=IOUT,file=prname(1:len_trim(prname))//'kappa_kernel.bin',status='unknown',form='unformatted',iostat=ier)
     if( ier /= 0 ) stop 'error opening file kappa_kernel.bin'
-    write(27) kappa_kl
-    close(27)
+    write(IOUT) kappa_kl
+    close(IOUT)
 
-    open(unit=27,file=prname(1:len_trim(prname))//'rhop_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+    open(unit=IOUT,file=prname(1:len_trim(prname))//'rhop_kernel.bin',status='unknown',form='unformatted',iostat=ier)
     if( ier /= 0 ) stop 'error opening file rhop_kernel.bin'
-    write(27) rhop_kl
-    close(27)
+    write(IOUT) rhop_kl
+    close(IOUT)
 
-    open(unit=27,file=prname(1:len_trim(prname))//'beta_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+    open(unit=IOUT,file=prname(1:len_trim(prname))//'beta_kernel.bin',status='unknown',form='unformatted',iostat=ier)
     if( ier /= 0 ) stop 'error opening file beta_kernel.bin'
-    write(27) beta_kl
-    close(27)
+    write(IOUT) beta_kl
+    close(IOUT)
 
-    open(unit=27,file=prname(1:len_trim(prname))//'alpha_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+    open(unit=IOUT,file=prname(1:len_trim(prname))//'alpha_kernel.bin',status='unknown',form='unformatted',iostat=ier)
     if( ier /= 0 ) stop 'error opening file alpha_kernel.bin'
-    write(27) alpha_kl
-    close(27)
-  endif
+    write(IOUT) alpha_kl
+    close(IOUT)
+  endif ! ANISOTROPIC_KL
 
   if (SAVE_MOHO_MESH) then
-    open(unit=27,file=prname(1:len_trim(prname))//'moho_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+    open(unit=IOUT,file=prname(1:len_trim(prname))//'moho_kernel.bin',status='unknown',form='unformatted',iostat=ier)
     if( ier /= 0 ) stop 'error opening file moho_kernel.bin'
-    write(27) moho_kl
-    close(27)
+    write(IOUT) moho_kl
+    close(IOUT)
   endif
 
+  ! frees temporary arrays
+  if( ANISOTROPIC_KL ) then
+    if( SAVE_TRANSVERSE_KL ) then
+      deallocate(alphav_kl,alphah_kl,betav_kl,betah_kl,eta_kl)
+      deallocate(alpha_kl,beta_kl)
+    endif
+  else
+    deallocate(rhop_kl,alpha_kl,beta_kl)
+  endif
+
   end subroutine save_kernels_elastic
 
 !
@@ -610,98 +645,98 @@
   ! save kernels to binary files
 
   ! primary kernels
-  open(unit=27,file=prname(1:len_trim(prname))//'rhot_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'rhot_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file rhot_primeporo_kernel.bin'
-  write(27) rhot_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'rhof_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) rhot_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'rhof_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file rhof_primeporo_kernel.bin'
-  write(27) rhof_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'sm_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) rhof_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'sm_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file sm_primeporo_kernel.bin'
-  write(27) sm_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'eta_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) sm_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'eta_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file eta_primeporo_kernel.bin'
-  write(27) eta_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'mufr_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) eta_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'mufr_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file mufr_primeporo_kernel.bin'
-  write(27) mufr_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'B_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) mufr_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'B_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file B_primeporo_kernel.bin'
-  write(27) B_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'C_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) B_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'C_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file C_primeporo_kernel.bin'
-  write(27) C_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'M_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) C_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'M_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file M_primeporo_kernel.bin'
-  write(27) M_kl
-  close(27)
+  write(IOUT) M_kl
+  close(IOUT)
 
   ! density kernels
-  open(unit=27,file=prname(1:len_trim(prname))//'rhob_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'rhob_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file rhob_densityporo_kernel.bin'
-  write(27) rhob_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'rhofb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) rhob_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'rhofb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file rhofb_densityporo_kernel.bin'
-  write(27) rhofb_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'phi_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) rhofb_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'phi_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file phi_densityporo_kernel.bin'
-  write(27) phi_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'mufrb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) phi_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'mufrb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file mufrb_densityporo_kernel.bin'
-  write(27) mufrb_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'Bb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) mufrb_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'Bb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file Bb_densityporo_kernel.bin'
-  write(27) Bb_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'Cb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) Bb_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'Cb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file Cb_densityporo_kernel.bin'
-  write(27) Cb_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'Mb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) Cb_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'Mb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file Mb_densityporo_kernel.bin'
-  write(27) Mb_kl
-  close(27)
+  write(IOUT) Mb_kl
+  close(IOUT)
 
   ! wavespeed kernels
-  open(unit=27,file=prname(1:len_trim(prname))//'rhobb_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'rhobb_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file rhobb_waveporo_kernel.bin'
-  write(27) rhobb_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'rhofbb_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) rhobb_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'rhofbb_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file rhofbb_waveporo_kernel.bin'
-  write(27) rhofbb_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'phib_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) rhofbb_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'phib_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file phib_waveporo_kernel.bin'
-  write(27) phib_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'cs_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) phib_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'cs_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file cs_waveporo_kernel.bin'
-  write(27) cs_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'cpI_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) cs_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'cpI_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file cpI_waveporo_kernel.bin'
-  write(27) cpI_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'cpII_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) cpI_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'cpII_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file cpII_waveporo_kernel.bin'
-  write(27) cpII_kl
-  close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'ratio_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+  write(IOUT) cpII_kl
+  close(IOUT)
+  open(unit=IOUT,file=prname(1:len_trim(prname))//'ratio_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening file ratio_waveporo_kernel.bin'
-  write(27) ratio_kl
-  close(27)
+  write(IOUT) ratio_kl
+  close(IOUT)
 
   end subroutine save_kernels_poroelastic
 
@@ -724,11 +759,11 @@
     hess_ac_kl(:,:,:,:) = 2._CUSTOM_REAL * hess_ac_kl(:,:,:,:)
 
     ! stores into file
-    open(unit=27,file=trim(prname)//'hess_acoustic_kernel.bin', &
+    open(unit=IOUT,file=trim(prname)//'hess_acoustic_kernel.bin', &
           status='unknown',form='unformatted',action='write',iostat=ier)
     if( ier /= 0 ) stop 'error opening file hess_acoustic_kernel.bin'
-    write(27) hess_ac_kl
-    close(27)
+    write(IOUT) hess_ac_kl
+    close(IOUT)
   endif
 
   ! elastic domains
@@ -737,11 +772,11 @@
     hess_kl(:,:,:,:) = 2._CUSTOM_REAL * hess_kl(:,:,:,:)
 
     ! stores into file
-    open(unit=27,file=trim(prname)//'hess_kernel.bin', &
+    open(unit=IOUT,file=trim(prname)//'hess_kernel.bin', &
           status='unknown',form='unformatted',action='write',iostat=ier)
     if( ier /= 0 ) stop 'error opening file hess_kernel.bin'
-    write(27) hess_kl
-    close(27)
+    write(IOUT) hess_kl
+    close(IOUT)
   endif
 
   end subroutine save_kernels_hessian

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -896,7 +896,7 @@
         enddo
 
         ! writes out to VTK file
-        write(IOVTK,*) xmesh,ymesh,zmesh
+        write(IOVTK,'(3e18.6)') xmesh,ymesh,zmesh
       endif
     enddo ! NSOURCES
   endif
@@ -941,7 +941,7 @@
       enddo
 
       ! writes out to VTK file
-      write(IOVTK,*) xmesh,ymesh,zmesh
+      write(IOVTK,'(3e18.6)') xmesh,ymesh,zmesh
     endif
   enddo
 

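(The two VTK writes changed above switch from list-directed output, whose spacing and precision are compiler dependent, to the explicit edit descriptor '(3e18.6)', which always emits three fixed-width exponential fields per line; presumably this keeps the point coordinates in predictable columns for VTK readers. A tiny, purely illustrative comparison:)

  program vtk_format_sketch
    implicit none
    double precision :: xmesh, ymesh, zmesh
    xmesh = 1234.5d0 ; ymesh = -6.7d-4 ; zmesh = 0.d0
    ! list-directed output: layout left to the compiler
    write(*,*) xmesh, ymesh, zmesh
    ! explicit format: three E18.6 fields per line
    write(*,'(3e18.6)') xmesh, ymesh, zmesh
  end program vtk_format_sketch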
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/specfem3D_par.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/specfem3D_par.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/specfem3D_par.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -363,8 +363,7 @@
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: b_epsilon_trace_over_3
 
   ! adjoint kernels
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: rho_kl, mu_kl, kappa_kl, &
-    rhop_kl, beta_kl, alpha_kl
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: rho_kl, mu_kl, kappa_kl
 
   ! anisotropic kernels
   real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: cijkl_kl

Added: seismo/3D/SPECFEM3D/trunk/src/specfem3D/update_displacement_scheme.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/update_displacement_scheme.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/update_displacement_scheme.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -0,0 +1,259 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 1
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                             July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+
+  subroutine update_displacement_scheme()
+
+! explicit Newmark time scheme with acoustic & elastic domains:
+! (see e.g. Hughes, 1987; Chaljub et al., 2003)
+!
+! chi(t+delta_t) = chi(t) + delta_t chi_dot(t) + 1/2 delta_t**2 chi_dot_dot(t)
+! chi_dot(t+delta_t) = chi_dot(t) + 1/2 delta_t chi_dot_dot(t) + 1/2 delta_t chi_dot_dot(t+delta_t)
+! chi_dot_dot(t+delta_t) = 1/M_acoustic( -K_acoustic chi(t+delta_t) + B_acoustic u(t+delta_t) + f(t+delta_t) )
+!
+! u(t+delta_t) = u(t) + delta_t v(t) + 1/2 delta_t**2 a(t)
+! v(t+delta_t) = v(t) + 1/2 delta_t a(t) + 1/2 delta_t a(t+delta_t)
+! a(t+delta_t) = 1/M_elastic ( -K_elastic u(t+delta_t) + B_elastic chi_dot_dot(t+delta_t) + f(t+delta_t) )
+!
+! where
+!   chi, chi_dot, chi_dot_dot are acoustic (fluid) potentials (dotted with respect to time)
+!   u, v, a are displacement, velocity & acceleration
+!   M is the mass matrix, K the stiffness matrix and B the boundary term for the acoustic/elastic domains
+!   f denotes a source term (acoustic/elastic)
+!
+! note that this stage calculates the predictor terms:
+!
+!   the potential chi_dot(t+delta_t) still requires the term + 1/2 delta_t chi_dot_dot(t+delta_t),
+!                                   which is added at a later (corrector) stage once chi_dot_dot(t+delta_t) has been calculated,
+!   and similarly,
+!   the velocity v(t+delta_t) still requires the term + 1/2 delta_t a(t+delta_t),
+!                                   which is added at a later stage once a(t+delta_t) has been calculated
+! also:
+!   the boundary term B_elastic requires chi_dot_dot(t+delta_t),
+!                                   thus chi_dot_dot has to be updated before the elastic boundary term is considered
+
+  use specfem_par
+  use specfem_par_acoustic
+  use specfem_par_elastic
+  use specfem_par_poroelastic
+  use pml_par
+
+  implicit none
+
+  ! time marching
+
+  ! acoustic domain
+  if( ACOUSTIC_SIMULATION ) call update_displacement_acoustic()
+
+  ! elastic domain
+  if( ELASTIC_SIMULATION ) call update_displacement_elastic()
+
+  ! poroelastic domain
+  if( POROELASTIC_SIMULATION ) call update_displacement_poroelastic()
+
+  ! adjoint simulations: moho kernel
+  if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
+    ispec2D_moho_top = 0
+    ispec2D_moho_bot = 0
+  endif
+
+  end subroutine update_displacement_scheme
+
+!
+!--------------------------------------------------------------------------------------------------------------
+!
+
+  subroutine update_displacement_acoustic()
+
+! updates acoustic potentials
+
+  use specfem_par
+  use specfem_par_acoustic
+  use pml_par
+
+  implicit none
+
+  ! Newmark time marching
+
+  if( .not. GPU_MODE ) then
+    ! wavefields on CPU
+
+    ! updates (forward) acoustic potentials
+    potential_acoustic(:) = potential_acoustic(:) &
+                          + deltat * potential_dot_acoustic(:) &
+                          + deltatsqover2 * potential_dot_dot_acoustic(:)
+    potential_dot_acoustic(:) = potential_dot_acoustic(:) &
+                              + deltatover2 * potential_dot_dot_acoustic(:)
+    potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL
+
+    ! adjoint simulations
+    if( SIMULATION_TYPE == 3 ) then
+      ! updates acoustic backward/reconstructed fields
+      if( PML_CONDITIONS )then
+        if( nglob_interface_PML_acoustic > 0 )then
+          call read_potential_on_pml_interface(b_potential_dot_dot_acoustic,b_potential_dot_acoustic,b_potential_acoustic,&
+                                               nglob_interface_PML_acoustic,b_PML_potential,b_reclen_PML_potential)
+        endif
+      endif
+      b_potential_acoustic(:) = b_potential_acoustic(:) &
+                              + b_deltat * b_potential_dot_acoustic(:) &
+                              + b_deltatsqover2 * b_potential_dot_dot_acoustic(:)
+      b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) &
+                                  + b_deltatover2 * b_potential_dot_dot_acoustic(:)
+      b_potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL
+    endif
+
+  else
+    ! wavefields on GPU
+    ! check
+    if( SIMULATION_TYPE == 3 ) then
+      if( PML_CONDITIONS )then
+        call exit_MPI(myrank,'acoustic time marching scheme with PML_CONDITIONS on GPU not implemented yet...')
+      endif
+    endif
+
+    ! updates acoustic potentials
+    call it_update_displacement_ac_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,b_deltat,b_deltatsqover2,b_deltatover2)
+  endif ! GPU_MODE
+
+  end subroutine update_displacement_acoustic
+
+
+!
+!--------------------------------------------------------------------------------------------------------------
+!
+
+
+  subroutine update_displacement_elastic()
+
+! updates elastic wavefields
+
+  use specfem_par
+  use specfem_par_elastic
+  use pml_par
+
+  implicit none
+
+  ! Newmark time marching
+
+  if( .not. GPU_MODE ) then
+    ! wavefields on CPU
+
+    ! updates elastic displacement and velocity
+    displ(:,:) = displ(:,:) + deltat*veloc(:,:) + deltatsqover2*accel(:,:)
+    veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
+    if( SIMULATION_TYPE /= 1 ) accel_adj_coupling(:,:) = accel(:,:)
+    accel(:,:) = 0._CUSTOM_REAL
+
+    ! adjoint simulations
+    if( SIMULATION_TYPE == 3 ) then
+      ! elastic backward fields
+      if(PML_CONDITIONS)then
+        if(nglob_interface_PML_elastic > 0)then
+          call read_field_on_pml_interface(b_accel,b_veloc,b_displ,nglob_interface_PML_elastic,&
+                                           b_PML_field,b_reclen_PML_field)
+        endif
+      endif
+      b_displ(:,:) = b_displ(:,:) + b_deltat*b_veloc(:,:) + b_deltatsqover2*b_accel(:,:)
+      b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
+      b_accel(:,:) = 0._CUSTOM_REAL
+    endif
+
+  else
+    ! wavefields on GPU
+
+    ! check
+    if( SIMULATION_TYPE == 3 ) then
+      if( PML_CONDITIONS )then
+        if(nglob_interface_PML_elastic > 0)then
+          call exit_MPI(myrank,'elastic time marching scheme with PML_CONDITIONS on GPU not implemented yet...')
+        endif
+      endif
+    endif
+
+    ! updates elastic displacement and velocity
+    ! Includes SIM_TYPE 1 & 3 (for noise tomography)
+    call it_update_displacement_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,b_deltat,b_deltatsqover2,b_deltatover2)
+  endif ! GPU_MODE
+
+  end subroutine update_displacement_elastic
+
+!
+!--------------------------------------------------------------------------------------------------------------
+!
+
+  subroutine update_displacement_poroelastic()
+
+! updates poroelastic wavefields
+
+  use specfem_par
+  use specfem_par_poroelastic
+
+  implicit none
+
+  ! Newmark time marching
+
+  if( .not. GPU_MODE ) then
+    ! wavefields on CPU
+
+    ! updates poroelastic displacements and velocities
+    ! solid phase
+    displs_poroelastic(:,:) = displs_poroelastic(:,:) + deltat*velocs_poroelastic(:,:) + &
+                              deltatsqover2*accels_poroelastic(:,:)
+    velocs_poroelastic(:,:) = velocs_poroelastic(:,:) + deltatover2*accels_poroelastic(:,:)
+    accels_poroelastic(:,:) = 0._CUSTOM_REAL
+
+    ! fluid phase
+    displw_poroelastic(:,:) = displw_poroelastic(:,:) + deltat*velocw_poroelastic(:,:) + &
+                              deltatsqover2*accelw_poroelastic(:,:)
+    velocw_poroelastic(:,:) = velocw_poroelastic(:,:) + deltatover2*accelw_poroelastic(:,:)
+    accelw_poroelastic(:,:) = 0._CUSTOM_REAL
+
+    ! adjoint simulations
+    if( SIMULATION_TYPE == 3 ) then
+      ! poroelastic backward fields
+      ! solid phase
+      b_displs_poroelastic(:,:) = b_displs_poroelastic(:,:) + b_deltat*b_velocs_poroelastic(:,:) + &
+                                b_deltatsqover2*b_accels_poroelastic(:,:)
+      b_velocs_poroelastic(:,:) = b_velocs_poroelastic(:,:) + b_deltatover2*b_accels_poroelastic(:,:)
+      b_accels_poroelastic(:,:) = 0._CUSTOM_REAL
+
+      ! fluid phase
+      b_displw_poroelastic(:,:) = b_displw_poroelastic(:,:) + b_deltat*b_velocw_poroelastic(:,:) + &
+                                b_deltatsqover2*b_accelw_poroelastic(:,:)
+      b_velocw_poroelastic(:,:) = b_velocw_poroelastic(:,:) + b_deltatover2*b_accelw_poroelastic(:,:)
+      b_accelw_poroelastic(:,:) = 0._CUSTOM_REAL
+    endif
+
+  else
+    ! wavefields on GPU
+    call exit_MPI(myrank,'poroelastic time marching scheme on GPU not implemented yet...')
+  endif ! GPU_MODE
+
+  end subroutine update_displacement_poroelastic
+
+

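As a rough illustration of the predictor/corrector split documented in the header of the new update_displacement_scheme.f90 above, the following standalone toy program (not part of the commit; the program name and the single-oscillator right-hand side a = -omega**2 * u are purely illustrative stand-ins for the assembled 1/M ( -K u + B + f ) of the solver) applies the same update pattern to one degree of freedom:

! toy_newmark.f90 - standalone sketch (not part of this commit) of the explicit
! Newmark predictor/corrector split used in update_displacement_scheme.f90,
! applied to a single harmonic oscillator with a = -omega**2 * u
  program toy_newmark
  implicit none
  integer, parameter :: CUSTOM_REAL = kind(1.0)
  real(kind=CUSTOM_REAL), parameter :: omega = 2.0_CUSTOM_REAL, deltat = 0.001_CUSTOM_REAL
  real(kind=CUSTOM_REAL) :: deltatover2,deltatsqover2
  real(kind=CUSTOM_REAL) :: u,v,a
  integer :: it

  deltatover2 = 0.5_CUSTOM_REAL * deltat
  deltatsqover2 = 0.5_CUSTOM_REAL * deltat * deltat

  ! initial conditions and initial acceleration
  u = 1.0_CUSTOM_REAL
  v = 0.0_CUSTOM_REAL
  a = - omega**2 * u

  do it = 1,1000
    ! predictor, as in update_displacement_elastic():
    ! full step for u, half step for v, then reset a before the force computation
    u = u + deltat*v + deltatsqover2*a
    v = v + deltatover2*a
    a = 0.0_CUSTOM_REAL

    ! stand-in for the assembled forces 1/M ( -K u + B + f )
    a = - omega**2 * u

    ! corrector: adds the remaining half acceleration contribution to v
    v = v + deltatover2*a
  enddo

  print *,'numerical u = ',u,'  analytical u = ',cos(omega*real(1000,CUSTOM_REAL)*deltat)
  end program toy_newmark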
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/write_seismograms.f90	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/write_seismograms.f90	2013-08-20 14:13:26 UTC (rev 22718)
@@ -87,204 +87,204 @@
 
   if(.not. GPU_MODE .or. (GPU_MODE .and. (.not. USE_CUDA_SEISMOGRAMS))) then
 
-  do irec_local = 1,nrec_local
+    do irec_local = 1,nrec_local
 
-    ! gets global number of that receiver
-    irec = number_receiver_global(irec_local)
+      ! gets global number of that receiver
+      irec = number_receiver_global(irec_local)
 
-    ! gets local receiver interpolators
-    ! (1-D Lagrange interpolators)
-    hxir(:) = hxir_store(irec_local,:)
-    hetar(:) = hetar_store(irec_local,:)
-    hgammar(:) = hgammar_store(irec_local,:)
+      ! gets local receiver interpolators
+      ! (1-D Lagrange interpolators)
+      hxir(:) = hxir_store(irec_local,:)
+      hetar(:) = hetar_store(irec_local,:)
+      hgammar(:) = hgammar_store(irec_local,:)
 
-    ! forward simulations
-    select case( SIMULATION_TYPE )
-    case( 1 )
+      ! forward simulations
+      select case( SIMULATION_TYPE )
+      case( 1 )
 
-      ! receiver's spectral element
-      ispec = ispec_selected_rec(irec)
+        ! receiver's spectral element
+        ispec = ispec_selected_rec(irec)
 
-      ! elastic wave field
-      if( ispec_is_elastic(ispec) ) then
-        ! interpolates displ/veloc/accel at receiver locations
-        call compute_interpolated_dva(displ,veloc,accel,NGLOB_AB, &
-                        ispec,NSPEC_AB,ibool, &
-                        xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                        hxir,hetar,hgammar, &
-                        dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
-      endif !elastic
+        ! elastic wave field
+        if( ispec_is_elastic(ispec) ) then
+          ! interpolates displ/veloc/accel at receiver locations
+          call compute_interpolated_dva(displ,veloc,accel,NGLOB_AB, &
+                          ispec,NSPEC_AB,ibool, &
+                          xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                          hxir,hetar,hgammar, &
+                          dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+        endif !elastic
 
-      ! acoustic wave field
-      if( ispec_is_acoustic(ispec) ) then
-        ! displacement vector
-        call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
-                        potential_acoustic, displ_element,&
-                        hprime_xx,hprime_yy,hprime_zz, &
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore,GRAVITY)
-        ! velocity vector
-        call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
-                        potential_dot_acoustic, veloc_element,&
-                        hprime_xx,hprime_yy,hprime_zz, &
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore,GRAVITY)
+        ! acoustic wave field
+        if( ispec_is_acoustic(ispec) ) then
+          ! displacement vector
+          call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+                          potential_acoustic, displ_element,&
+                          hprime_xx,hprime_yy,hprime_zz, &
+                          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                          ibool,rhostore,GRAVITY)
+          ! velocity vector
+          call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+                          potential_dot_acoustic, veloc_element,&
+                          hprime_xx,hprime_yy,hprime_zz, &
+                          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                          ibool,rhostore,GRAVITY)
 
-        ! interpolates displ/veloc/pressure at receiver locations
-        call compute_interpolated_dva_ac(displ_element,veloc_element,&
-                        potential_dot_dot_acoustic,potential_dot_acoustic,&
-                        potential_acoustic,NGLOB_AB, &
-                        ispec,NSPEC_AB,ibool, &
-                        xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                        hxir,hetar,hgammar, &
-                        dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
-      endif ! acoustic
+          ! interpolates displ/veloc/pressure at receiver locations
+          call compute_interpolated_dva_ac(displ_element,veloc_element,&
+                          potential_dot_dot_acoustic,potential_dot_acoustic,&
+                          potential_acoustic,NGLOB_AB, &
+                          ispec,NSPEC_AB,ibool, &
+                          xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                          hxir,hetar,hgammar, &
+                          dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+        endif ! acoustic
 
-     ! poroelastic wave field
-      if( ispec_is_poroelastic(ispec) ) then
-        ! interpolates displ/veloc/accel at receiver locations
-      !  call compute_interpolated_dva(displw_poroelastic,velocw_poroelastic,accelw_poroelastic,NGLOB_AB, &
-        call compute_interpolated_dva(displs_poroelastic,velocs_poroelastic,accels_poroelastic,NGLOB_AB, &
-                        ispec,NSPEC_AB,ibool, &
-                        xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                        hxir,hetar,hgammar, &
-                        dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
-      endif !poroelastic
+       ! poroelastic wave field
+        if( ispec_is_poroelastic(ispec) ) then
+          ! interpolates displ/veloc/accel at receiver locations
+        !  call compute_interpolated_dva(displw_poroelastic,velocw_poroelastic,accelw_poroelastic,NGLOB_AB, &
+          call compute_interpolated_dva(displs_poroelastic,velocs_poroelastic,accels_poroelastic,NGLOB_AB, &
+                          ispec,NSPEC_AB,ibool, &
+                          xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                          hxir,hetar,hgammar, &
+                          dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+        endif !poroelastic
 
-    !adjoint simulations
-    case( 2 )
+      !adjoint simulations
+      case( 2 )
 
-      ! adjoint source is placed at receiver
-      ispec = ispec_selected_source(irec)
+        ! adjoint source is placed at receiver
+        ispec = ispec_selected_source(irec)
 
-      ! elastic wave field
-      if( ispec_is_elastic(ispec) ) then
-        ! interpolates displ/veloc/accel at receiver locations
-        call compute_interpolated_dva(displ,veloc,accel,NGLOB_AB, &
-                        ispec,NSPEC_AB,ibool, &
-                        xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                        hxir,hetar,hgammar, &
-                        dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+        ! elastic wave field
+        if( ispec_is_elastic(ispec) ) then
+          ! interpolates displ/veloc/accel at receiver locations
+          call compute_interpolated_dva(displ,veloc,accel,NGLOB_AB, &
+                          ispec,NSPEC_AB,ibool, &
+                          xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                          hxir,hetar,hgammar, &
+                          dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
 
-        ! stores elements displacement field
-        do k = 1,NGLLZ
-          do j = 1,NGLLY
-            do i = 1,NGLLX
-              iglob = ibool(i,j,k,ispec)
-              displ_element(:,i,j,k) = displ(:,iglob)
+          ! stores elements displacement field
+          do k = 1,NGLLZ
+            do j = 1,NGLLY
+              do i = 1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                displ_element(:,i,j,k) = displ(:,iglob)
+              enddo
             enddo
           enddo
-        enddo
 
-        ! gets derivatives of local receiver interpolators
-        hpxir(:) = hpxir_store(irec_local,:)
-        hpetar(:) = hpetar_store(irec_local,:)
-        hpgammar(:) = hpgammar_store(irec_local,:)
+          ! gets derivatives of local receiver interpolators
+          hpxir(:) = hpxir_store(irec_local,:)
+          hpetar(:) = hpetar_store(irec_local,:)
+          hpgammar(:) = hpgammar_store(irec_local,:)
 
-        ! computes the integrated derivatives of source parameters (M_jk and X_s)
-        call compute_adj_source_frechet(displ_element,Mxx(irec),Myy(irec),Mzz(irec),&
-                      Mxy(irec),Mxz(irec),Myz(irec),eps_s,eps_m_s, &
-                      hxir,hetar,hgammar,hpxir,hpetar,hpgammar, &
-                      hprime_xx,hprime_yy,hprime_zz, &
-                      xix(:,:,:,ispec),xiy(:,:,:,ispec),xiz(:,:,:,ispec), &
-                      etax(:,:,:,ispec),etay(:,:,:,ispec),etaz(:,:,:,ispec), &
-                      gammax(:,:,:,ispec),gammay(:,:,:,ispec),gammaz(:,:,:,ispec))
+          ! computes the integrated derivatives of source parameters (M_jk and X_s)
+          call compute_adj_source_frechet(displ_element,Mxx(irec),Myy(irec),Mzz(irec),&
+                        Mxy(irec),Mxz(irec),Myz(irec),eps_s,eps_m_s, &
+                        hxir,hetar,hgammar,hpxir,hpetar,hpgammar, &
+                        hprime_xx,hprime_yy,hprime_zz, &
+                        xix(:,:,:,ispec),xiy(:,:,:,ispec),xiz(:,:,:,ispec), &
+                        etax(:,:,:,ispec),etay(:,:,:,ispec),etaz(:,:,:,ispec), &
+                        gammax(:,:,:,ispec),gammay(:,:,:,ispec),gammaz(:,:,:,ispec))
 
-        stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_src(irec),hdur_gaussian(irec))
-        stf_deltat = stf * deltat
-        Mxx_der(irec_local) = Mxx_der(irec_local) + eps_s(1,1) * stf_deltat
-        Myy_der(irec_local) = Myy_der(irec_local) + eps_s(2,2) * stf_deltat
-        Mzz_der(irec_local) = Mzz_der(irec_local) + eps_s(3,3) * stf_deltat
-        Mxy_der(irec_local) = Mxy_der(irec_local) + 2 * eps_s(1,2) * stf_deltat
-        Mxz_der(irec_local) = Mxz_der(irec_local) + 2 * eps_s(1,3) * stf_deltat
-        Myz_der(irec_local) = Myz_der(irec_local) + 2 * eps_s(2,3) * stf_deltat
+          stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_src(irec),hdur_gaussian(irec))
+          stf_deltat = stf * deltat
+          Mxx_der(irec_local) = Mxx_der(irec_local) + eps_s(1,1) * stf_deltat
+          Myy_der(irec_local) = Myy_der(irec_local) + eps_s(2,2) * stf_deltat
+          Mzz_der(irec_local) = Mzz_der(irec_local) + eps_s(3,3) * stf_deltat
+          Mxy_der(irec_local) = Mxy_der(irec_local) + 2 * eps_s(1,2) * stf_deltat
+          Mxz_der(irec_local) = Mxz_der(irec_local) + 2 * eps_s(1,3) * stf_deltat
+          Myz_der(irec_local) = Myz_der(irec_local) + 2 * eps_s(2,3) * stf_deltat
 
-        sloc_der(:,irec_local) = sloc_der(:,irec_local) + eps_m_s(:) * stf_deltat
-      endif ! elastic
+          sloc_der(:,irec_local) = sloc_der(:,irec_local) + eps_m_s(:) * stf_deltat
+        endif ! elastic
 
-      ! acoustic wave field
-      if( ispec_is_acoustic(ispec) ) then
-        ! displacement vector
-        call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
-                        potential_acoustic, displ_element,&
-                        hprime_xx,hprime_yy,hprime_zz, &
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore,GRAVITY)
-        ! velocity vector
-        call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
-                        potential_dot_acoustic, veloc_element,&
-                        hprime_xx,hprime_yy,hprime_zz, &
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore,GRAVITY)
+        ! acoustic wave field
+        if( ispec_is_acoustic(ispec) ) then
+          ! displacement vector
+          call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+                          potential_acoustic, displ_element,&
+                          hprime_xx,hprime_yy,hprime_zz, &
+                          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                          ibool,rhostore,GRAVITY)
+          ! velocity vector
+          call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+                          potential_dot_acoustic, veloc_element,&
+                          hprime_xx,hprime_yy,hprime_zz, &
+                          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                          ibool,rhostore,GRAVITY)
 
-        ! interpolates displ/veloc/pressure at receiver locations
-        call compute_interpolated_dva_ac(displ_element,veloc_element,&
-                        potential_dot_dot_acoustic,potential_dot_acoustic,&
-                        potential_acoustic,NGLOB_AB, &
-                        ispec,NSPEC_AB,ibool, &
-                        xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                        hxir,hetar,hgammar, &
-                        dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
-      endif ! acoustic
+          ! interpolates displ/veloc/pressure at receiver locations
+          call compute_interpolated_dva_ac(displ_element,veloc_element,&
+                          potential_dot_dot_acoustic,potential_dot_acoustic,&
+                          potential_acoustic,NGLOB_AB, &
+                          ispec,NSPEC_AB,ibool, &
+                          xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                          hxir,hetar,hgammar, &
+                          dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+        endif ! acoustic
 
-    !adjoint simulations
-    case( 3 )
+      !adjoint simulations
+      case( 3 )
 
-      ispec = ispec_selected_rec(irec)
+        ispec = ispec_selected_rec(irec)
 
-      ! elastic wave field
-      if( ispec_is_elastic(ispec) ) then
-        ! backward fields: interpolates displ/veloc/accel at receiver locations
-        call compute_interpolated_dva(b_displ,b_veloc,b_accel,NGLOB_ADJOINT,&
-                        ispec,NSPEC_AB,ibool, &
-                        xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                        hxir,hetar,hgammar, &
-                        dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
-      endif ! elastic
+        ! elastic wave field
+        if( ispec_is_elastic(ispec) ) then
+          ! backward fields: interpolates displ/veloc/accel at receiver locations
+          call compute_interpolated_dva(b_displ,b_veloc,b_accel,NGLOB_ADJOINT,&
+                          ispec,NSPEC_AB,ibool, &
+                          xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                          hxir,hetar,hgammar, &
+                          dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+        endif ! elastic
 
-      ! acoustic wave field
-      if( ispec_is_acoustic(ispec) ) then
-        ! backward fields: displacement vector
-        call compute_gradient(ispec,NSPEC_AB,NGLOB_ADJOINT, &
-                        b_potential_acoustic, displ_element,&
-                        hprime_xx,hprime_yy,hprime_zz, &
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore,GRAVITY)
-        ! backward fields: velocity vector
-        call compute_gradient(ispec,NSPEC_AB,NGLOB_ADJOINT, &
-                        b_potential_dot_acoustic, veloc_element,&
-                        hprime_xx,hprime_yy,hprime_zz, &
-                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore,GRAVITY)
+        ! acoustic wave field
+        if( ispec_is_acoustic(ispec) ) then
+          ! backward fields: displacement vector
+          call compute_gradient(ispec,NSPEC_AB,NGLOB_ADJOINT, &
+                          b_potential_acoustic, displ_element,&
+                          hprime_xx,hprime_yy,hprime_zz, &
+                          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                          ibool,rhostore,GRAVITY)
+          ! backward fields: velocity vector
+          call compute_gradient(ispec,NSPEC_AB,NGLOB_ADJOINT, &
+                          b_potential_dot_acoustic, veloc_element,&
+                          hprime_xx,hprime_yy,hprime_zz, &
+                          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                          ibool,rhostore,GRAVITY)
 
-        ! backward fields: interpolates displ/veloc/pressure at receiver locations
-        call compute_interpolated_dva_ac(displ_element,veloc_element,&
-                        b_potential_dot_dot_acoustic,b_potential_dot_acoustic,&
-                        b_potential_acoustic,NGLOB_ADJOINT, &
-                        ispec,NSPEC_AB,ibool, &
-                        xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                        hxir,hetar,hgammar, &
-                        dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
-      endif ! acoustic
+          ! backward fields: interpolates displ/veloc/pressure at receiver locations
+          call compute_interpolated_dva_ac(displ_element,veloc_element,&
+                          b_potential_dot_dot_acoustic,b_potential_dot_acoustic,&
+                          b_potential_acoustic,NGLOB_ADJOINT, &
+                          ispec,NSPEC_AB,ibool, &
+                          xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                          hxir,hetar,hgammar, &
+                          dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+        endif ! acoustic
 
-    end select ! SIMULATION_TYPE
+      end select ! SIMULATION_TYPE
 
-    ! store North, East and Vertical components
-    ! distinguish between single and double precision for reals
-    if(CUSTOM_REAL == SIZE_REAL) then
-      seismograms_d(:,irec_local,it) = sngl((nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd))
-      seismograms_v(:,irec_local,it) = sngl((nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd))
-      seismograms_a(:,irec_local,it) = sngl((nu(:,1,irec)*axd + nu(:,2,irec)*ayd + nu(:,3,irec)*azd))
-    else
-      seismograms_d(:,irec_local,it) = (nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd)
-      seismograms_v(:,irec_local,it) = (nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd)
-      seismograms_a(:,irec_local,it) = (nu(:,1,irec)*axd + nu(:,2,irec)*ayd + nu(:,3,irec)*azd)
-    endif
+      ! store North, East and Vertical components
+      ! distinguish between single and double precision for reals
+      if(CUSTOM_REAL == SIZE_REAL) then
+        seismograms_d(:,irec_local,it) = sngl((nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd))
+        seismograms_v(:,irec_local,it) = sngl((nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd))
+        seismograms_a(:,irec_local,it) = sngl((nu(:,1,irec)*axd + nu(:,2,irec)*ayd + nu(:,3,irec)*azd))
+      else
+        seismograms_d(:,irec_local,it) = (nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd)
+        seismograms_v(:,irec_local,it) = (nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd)
+        seismograms_a(:,irec_local,it) = (nu(:,1,irec)*axd + nu(:,2,irec)*ayd + nu(:,3,irec)*azd)
+      endif
 
-    !adjoint simulations
-    if (SIMULATION_TYPE == 2) seismograms_eps(:,:,irec_local,it) = eps_s(:,:)
+      !adjoint simulations
+      if (SIMULATION_TYPE == 2) seismograms_eps(:,:,irec_local,it) = eps_s(:,:)
 
-  enddo ! nrec_local
+    enddo ! nrec_local
 
   endif
 

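The seismogram storage in the re-indented loop above rotates the x/y/z values interpolated at each receiver into North/East/Vertical traces through the per-receiver matrix nu. A minimal standalone sketch of that rotation follows (not part of the commit; the identity orientation matrix and the sample displacement values are assumptions for illustration only):

! rotate_to_nez.f90 - standalone sketch (not part of this commit) of the
! component rotation used in write_seismograms.f90:
!   seismograms_d(:,irec_local,it) = nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd
  program rotate_to_nez
  implicit none
  double precision :: nu(3,3), d_xyz(3), d_nez(3)
  integer :: i

  ! hypothetical receiver orientation: rows of nu hold the North, East and
  ! Vertical unit vectors expressed in the mesh x/y/z frame (identity here,
  ! i.e. x = North, y = East, z = Vertical)
  nu = reshape( (/ 1.d0, 0.d0, 0.d0, &
                   0.d0, 1.d0, 0.d0, &
                   0.d0, 0.d0, 1.d0 /), (/ 3, 3 /) )

  ! interpolated displacement at the receiver (dxd, dyd, dzd in the solver)
  d_xyz = (/ 0.3d0, -0.1d0, 0.7d0 /)

  ! same linear combination as in write_seismograms.f90
  do i = 1,3
    d_nez(i) = nu(i,1)*d_xyz(1) + nu(i,2)*d_xyz(2) + nu(i,3)*d_xyz(3)
  enddo

  print *,'N,E,Z components: ',d_nez
  end program rotate_to_nez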
Modified: seismo/3D/SPECFEM3D/trunk/todo_list_please_dont_remove.txt
===================================================================
--- seismo/3D/SPECFEM3D/trunk/todo_list_please_dont_remove.txt	2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/todo_list_please_dont_remove.txt	2013-08-20 14:13:26 UTC (rev 22718)
@@ -323,7 +323,7 @@
 ------------------------------------------------
 
 + fault_solver_common:
-      - make ordered version of subroutine assemble_MPI_vector_ext_mesh, and use it in subroutine initialize_fault
+      - make ordered version of subroutine assemble_MPI_vector_blocking, and use it in subroutine initialize_fault
 
 + fault_solver_dynamic:
       - many hard-coded ad hoc features need to be set instead as user options


