[cig-commits] r20529 - in seismo/3D/SPECFEM3D/trunk: src/cuda src/generate_databases src/specfem3D utils

Thu Jul 19 14:55:39 PDT 2012

Author: danielpeter
Date: 2012-07-19 14:55:39 -0700 (Thu, 19 Jul 2012)
New Revision: 20529

Added:
   seismo/3D/SPECFEM3D/trunk/src/cuda/initialize_cuda.cu
Modified:
   seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak
   seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/generate_databases/create_regions_mesh.f90
   seismo/3D/SPECFEM3D/trunk/src/generate_databases/generate_databases.f90
   seismo/3D/SPECFEM3D/trunk/src/generate_databases/save_arrays_solver.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_kernels.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
   seismo/3D/SPECFEM3D/trunk/utils/create_specfem3D_gpu_cuda_method_stubs.pl
   seismo/3D/SPECFEM3D/trunk/utils/update_headers_change_word_f90.pl
Log:
adds cuda initialization file initialize_cuda.cu

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu
===================================================================

--- seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu	2012-07-19 21:55:39 UTC (rev 20529)
@@ -41,9 +41,455 @@
 #include "mesh_constants_cuda.h"
 #include "prepare_constants_cuda.h"
 
+/* ----------------------------------------------------------------------------------------------- */
 
+// Helper functions
+
 /* ----------------------------------------------------------------------------------------------- */
 
+double get_time()
+{
+  struct timeval t;
+  struct timezone tzp;
+  gettimeofday(&t, &tzp);
+  return t.tv_sec + t.tv_usec*1e-6;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {
+  TRACE("pause_for_debug");
+
+  pause_for_debugger(1);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void pause_for_debugger(int pause) {
+  if(pause) {
+    int myrank;
+#ifdef WITH_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+    myrank = 0;
+#endif
+    printf("I'm rank %d\n",myrank);
+    int i = 0;
+    char hostname[256];
+    gethostname(hostname, sizeof(hostname));
+    printf("PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
+    FILE *file = fopen("./attach_gdb.txt","w+");
+    if (file != NULL){
+      fprintf(file,"PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
+      fclose(file);
+    }
+    fflush(stdout);
+    while (0 == i)
+      sleep(5);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void exit_on_cuda_error(char* kernel_name) {
+  // sync and check to catch errors from previous async operations
+  cudaThreadSynchronize();
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess){
+    fprintf(stderr,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
+
+    //debugging
+    //pause_for_debugger(0);
+
+    // outputs error file
+    FILE* fp;
+    int myrank;
+    char filename[BUFSIZ];
+#ifdef WITH_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+    myrank = 0;
+#endif
+    sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+    fp = fopen(filename,"a+");
+    if (fp != NULL){
+      fprintf(fp,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
+      fclose(fp);
+    }
+
+    // stops program
+    //free(kernel_name);
+#ifdef WITH_MPI
+    MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+    exit(EXIT_FAILURE);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void exit_on_error(char* info) {
+  printf("\nERROR: %s\n",info);
+  fflush(stdout);
+
+  // outputs error file
+  FILE* fp;
+  int myrank;
+  char filename[BUFSIZ];
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+  myrank = 0;
+#endif
+  sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+  fp = fopen(filename,"a+");
+  if (fp != NULL){
+    fprintf(fp,"ERROR: %s\n",info);
+    fclose(fp);
+  }
+
+  // stops program
+#ifdef WITH_MPI
+  MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+  //free(info);
+  exit(EXIT_FAILURE);
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void print_CUDA_error_if_any(cudaError_t err, int num) {
+  if (cudaSuccess != err)
+  {
+    printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
+    fflush(stdout);
+
+    // outputs error file
+    FILE* fp;
+    int myrank;
+    char filename[BUFSIZ];
+#ifdef WITH_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+    myrank = 0;
+#endif
+    sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+    fp = fopen(filename,"a+");
+    if (fp != NULL){
+      fprintf(fp,"\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
+      fclose(fp);
+    }
+
+    // stops program
+#ifdef WITH_MPI
+    MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+    exit(EXIT_FAILURE);
+  }
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void get_free_memory(double* free_db, double* used_db, double* total_db) {
+
+  // gets memory usage in byte
+  size_t free_byte ;
+  size_t total_byte ;
+  cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
+  if ( cudaSuccess != cuda_status ){
+    printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
+    exit(EXIT_FAILURE);
+  }
+
+  *free_db = (double)free_byte ;
+  *total_db = (double)total_byte ;
+  *used_db = *total_db - *free_db ;
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Saves GPU memory usage to file
+void output_free_memory(int myrank,char* info_str) {
+
+  FILE* fp;
+  char filename[BUFSIZ];
+  double free_db,used_db,total_db;
+
+  get_free_memory(&free_db,&used_db,&total_db);
+
+  sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_device_mem_usage_proc_%06d.txt",myrank);
+  fp = fopen(filename,"a+");
+  if (fp != NULL){
+    fprintf(fp,"%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
+            used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+    fclose(fp);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable version of above method
+extern "C"
+void FC_FUNC_(output_free_device_memory,
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {
+  TRACE("output_free_device_memory");
+
+  char info[6];
+  sprintf(info,"f %d:",*myrank);
+  output_free_memory(*myrank,info);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+ void show_free_memory(char* info_str) {
+
+ // show memory usage of GPU
+ int myrank;
+ #ifdef WITH_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
+ #else
+ myrank = 0;
+ #endif
+ double free_db,used_db,total_db;
+
+ get_free_memory(&free_db,&used_db,&total_db);
+
+ printf("%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
+ used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+ }
+ */
+
+/*
+ extern "C"
+ void FC_FUNC_(show_free_device_memory,
+ SHOW_FREE_DEVICE_MEMORY)() {
+ TRACE("show_free_device_memory");
+
+ show_free_memory("from fortran");
+ }
+ */
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(get_free_device_memory,
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {
+  TRACE("get_free_device_memory");
+
+  double free_db,used_db,total_db;
+
+  get_free_memory(&free_db,&used_db,&total_db);
+
+  // converts to MB
+  *free = (realw) free_db/1024.0/1024.0;
+  *used = (realw) used_db/1024.0/1024.0;
+  *total = (realw) total_db/1024.0/1024.0;
+  return;
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+ __global__ void check_phase_ispec_kernel(int num_phase_ispec,
+ int* phase_ispec,
+ int NSPEC_AB,
+ int* ier) {
+
+ int i,ispec,iphase,count0,count1;
+ *ier = 0;
+
+ for(iphase=0; iphase < 2; iphase++){
+ count0 = 0;
+ count1 = 0;
+
+ for(i=0; i < num_phase_ispec; i++){
+ ispec = phase_ispec[iphase*num_phase_ispec + i] - 1;
+ if( ispec < -1 || ispec >= NSPEC_AB ){
+ printf("Error in d_phase_ispec_inner_elastic %d %d\n",i,ispec);
+ *ier = 1;
+ return;
+ }
+ if( ispec >= 0 ){ count0++;}
+ if( ispec < 0 ){ count1++;}
+ }
+
+ printf("check_phase_ispec done: phase %d, count = %d %d \n",iphase,count0,count1);
+
+ }
+ }
+
+ void check_phase_ispec(long* Mesh_pointer_f,int type){
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ printf("check phase_ispec for type=%d\n",type);
+
+ dim3 grid(1,1);
+ dim3 threads(1,1,1);
+
+ int* h_debug = (int*) calloc(1,sizeof(int));
+ int* d_debug;
+ cudaMalloc((void**)&d_debug,sizeof(int));
+
+ if( type == 1 ){
+ check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_elastic,
+ mp->d_phase_ispec_inner_elastic,
+ mp->NSPEC_AB,
+ d_debug);
+ }else if( type == 2 ){
+ check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_acoustic,
+ mp->d_phase_ispec_inner_acoustic,
+ mp->NSPEC_AB,
+ d_debug);
+ }
+
+ cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+ cudaFree(d_debug);
+ if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+ free(h_debug);
+ fflush(stdout);
+
+ #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("check_phase_ispec");
+ #endif
+
+ }
+ */
+
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+ __global__ void check_ispec_is_kernel(int NSPEC_AB,
+ int* ispec_is,
+ int* ier) {
+
+ int ispec,count0,count1;
+
+ *ier = 0;
+ count0 = 0;
+ count1 = 0;
+ for(ispec=0; ispec < NSPEC_AB; ispec++){
+ if( ispec_is[ispec] < -1 || ispec_is[ispec] > 1 ){
+ printf("Error in ispec_is %d %d\n",ispec,ispec_is[ispec]);
+ *ier = 1;
+ return;
+ //exit(1);
+ }
+ if( ispec_is[ispec] == 0 ){count0++;}
+ if( ispec_is[ispec] != 0 ){count1++;}
+ }
+ printf("check_ispec_is done: count = %d %d\n",count0,count1);
+ }
+
+ void check_ispec_is(long* Mesh_pointer_f,int type){
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ printf("check ispec_is for type=%d\n",type);
+
+ dim3 grid(1,1);
+ dim3 threads(1,1,1);
+
+ int* h_debug = (int*) calloc(1,sizeof(int));
+ int* d_debug;
+ cudaMalloc((void**)&d_debug,sizeof(int));
+
+ if( type == 0 ){
+ check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+ mp->d_ispec_is_inner,
+ d_debug);
+ }else if( type == 1 ){
+ check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+ mp->d_ispec_is_elastic,
+ d_debug);
+ }else if( type == 2 ){
+ check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+ mp->d_ispec_is_acoustic,
+ d_debug);
+ }
+
+ cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+ cudaFree(d_debug);
+ if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+ free(h_debug);
+ fflush(stdout);
+
+ #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("check_ispec_is");
+ #endif
+ }
+ */
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+ __global__ void check_array_ispec_kernel(int num_array_ispec,
+ int* array_ispec,
+ int NSPEC_AB,
+ int* ier) {
+
+ int i,ispec,count0,count1;
+
+ *ier = 0;
+ count0 = 0;
+ count1 = 0;
+
+ for(i=0; i < num_array_ispec; i++){
+ ispec = array_ispec[i] - 1;
+ if( ispec < -1 || ispec >= NSPEC_AB ){
+ printf("Error in d_array_ispec %d %d\n",i,ispec);
+ *ier = 1;
+ return;
+ }
+ if( ispec >= 0 ){ count0++;}
+ if( ispec < 0 ){ count1++;}
+ }
+
+ printf("check_array_ispec done: count = %d %d \n",count0,count1);
+ }
+
+ void check_array_ispec(long* Mesh_pointer_f,int type){
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ printf("check array_ispec for type=%d\n",type);
+
+ dim3 grid(1,1);
+ dim3 threads(1,1,1);
+
+ int* h_debug = (int*) calloc(1,sizeof(int));
+ int* d_debug;
+ cudaMalloc((void**)&d_debug,sizeof(int));
+
+ if( type == 1 ){
+ check_array_ispec_kernel<<<grid,threads>>>(mp->d_num_abs_boundary_faces,
+ mp->d_abs_boundary_ispec,
+ mp->NSPEC_AB,
+ d_debug);
+ }
+
+ cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+ cudaFree(d_debug);
+ if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+ free(h_debug);
+ fflush(stdout);
+
+ #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("check_array_ispec");
+ #endif
+
+ }
+ */
+
+/* ----------------------------------------------------------------------------------------------- */
+
 // Check functions
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -308,8 +754,8 @@
   // load shared mem
   unsigned int tid = threadIdx.x;
   unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  unsigned int i = tid + bx*blockDim.x;  
-  
+  unsigned int i = tid + bx*blockDim.x;
+
   // loads absolute values into shared memory
   sdata[tid] = (i < size) ? fabs(array[i]) : 0.0 ;
 
@@ -347,7 +793,7 @@
   realw max;
   realw *d_max;
 
-  max = 0;
+  max = 0.0;
 
   /* way 1 : timing Elapsed time: 8.464813e-03
    realw* h_array;
@@ -383,14 +829,16 @@
   // launch simple reduction kernel
   realw* h_max;
   int blocksize = BLOCKSIZE_TRANSFER;
-  int size_padded = ((int)ceil(((double)mp->NGLOB_AB)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;  
+
+  int size = mp->NGLOB_AB;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
-  
+
   //printf("num_blocks_x %i \n",num_blocks_x);
 
   h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
@@ -400,17 +848,11 @@
   dim3 threads(blocksize,1,1);
 
   if(*SIMULATION_TYPE == 1 ){
-    get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-                                         mp->NGLOB_AB,
-                                         d_max);
+    get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,size,d_max);
+  }else if(*SIMULATION_TYPE == 3 ){
+    get_maximum_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,size,d_max);
   }
 
-  if(*SIMULATION_TYPE == 3 ){
-    get_maximum_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
-                                         mp->NGLOB_AB,
-                                         d_max);
-  }
-
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
                                      cudaMemcpyDeviceToHost),222);
 
@@ -483,7 +925,7 @@
   // load shared mem
   unsigned int tid = threadIdx.x;
   unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  unsigned int i = tid + bx*blockDim.x;  
+  unsigned int i = tid + bx*blockDim.x;
 
   // loads values into shared memory: assume array is a vector array
   sdata[tid] = (i < size) ? sqrt(array[i*3]*array[i*3]
@@ -529,14 +971,16 @@
   // launch simple reduction kernel
   realw* h_max;
   int blocksize = BLOCKSIZE_TRANSFER;
-  int size_padded = ((int)ceil(((double)mp->NGLOB_AB)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;  
+
+  int size = mp->NGLOB_AB;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
-  
+
   //printf("num_blocks_x %i \n",num_blocks_x);
   h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
   cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
@@ -545,23 +989,17 @@
   dim3 threads(blocksize,1,1);
 
   if(*SIMULATION_TYPE == 1 ){
-    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ,
-                                                mp->NGLOB_AB,
-                                                d_max);
+    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ,size,d_max);
+  }else if(*SIMULATION_TYPE == 3 ){
+    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ,size,d_max);
   }
 
-  if(*SIMULATION_TYPE == 3 ){
-    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ,
-                                                mp->NGLOB_AB,
-                                                d_max);
-  }
-
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
   exit_on_cuda_error("kernel get_norm_elastic_from_device");
 #endif
-  
+
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
                                      cudaMemcpyDeviceToHost),222);
 
@@ -583,5 +1021,3 @@
   exit_on_cuda_error("get_norm_elastic_from_device");
 #endif
 }
-
-

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu	2012-07-19 21:55:39 UTC (rev 20529)
@@ -185,7 +185,7 @@
   // cudaEventCreate(&start);
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
-  
+
   if( *num_interfaces_ext_mesh == 0 ) return;
 
   // copies buffer onto GPU

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu	2012-07-19 21:55:39 UTC (rev 20529)
@@ -150,7 +150,7 @@
     // cudaEventDestroy( stop );
     // printf("boundary xfer d->h Time: %f ms\n",time);
   }
-  
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("transfer_boun_accel_from_device");
 #endif
@@ -170,7 +170,7 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
   if( mp->size_mpi_buffer > 0 ){
-  
+
 //daniel: todo - check below with this...
     int blocksize = BLOCKSIZE_TRANSFER;
     int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
@@ -277,7 +277,7 @@
   if( mp->size_mpi_buffer > 0 ){
     // copy on host memory
     memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_buffer*sizeof(realw));
-  
+
     // cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
     //        mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice,mp->compute_stream);
     //printf("xfer to device\n");
@@ -372,7 +372,7 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   if( mp->size_mpi_buffer > 0 ){
-  
+
     //daniel: todo - check if this copy is only needed for adjoint simulation, otherwise it is called asynchronously?
     if(*FORWARD_OR_ADJOINT == 1 ){
       // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
@@ -422,7 +422,7 @@
     // cudaEventDestroy( stop );
     // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
   }
-  
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
@@ -1636,10 +1636,10 @@
       //daniel: we use the same stream, so kernels are executed one after the other
       // synchronizes in case we run on only 1 process to avoid race-conditions
       //if( mp->NPROC == 1 ){
-      //  // Wait until previous compute stream finishes.         
-      //  cudaStreamSynchronize(mp->compute_stream);        
+      //  // Wait until previous compute stream finishes.
+      //  cudaStreamSynchronize(mp->compute_stream);
       //}
-      
+
     }
 
   }else{

Added: seismo/3D/SPECFEM3D/trunk/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/initialize_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/initialize_cuda.cu	2012-07-19 21:55:39 UTC (rev 20529)
@@ -0,0 +1,181 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 1
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and CNRS / INRIA / University of Pau
+ ! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+ !                             July 2012
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+#include "prepare_constants_cuda.h"
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// GPU initialization
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(initialize_cuda_device,
+              INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+  TRACE("initialize_cuda_device");
+
+  // Gets rank number of MPI process
+  int myrank = *myrank_f;
+
+  /*
+   // cuda initialization (needs -lcuda library)
+   // note:   cuInit initializes the driver API.
+   //             it is needed for any following CUDA driver API function call (format cuFUNCTION(..) )
+   //             however, for the CUDA runtime API functions (format cudaFUNCTION(..) )
+   //             the initialization is implicit, thus cuInit() here would not be needed...
+   CUresult status = cuInit(0);
+   if ( CUDA_SUCCESS != status ) exit_on_error("CUDA driver API device initialization failed\n");
+
+   // returns a handle to the first cuda compute device
+   CUdevice dev;
+   status = cuDeviceGet(&dev, 0);
+   if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device not found\n");
+
+   // gets device properties
+   int major,minor;
+   status = cuDeviceComputeCapability(&major,&minor,dev);
+   if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device information not found\n");
+
+   // make sure that the device has compute capability >= 1.3
+   if (major < 1){
+   fprintf(stderr,"Compute capability major number should be at least 1, got: %d \nexiting...\n",major);
+   exit_on_error("CUDA Compute capability major number should be at least 1\n");
+   }
+   if (major == 1 && minor < 3){
+   fprintf(stderr,"Compute capability should be at least 1.3, got: %d.%d \nexiting...\n",major,minor);
+   exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
+   }
+   */
+
+  // note: from here on we use the runtime API  ...
+
+  // Gets number of GPU devices
+  int device_count = 0;
+  cudaGetDeviceCount(&device_count);
+  exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\ncheck if driver and runtime libraries work together\nexiting...\n");
+
+  // returns device count to fortran
+  if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
+  *ncuda_devices = device_count;
+
+  // Sets the active device
+  if(device_count >= 1) {
+    // generalized for more GPUs per node
+    // note: without previous context release, cudaSetDevice will complain with the cuda error
+    //         "setting the device when a process is active is not allowed"
+    // releases previous contexts
+    cudaThreadExit();
+
+    //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count);
+    //MPI_Barrier(MPI_COMM_WORLD);
+
+    // sets active device
+    cudaSetDevice( myrank % device_count );
+    exit_on_cuda_error("cudaSetDevice");
+  }
+
+  // returns a handle to the active device
+  int device;
+  cudaGetDevice(&device);
+
+  // get device properties
+  struct cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp,device);
+
+  // exit if the machine has no CUDA-enabled device
+  if (deviceProp.major == 9999 && deviceProp.minor == 9999){
+    fprintf(stderr,"No CUDA-enabled device found, exiting...\n\n");
+    exit_on_error("CUDA runtime error: there is no CUDA-enabled device found\n");
+  }
+
+  // outputs device infos to file
+  char filename[BUFSIZ];
+  FILE* fp;
+  sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_device_info_proc_%06d.txt",myrank);
+  fp = fopen(filename,"a+");
+  if (fp != NULL){
+    // display device properties
+    fprintf(fp,"Device Name = %s\n",deviceProp.name);
+    fprintf(fp,"multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
+    fprintf(fp,"totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
+    fprintf(fp,"totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
+    fprintf(fp,"sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
+    fprintf(fp,"Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
+    fprintf(fp,"Maximum size of each dimension of a block: %d x %d x %d\n",
+            deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
+    fprintf(fp,"Maximum sizes of each dimension of a grid: %d x %d x %d\n",
+            deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
+    fprintf(fp,"Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
+    if(deviceProp.canMapHostMemory){
+      fprintf(fp,"canMapHostMemory: TRUE\n");
+    }else{
+      fprintf(fp,"canMapHostMemory: FALSE\n");
+    }
+    if(deviceProp.deviceOverlap){
+      fprintf(fp,"deviceOverlap: TRUE\n");
+    }else{
+      fprintf(fp,"deviceOverlap: FALSE\n");
+    }
+
+    // outputs initial memory infos via cudaMemGetInfo()
+    double free_db,used_db,total_db;
+    get_free_memory(&free_db,&used_db,&total_db);
+    fprintf(fp,"%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank,
+            used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+    fclose(fp);
+  }
+
+  // make sure that the device has compute capability >= 1.3
+  if (deviceProp.major < 1){
+    fprintf(stderr,"Compute capability major number should be at least 1, exiting...\n\n");
+    exit_on_error("CUDA Compute capability major number should be at least 1\n");
+  }
+  if (deviceProp.major == 1 && deviceProp.minor < 3){
+    fprintf(stderr,"Compute capability should be at least 1.3, exiting...\n");
+    exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
+  }
+  // we use pinned memory for asynchronous copy
+  if( ! deviceProp.canMapHostMemory){
+    fprintf(stderr,"Device capability should allow to map host memory, exiting...\n");
+    exit_on_error("CUDA Device capability canMapHostMemory should be TRUE\n");
+  }
+}

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu	2012-07-19 21:55:39 UTC (rev 20529)
@@ -107,8 +107,8 @@
   realw deltat = *deltat_F;
   realw deltatsqover2 = *deltatsqover2_F;
   realw deltatover2 = *deltatover2_F;
-  
-  //launch kernel  
+
+  //launch kernel
   UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
                                            size,deltat,deltatsqover2,deltatover2);
 
@@ -124,7 +124,7 @@
     realw b_deltat = *b_deltat_F;
     realw b_deltatsqover2 = *b_deltatsqover2_F;
     realw b_deltatover2 = *b_deltatover2_F;
-    
+
     UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
                                              size,b_deltat,b_deltatsqover2,b_deltatover2);
 
@@ -205,7 +205,7 @@
   realw deltat = *deltat_F;
   realw deltatsqover2 = *deltatsqover2_F;
   realw deltatover2 = *deltatover2_F;
-  
+
   //launch kernel
   UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_acoustic,
                                            mp->d_potential_dot_acoustic,
@@ -216,7 +216,7 @@
     realw b_deltat = *b_deltat_F;
     realw b_deltatsqover2 = *b_deltatsqover2_F;
     realw b_deltatover2 = *b_deltatover2_F;
-    
+
     UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_acoustic,
                                              mp->d_b_potential_dot_acoustic,
                                              mp->d_b_potential_dot_dot_acoustic,

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h	2012-07-19 21:55:39 UTC (rev 20529)
@@ -73,21 +73,18 @@
 // error checking after cuda function calls
 //#define ENABLE_VERY_SLOW_ERROR_CHECKING
 
-// helper functions
+// maximum function
 #define MAX(x,y)                    (((x) < (y)) ? (y) : (x))
 
+// utility functions: defined in check_fields_cuda.cu
 double get_time();
-
+void get_free_memory(double* free_db, double* used_db, double* total_db);
 void print_CUDA_error_if_any(cudaError_t err, int num);
-
 void pause_for_debugger(int pause);
-
 void exit_on_cuda_error(char* kernel_name);
-
 void exit_on_error(char* info);
+//void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y);
 
-void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y);
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // cuda constant arrays

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2012-07-19 21:55:39 UTC (rev 20529)
@@ -44,595 +44,11 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-// Helper functions
-
-/* ----------------------------------------------------------------------------------------------- */
-
-double get_time()
-{
-  struct timeval t;
-  struct timezone tzp;
-  gettimeofday(&t, &tzp);
-  return t.tv_sec + t.tv_usec*1e-6;
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {
-TRACE("pause_for_debug");
-
-  pause_for_debugger(1);
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-void pause_for_debugger(int pause) {
-  if(pause) {
-    int myrank;
-#ifdef WITH_MPI
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-#else
-    myrank = 0;
-#endif
-    printf("I'm rank %d\n",myrank);
-    int i = 0;
-    char hostname[256];
-    gethostname(hostname, sizeof(hostname));
-    printf("PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
-    FILE *file = fopen("./attach_gdb.txt","w+");
-    if (file != NULL){
-      fprintf(file,"PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
-      fclose(file);
-    }
-    fflush(stdout);
-    while (0 == i)
-      sleep(5);
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-void exit_on_cuda_error(char* kernel_name) {
-  // sync and check to catch errors from previous async operations
-  cudaThreadSynchronize();
-  cudaError_t err = cudaGetLastError();
-  if (err != cudaSuccess){
-    fprintf(stderr,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
-
-    //debugging
-    //pause_for_debugger(0);
-
-    // outputs error file
-    FILE* fp;
-    int myrank;
-    char filename[BUFSIZ];  
-#ifdef WITH_MPI
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-#else
-    myrank = 0;
-#endif  
-    sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
-    fp = fopen(filename,"a+");
-    if (fp != NULL){
-      fprintf(fp,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
-      fclose(fp);
-    }
-    
-    // stops program
-    //free(kernel_name);
-#ifdef WITH_MPI
-    MPI_Abort(MPI_COMM_WORLD,1);
-#endif
-    exit(EXIT_FAILURE);
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-void exit_on_error(char* info)
-{
-  printf("\nERROR: %s\n",info);
-  fflush(stdout);
-  
-  // outputs error file
-  FILE* fp;
-  int myrank;
-  char filename[BUFSIZ];  
-#ifdef WITH_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-#else
-  myrank = 0;
-#endif  
-  sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
-  fp = fopen(filename,"a+");
-  if (fp != NULL){
-    fprintf(fp,"ERROR: %s\n",info);
-    fclose(fp);
-  }
-  
-  // stops program
-#ifdef WITH_MPI
-  MPI_Abort(MPI_COMM_WORLD,1);
-#endif
-  //free(info);
-  exit(EXIT_FAILURE);
-  return;
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-void print_CUDA_error_if_any(cudaError_t err, int num)
-{
-  if (cudaSuccess != err)
-  {
-    printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
-    fflush(stdout);
-
-    // outputs error file
-    FILE* fp;
-    int myrank;
-    char filename[BUFSIZ];  
-#ifdef WITH_MPI
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-#else
-    myrank = 0;
-#endif  
-    sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
-    fp = fopen(filename,"a+");
-    if (fp != NULL){
-      fprintf(fp,"\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
-      fclose(fp);
-    }
-
-    // stops program
-#ifdef WITH_MPI
-    MPI_Abort(MPI_COMM_WORLD,1);
-#endif
-    exit(EXIT_FAILURE);
-  }
-  return;
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-void get_free_memory(double* free_db, double* used_db, double* total_db) {
-
-  // gets memory usage in byte
-  size_t free_byte ;
-  size_t total_byte ;
-  cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
-  if ( cudaSuccess != cuda_status ){
-    printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
-    exit(EXIT_FAILURE);
-  }
-
-  *free_db = (double)free_byte ;
-  *total_db = (double)total_byte ;
-  *used_db = *total_db - *free_db ;
-  return;
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// Saves GPU memory usage to file
-void output_free_memory(int myrank,char* info_str) {
-
-  FILE* fp;
-  char filename[BUFSIZ];
-  double free_db,used_db,total_db;
-
-  get_free_memory(&free_db,&used_db,&total_db);
-
-  sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_device_mem_usage_proc_%06d.txt",myrank);
-  fp = fopen(filename,"a+");
-  if (fp != NULL){
-    fprintf(fp,"%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
-            used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
-    fclose(fp);
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// Fortran-callable version of above method
-extern "C"
-void FC_FUNC_(output_free_device_memory,
-              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {
-TRACE("output_free_device_memory");
-
-  char info[6];
-  sprintf(info,"f %d:",*myrank);
-  output_free_memory(*myrank,info);
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-/*
-void show_free_memory(char* info_str) {
-
-  // show memory usage of GPU
-  int myrank;
-#ifdef WITH_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
-#else
-  myrank = 0;
-#endif
-  double free_db,used_db,total_db;
-
-  get_free_memory(&free_db,&used_db,&total_db);
-
-  printf("%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
-   used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
-
-}
-*/
-
-/*
-extern "C"
-void FC_FUNC_(show_free_device_memory,
-              SHOW_FREE_DEVICE_MEMORY)() {
- TRACE("show_free_device_memory");
-
- show_free_memory("from fortran");
-}
-*/
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-extern "C"
-void FC_FUNC_(get_free_device_memory,
-              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {
-TRACE("get_free_device_memory");
-
-  double free_db,used_db,total_db;
-
-  get_free_memory(&free_db,&used_db,&total_db);
-
-  // converts to MB
-  *free = (realw) free_db/1024.0/1024.0;
-  *used = (realw) used_db/1024.0/1024.0;
-  *total = (realw) total_db/1024.0/1024.0;
-  return;
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-//daniel: helper function
-/*
-__global__ void check_phase_ispec_kernel(int num_phase_ispec,
-                                         int* phase_ispec,
-                                         int NSPEC_AB,
-                                         int* ier) {
-
-  int i,ispec,iphase,count0,count1;
-  *ier = 0;
-
-  for(iphase=0; iphase < 2; iphase++){
-    count0 = 0;
-    count1 = 0;
-
-    for(i=0; i < num_phase_ispec; i++){
-      ispec = phase_ispec[iphase*num_phase_ispec + i] - 1;
-      if( ispec < -1 || ispec >= NSPEC_AB ){
-        printf("Error in d_phase_ispec_inner_elastic %d %d\n",i,ispec);
-        *ier = 1;
-        return;
-      }
-      if( ispec >= 0 ){ count0++;}
-      if( ispec < 0 ){ count1++;}
-    }
-
-    printf("check_phase_ispec done: phase %d, count = %d %d \n",iphase,count0,count1);
-
-  }
-}
-
-void check_phase_ispec(long* Mesh_pointer_f,int type){
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  printf("check phase_ispec for type=%d\n",type);
-
-  dim3 grid(1,1);
-  dim3 threads(1,1,1);
-
-  int* h_debug = (int*) calloc(1,sizeof(int));
-  int* d_debug;
-  cudaMalloc((void**)&d_debug,sizeof(int));
-
-  if( type == 1 ){
-    check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_elastic,
-                                             mp->d_phase_ispec_inner_elastic,
-                                             mp->NSPEC_AB,
-                                             d_debug);
-  }else if( type == 2 ){
-    check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_acoustic,
-                                               mp->d_phase_ispec_inner_acoustic,
-                                               mp->NSPEC_AB,
-                                               d_debug);
-  }
-
-  cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
-  cudaFree(d_debug);
-  if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
-  free(h_debug);
-  fflush(stdout);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("check_phase_ispec");
-#endif
-
-}
-*/
-
-/* ----------------------------------------------------------------------------------------------- */
-//daniel: helper function
-/*
-__global__ void check_ispec_is_kernel(int NSPEC_AB,
-                                      int* ispec_is,
-                                      int* ier) {
-
-  int ispec,count0,count1;
-
-  *ier = 0;
-  count0 = 0;
-  count1 = 0;
-  for(ispec=0; ispec < NSPEC_AB; ispec++){
-    if( ispec_is[ispec] < -1 || ispec_is[ispec] > 1 ){
-      printf("Error in ispec_is %d %d\n",ispec,ispec_is[ispec]);
-      *ier = 1;
-      return;
-      //exit(1);
-    }
-    if( ispec_is[ispec] == 0 ){count0++;}
-    if( ispec_is[ispec] != 0 ){count1++;}
-  }
-  printf("check_ispec_is done: count = %d %d\n",count0,count1);
-}
-
-void check_ispec_is(long* Mesh_pointer_f,int type){
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  printf("check ispec_is for type=%d\n",type);
-
-  dim3 grid(1,1);
-  dim3 threads(1,1,1);
-
-  int* h_debug = (int*) calloc(1,sizeof(int));
-  int* d_debug;
-  cudaMalloc((void**)&d_debug,sizeof(int));
-
-  if( type == 0 ){
-    check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
-                                            mp->d_ispec_is_inner,
-                                            d_debug);
-  }else if( type == 1 ){
-    check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
-                                            mp->d_ispec_is_elastic,
-                                            d_debug);
-  }else if( type == 2 ){
-    check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
-                                            mp->d_ispec_is_acoustic,
-                                            d_debug);
-  }
-
-  cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
-  cudaFree(d_debug);
-  if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
-  free(h_debug);
-  fflush(stdout);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("check_ispec_is");
-#endif
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-//daniel: helper function
-/*
-__global__ void check_array_ispec_kernel(int num_array_ispec,
-                                         int* array_ispec,
-                                         int NSPEC_AB,
-                                         int* ier) {
-
-  int i,ispec,count0,count1;
-
-  *ier = 0;
-  count0 = 0;
-  count1 = 0;
-
-  for(i=0; i < num_array_ispec; i++){
-    ispec = array_ispec[i] - 1;
-    if( ispec < -1 || ispec >= NSPEC_AB ){
-      printf("Error in d_array_ispec %d %d\n",i,ispec);
-      *ier = 1;
-      return;
-    }
-    if( ispec >= 0 ){ count0++;}
-    if( ispec < 0 ){ count1++;}
-  }
-
-  printf("check_array_ispec done: count = %d %d \n",count0,count1);
-}
-
-void check_array_ispec(long* Mesh_pointer_f,int type){
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  printf("check array_ispec for type=%d\n",type);
-
-  dim3 grid(1,1);
-  dim3 threads(1,1,1);
-
-  int* h_debug = (int*) calloc(1,sizeof(int));
-  int* d_debug;
-  cudaMalloc((void**)&d_debug,sizeof(int));
-
-  if( type == 1 ){
-    check_array_ispec_kernel<<<grid,threads>>>(mp->d_num_abs_boundary_faces,
-                                               mp->d_abs_boundary_ispec,
-                                               mp->NSPEC_AB,
-                                               d_debug);
-  }
-
-  cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
-  cudaFree(d_debug);
-  if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
-  free(h_debug);
-  fflush(stdout);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("check_array_ispec");
-#endif
-
-}
-*/
-
-/* ----------------------------------------------------------------------------------------------- */
-
 // GPU preparation
 
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(prepare_cuda_device,
-              PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
-  TRACE("prepare_cuda_device");
-
-  // Gets rank number of MPI process
-  int myrank = *myrank_f;
-
-/*
-  // cuda initialization (needs -lcuda library)
-  // note:   cuInit initializes the driver API.
-  //             it is needed for any following CUDA driver API function call (format cuFUNCTION(..) )
-  //             however, for the CUDA runtime API functions (format cudaFUNCTION(..) )
-  //             the initialization is implicit, thus cuInit() here would not be needed...
-  CUresult status = cuInit(0);
-  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA driver API device initialization failed\n");
-
-  // returns a handle to the first cuda compute device
-  CUdevice dev;
-  status = cuDeviceGet(&dev, 0);
-  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device not found\n");
-
-  // gets device properties
-  int major,minor;
-  status = cuDeviceComputeCapability(&major,&minor,dev);
-  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device information not found\n");
-
-  // make sure that the device has compute capability >= 1.3
-  if (major < 1){
-    fprintf(stderr,"Compute capability major number should be at least 1, got: %d \nexiting...\n",major);
-    exit_on_error("CUDA Compute capability major number should be at least 1\n");
-  }
-  if (major == 1 && minor < 3){
-    fprintf(stderr,"Compute capability should be at least 1.3, got: %d.%d \nexiting...\n",major,minor);
-    exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
-  }
-*/
-
-  // note: from here on we use the runtime API  ...
-
-  // Gets number of GPU devices
-  int device_count = 0;
-  cudaGetDeviceCount(&device_count);
-  exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\ncheck if driver and runtime libraries work together\nexiting...\n");
-
-  // returns device count to fortran
-  if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
-  *ncuda_devices = device_count;
-
-  // Sets the active device
-  if(device_count >= 1) {
-    // generalized for more GPUs per node
-    // note: without previous context release, cudaSetDevice will complain with the cuda error
-    //         "setting the device when a process is active is not allowed"
-    // releases previous contexts
-    cudaThreadExit();
-
-    //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count);
-    //MPI_Barrier(MPI_COMM_WORLD);
-
-    // sets active device
-    cudaSetDevice( myrank % device_count );
-    exit_on_cuda_error("cudaSetDevice");
-  }
-
-  // returns a handle to the active device
-  int device;
-  cudaGetDevice(&device);
-
-  // get device properties
-  struct cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp,device);
-
-  // exit if the machine has no CUDA-enabled device
-  if (deviceProp.major == 9999 && deviceProp.minor == 9999){
-    fprintf(stderr,"No CUDA-enabled device found, exiting...\n\n");
-    exit_on_error("CUDA runtime error: there is no CUDA-enabled device found\n");
-  }
-
-  // outputs device infos to file
-  char filename[BUFSIZ];
-  FILE* fp;
-  sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_device_info_proc_%06d.txt",myrank);
-  fp = fopen(filename,"a+");
-  if (fp != NULL){
-    // display device properties
-    fprintf(fp,"Device Name = %s\n",deviceProp.name);
-    fprintf(fp,"multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
-    fprintf(fp,"totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
-    fprintf(fp,"totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
-    fprintf(fp,"sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
-    fprintf(fp,"Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
-    fprintf(fp,"Maximum size of each dimension of a block: %d x %d x %d\n",
-            deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
-    fprintf(fp,"Maximum sizes of each dimension of a grid: %d x %d x %d\n",
-            deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
-    fprintf(fp,"Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
-    if(deviceProp.canMapHostMemory){
-      fprintf(fp,"canMapHostMemory: TRUE\n");
-    }else{
-      fprintf(fp,"canMapHostMemory: FALSE\n");
-    }
-    if(deviceProp.deviceOverlap){
-      fprintf(fp,"deviceOverlap: TRUE\n");
-    }else{
-      fprintf(fp,"deviceOverlap: FALSE\n");
-    }
-
-    // outputs initial memory infos via cudaMemGetInfo()
-    double free_db,used_db,total_db;
-    get_free_memory(&free_db,&used_db,&total_db);
-    fprintf(fp,"%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank,
-            used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
-
-    fclose(fp);
-  }
-
-  // make sure that the device has compute capability >= 1.3
-  if (deviceProp.major < 1){
-    fprintf(stderr,"Compute capability major number should be at least 1, exiting...\n\n");
-    exit_on_error("CUDA Compute capability major number should be at least 1\n");
-  }
-  if (deviceProp.major == 1 && deviceProp.minor < 3){
-    fprintf(stderr,"Compute capability should be at least 1.3, exiting...\n");
-    exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
-  }
-  // we use pinned memory for asynchronous copy
-  if( ! deviceProp.canMapHostMemory){
-    fprintf(stderr,"Device capability should allow to map host memory, exiting...\n");
-    exit_on_error("CUDA Device capability canMapHostMemory should be TRUE\n");
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
 void FC_FUNC_(prepare_constants_device,
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
                                         int* h_NGLLX,
@@ -793,7 +209,7 @@
   // device
   int size_mpi_buffer = 3 * (mp->num_interfaces_ext_mesh) * (mp->max_nibool_interfaces_ext_mesh);
   // send buffer
-  mp->size_mpi_buffer = size_mpi_buffer;  
+  mp->size_mpi_buffer = size_mpi_buffer;
   if( mp->size_mpi_buffer > 0 ){
     print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
     mp->send_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
@@ -803,7 +219,7 @@
 
     // receive buffer
     print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
-    mp->recv_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));  
+    mp->recv_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
   }
 
   //daniel: check if needed
@@ -812,16 +228,16 @@
   //mp->request_send_vector_ext_mesh = request_send_vector_ext_mesh;
   //mp->request_recv_vector_ext_mesh = request_recv_vector_ext_mesh;
   //mp->buffer_recv_vector_ext_mesh = buffer_recv_vector_ext_mesh;
-  
+
   // setup two streams, one for compute and one for host<->device memory copies
   // compute stream
   cudaStreamCreate(&mp->compute_stream);
   // copy stream (needed to transfer mpi buffers)
   if( mp->size_mpi_buffer > 0 ){
     cudaStreamCreate(&mp->copy_stream);
-    //cudaStreamCreate(&mp->b_copy_stream);  
+    //cudaStreamCreate(&mp->b_copy_stream);
   }
-  
+
   // inner elements
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,mp->NSPEC_AB*sizeof(int)),1205);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner,
@@ -968,7 +384,7 @@
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
                       (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),2004);
   }
-  
+
   // mass matrix
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
@@ -1210,7 +626,7 @@
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
                                       mp->size_mpi_buffer*sizeof(realw)),4004);
   }
-  
+
   // mass matrix
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005);
   // transfer element data

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-07-19 21:55:39 UTC (rev 20529)
@@ -5,9 +5,9 @@
 !               ---------------------------------------
 !
 !          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and CNRS / INRIA / University of Pau
-! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
-!                             July 2012
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
 !
 ! This program is free software; you can redistribute it and/or modify
 ! it under the terms of the GNU General Public License as published by
@@ -39,6 +39,17 @@
 // src/cuda/check_fields_cuda.cu
 //
 
+void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {}
+
+void FC_FUNC_(output_free_device_memory,
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
+
+ void FC_FUNC_(show_free_device_memory,
+ SHOW_FREE_DEVICE_MEMORY)() {}
+
+void FC_FUNC_(get_free_device_memory,
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
+
 void FC_FUNC_(check_max_norm_displ_gpu,
               CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
 
@@ -76,8 +87,8 @@
 
 void FC_FUNC_(get_norm_elastic_from_device,
               GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
-                                                 long* Mesh_pointer_f,
-                                                 int* SIMULATION_TYPE) {}
+                                            long* Mesh_pointer_f,
+                                            int* SIMULATION_TYPE) {}
 
 
 //
@@ -353,6 +364,17 @@
 
 
 //
+// src/cuda/initialize_cuda.cu
+//
+
+void FC_FUNC_(initialize_cuda_device,
+              INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+ fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
+ exit(1);
+}
+
+
+//
 // src/cuda/it_update_displacement_cuda.cu
 //
 
@@ -407,23 +429,6 @@
 // src/cuda/prepare_mesh_constants_cuda.cu
 //
 
-void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {}
-
-void FC_FUNC_(output_free_device_memory,
-              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
-
-void FC_FUNC_(show_free_device_memory,
-              SHOW_FREE_DEVICE_MEMORY)() {}
-
-void FC_FUNC_(get_free_device_memory,
-              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
-
-void FC_FUNC_(prepare_cuda_device,
-              PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
- fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
- exit(1);
-}
-
 void FC_FUNC_(prepare_constants_device,
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
                                         int* h_NGLLX,

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak	2012-07-19 21:55:39 UTC (rev 20529)
@@ -5,9 +5,9 @@
 !               ---------------------------------------
 !
 !          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and University of Pau / CNRS / INRIA
-! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
-!                            April 2011
+!    Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+!                             July 2012
 !
 ! This program is free software; you can redistribute it and/or modify
 ! it under the terms of the GNU General Public License as published by

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu	2012-07-19 21:55:39 UTC (rev 20529)
@@ -55,6 +55,7 @@
 
 */
 
+/* ----------------------------------------------------------------------------------------------- */
 
 // Initially sets the blocks_x to be the num_blocks, and adds rows as
 // needed. If an additional row is added, the row length is cut in

Modified: seismo/3D/SPECFEM3D/trunk/src/generate_databases/create_regions_mesh.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/generate_databases/create_regions_mesh.f90	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/generate_databases/create_regions_mesh.f90	2012-07-19 21:55:39 UTC (rev 20529)
@@ -334,53 +334,10 @@
     write(IMAIN,*) '  ...saving databases'
   endif
   !call create_name_database(prname,myrank,LOCAL_PATH)
-  call save_arrays_solver_ext_mesh(nspec,nglob_dummy, &
-!                        xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore,&
-!                        gammaxstore,gammaystore,gammazstore, &
-!                        jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, &
-!                        rhostore,kappastore,mustore, &
-!                        rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, &
-!                        rho_vpI,rho_vpII,rho_vsI, &
-!                        rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, &
-                        OCEANS, &
-!                        rmass_ocean_load,NGLOB_OCEAN, &
-                        ibool, &
-!                        xstore_dummy,ystore_dummy,zstore_dummy, &
-!                        abs_boundary_normal,abs_boundary_jacobian2Dw, &
-!                        abs_boundary_ijk,abs_boundary_ispec,num_abs_boundary_faces, &
-!                        free_surface_normal,free_surface_jacobian2Dw, &
-!                        free_surface_ijk,free_surface_ispec, &
-!                        num_free_surface_faces, &
-!                        coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
-!                        coupling_ac_el_ijk,coupling_ac_el_ispec, &
-!                        num_coupling_ac_el_faces, &
-!                        coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, &
-!                        coupling_ac_po_ijk,coupling_ac_po_ispec, &
-!                        num_coupling_ac_po_faces, &
-!                        coupling_el_po_normal,coupling_el_po_jacobian2Dw, &
-!                        coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, &
-!                        coupling_po_el_ispec,num_coupling_el_po_faces, &
+  call save_arrays_solver_ext_mesh(nspec,nglob_dummy,OCEANS,ibool, &
                         num_interfaces_ext_mesh,my_neighbours_ext_mesh,nibool_interfaces_ext_mesh, &
                         max_interface_size_ext_mesh,ibool_interfaces_ext_mesh, &
-!                        prname, &
-                        SAVE_MESH_FILES, &
-                        ANISOTROPY &
-!                        NSPEC_ANISO, &
-!                        c11store,c12store,c13store,c14store,c15store,c16store, &
-!                        c22store,c23store,c24store,c25store,c26store,c33store, &
-!                        c34store,c35store,c36store,c44store,c45store,c46store, &
-!                        c55store,c56store,c66store, &
-!                        ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
-!                        ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, &
-!                        nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, &
-!                        num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
-!                        num_phase_ispec_elastic,phase_ispec_inner_elastic, &
-!                        num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic, &
-!                        num_colors_outer_acoustic,num_colors_inner_acoustic, &
-!                        num_elem_colors_acoustic, &
-!                        num_colors_outer_elastic,num_colors_inner_elastic, &
-!                        num_elem_colors_elastic, &
-                      )
+                        SAVE_MESH_FILES,ANISOTROPY)
 
 ! saves moho surface
   if( SAVE_MOHO_MESH ) then

Modified: seismo/3D/SPECFEM3D/trunk/src/generate_databases/generate_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/generate_databases/generate_databases.f90	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/generate_databases/generate_databases.f90	2012-07-19 21:55:39 UTC (rev 20529)
@@ -933,15 +933,6 @@
   endif
   call create_regions_mesh()
 
-! now done inside create_regions_mesh_ext routine...
-! now done inside create_regions_mesh_ext routine...
-! Moho boundary parameters, 2-D jacobians and normals
-!  if( SAVE_MOHO_MESH ) then
-!    call create_regions_mesh_save_moho(myrank,nglob,NSPEC_AB, &
-!                        nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho, &
-!                        nodes_coords_ext_mesh,nnodes_ext_mesh,ibool )
-!  endif
-
 ! print min and max of topography included
   min_elevation = HUGEVAL
   max_elevation = -HUGEVAL

Modified: seismo/3D/SPECFEM3D/trunk/src/generate_databases/save_arrays_solver.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/generate_databases/save_arrays_solver.f90	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/generate_databases/save_arrays_solver.f90	2012-07-19 21:55:39 UTC (rev 20529)
@@ -27,181 +27,38 @@
 
 ! for external mesh
 
-  subroutine save_arrays_solver_ext_mesh(nspec,nglob, &
-!                    xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore, &
-!                    gammaxstore,gammaystore,gammazstore, &
-!                    jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, &
-!                    rhostore,kappastore,mustore, &
-!                    rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, &
-!                    rho_vpI,rho_vpII,rho_vsI, &
-!                    rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, &
-                    OCEANS, &
-!                    rmass_ocean_load,NGLOB_OCEAN,&
-                    ibool, &
-!                    xstore_dummy,ystore_dummy,zstore_dummy, &
-!                    abs_boundary_normal,abs_boundary_jacobian2Dw, &
-!                    abs_boundary_ijk,abs_boundary_ispec, &
-!                    num_abs_boundary_faces, &
-!                    free_surface_normal,free_surface_jacobian2Dw, &
-!                    free_surface_ijk,free_surface_ispec, &
-!                    num_free_surface_faces, &
-!                    coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
-!                    coupling_ac_el_ijk,coupling_ac_el_ispec, &
-!                    num_coupling_ac_el_faces, &
-!                    coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, &
-!                    coupling_ac_po_ijk,coupling_ac_po_ispec, &
-!                    num_coupling_ac_po_faces, &
-!                    coupling_el_po_normal,coupling_el_po_jacobian2Dw, &
-!                    coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, &
-!                    coupling_po_el_ispec,num_coupling_el_po_faces, &
+  subroutine save_arrays_solver_ext_mesh(nspec,nglob,OCEANS,ibool, &
                     num_interfaces_ext_mesh,my_neighbours_ext_mesh,nibool_interfaces_ext_mesh, &
                     max_interface_size_ext_mesh,ibool_interfaces_ext_mesh, &
-!                    prname, &
-                    SAVE_MESH_FILES, &
-                    ANISOTROPY &
-!                    NSPEC_ANISO, &
-!                    c11store,c12store,c13store,c14store,c15store,c16store, &
-!                    c22store,c23store,c24store,c25store,c26store,c33store, &
-!                    c34store,c35store,c36store,c44store,c45store,c46store, &
-!                    c55store,c56store,c66store, &
-!                    ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
-!                    ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, &
-!                    nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, &
-!                    num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
-!                    num_phase_ispec_elastic,phase_ispec_inner_elastic, &
-!                    num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic, &
-!                    num_colors_outer_acoustic,num_colors_inner_acoustic, &
-!                    num_elem_colors_acoustic, &
-!                    num_colors_outer_elastic,num_colors_inner_elastic, &
-!                    num_elem_colors_elastic, &
-              )
+                    SAVE_MESH_FILES,ANISOTROPY)
 
   use create_regions_mesh_ext_par
 
   implicit none
 
-!  include "constants.h"
-
   integer :: nspec,nglob
-
-! jacobian
-!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: xixstore,xiystore,xizstore, &
-!            etaxstore,etaystore,etazstore,gammaxstore,gammaystore,gammazstore,jacobianstore
-!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vp,rho_vs
-
-! attenuation
-!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: qmu_attenuation_store
-
-! material
-!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rhostore,kappastore,mustore
-!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: etastore,phistore,tortstore
-!  real(kind=CUSTOM_REAL), dimension(2,NGLLX,NGLLY,NGLLZ,nspec) :: rhoarraystore
-!  real(kind=CUSTOM_REAL), dimension(3,NGLLX,NGLLY,NGLLZ,nspec) :: kappaarraystore
-!  real(kind=CUSTOM_REAL), dimension(6,NGLLX,NGLLY,NGLLZ,nspec) :: permstore
-!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vpI,rho_vpII,rho_vsI
-!  real(kind=CUSTOM_REAL), dimension(nglob) :: rmass,rmass_acoustic, &
-!            rmass_solid_poroelastic,rmass_fluid_poroelastic
-! ocean load
+  ! ocean load
   logical :: OCEANS
-!  integer :: NGLOB_OCEAN
-!  real(kind=CUSTOM_REAL),dimension(NGLOB_OCEAN) :: rmass_ocean_load
-
-! mesh coordinates
+  ! mesh coordinates
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
-!  real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy
-
-! absorbing boundary surface
-!  integer :: num_abs_boundary_faces
-!  real(kind=CUSTOM_REAL) :: abs_boundary_normal(NDIM,NGLLSQUARE,num_abs_boundary_faces)
-!  real(kind=CUSTOM_REAL) :: abs_boundary_jacobian2Dw(NGLLSQUARE,num_abs_boundary_faces)
-!  integer :: abs_boundary_ijk(3,NGLLSQUARE,num_abs_boundary_faces)
-!  integer :: abs_boundary_ispec(num_abs_boundary_faces)
-
-! free surface
-!  integer :: num_free_surface_faces
-!  real(kind=CUSTOM_REAL) :: free_surface_normal(NDIM,NGLLSQUARE,num_free_surface_faces)
-!  real(kind=CUSTOM_REAL) :: free_surface_jacobian2Dw(NGLLSQUARE,num_free_surface_faces)
-!  integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces)
-!  integer :: free_surface_ispec(num_free_surface_faces)
-
-! acoustic-elastic coupling surface
-!  integer :: num_coupling_ac_el_faces
-!  real(kind=CUSTOM_REAL) :: coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces)
-!  real(kind=CUSTOM_REAL) :: coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces)
-!  integer :: coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces)
-!  integer :: coupling_ac_el_ispec(num_coupling_ac_el_faces)
-
-! acoustic-poroelastic coupling surface
-!  integer :: num_coupling_ac_po_faces
-!  real(kind=CUSTOM_REAL) :: coupling_ac_po_normal(NDIM,NGLLSQUARE,num_coupling_ac_po_faces)
-!  real(kind=CUSTOM_REAL) :: coupling_ac_po_jacobian2Dw(NGLLSQUARE,num_coupling_ac_po_faces)
-!  integer :: coupling_ac_po_ijk(3,NGLLSQUARE,num_coupling_ac_po_faces)
-!  integer :: coupling_ac_po_ispec(num_coupling_ac_po_faces)
-
-! elastic-poroelastic coupling surface
-!  integer :: num_coupling_el_po_faces
-!  real(kind=CUSTOM_REAL) :: coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces)
-!  real(kind=CUSTOM_REAL) :: coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces)
-!  integer :: coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
-!  integer :: coupling_po_el_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
-!  integer :: coupling_el_po_ispec(num_coupling_el_po_faces)
-!  integer :: coupling_po_el_ispec(num_coupling_el_po_faces)
-
-! MPI interfaces
+  ! MPI interfaces
   integer :: num_interfaces_ext_mesh
   integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
   integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
   integer :: max_interface_size_ext_mesh
   integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
-  integer :: max_nibool_interfaces_ext_mesh
 
-! file name
-!  character(len=256) prname
   logical :: SAVE_MESH_FILES
-
-! anisotropy
   logical :: ANISOTROPY
-!  integer :: NSPEC_ANISO
-!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-!            c11store,c12store,c13store,c14store,c15store,c16store, &
-!            c22store,c23store,c24store,c25store,c26store,c33store, &
-!            c34store,c35store,c36store,c44store,c45store,c46store, &
-!            c55store,c56store,c66store
 
-! material domain flags
-!  logical, dimension(nspec) :: ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic
-
-! inner/outer elements
-!  logical,dimension(nspec) :: ispec_is_inner
-!  integer :: nspec_inner_acoustic,nspec_outer_acoustic
-!  integer :: nspec_inner_elastic,nspec_outer_elastic
-!  integer :: nspec_inner_poroelastic,nspec_outer_poroelastic
-
-!  integer :: num_phase_ispec_acoustic
-!  integer,dimension(num_phase_ispec_acoustic,2) :: phase_ispec_inner_acoustic
-
-!  integer :: num_phase_ispec_elastic
-!  integer,dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-!  integer :: num_phase_ispec_poroelastic
-!  integer,dimension(num_phase_ispec_poroelastic,2) :: phase_ispec_inner_poroelastic
-
-  ! mesh coloring
-!  integer :: num_colors_outer_acoustic,num_colors_inner_acoustic
-!  integer, dimension(num_colors_outer_acoustic + num_colors_inner_acoustic) :: &
-!    num_elem_colors_acoustic
-!  integer :: num_colors_outer_elastic,num_colors_inner_elastic
-!  integer, dimension(num_colors_outer_elastic + num_colors_inner_elastic) :: &
-!    num_elem_colors_elastic
-
-! local parameters
+  ! local parameters
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: v_tmp
   integer,dimension(:),allocatable :: v_tmp_i
 
-  !real(kind=CUSTOM_REAL) :: minimum(1)
   integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
+  integer :: max_nibool_interfaces_ext_mesh
+
   integer :: ier,i
-!  logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION
   character(len=256) :: filename
 
   integer, dimension(:), allocatable :: iglob_tmp

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in	2012-07-19 21:55:39 UTC (rev 20529)
@@ -29,14 +29,6 @@
 
 # @configure_input@
 
-## example:
-# CUDA_LIBS = -L/apps/eiger/Cuda-4.0/cuda/lib64 -lcuda -lcudart -lcublas
-# CUDA_INC = -I/apps/eiger/Cuda-4.0/cuda/include
-# MPI_INC = -I/apps/eiger/mvapich2/1.5.1p1/mvapich2-gnu/include
-##
-#CUDA_LIBS= -L/u/dpeter/install/cuda/lib64 -lcudart -lcublas
-#MPI_INC= -I/usr/local/openmpi/1.4.3/gcc/x86_64/include
-
 # CUDA
 # with configure: ./configure --with-cuda CUDA_LIB=.. CUDA_INC=.. MPI_INC=..
 
@@ -150,6 +142,7 @@
 	$O/compute_kernels_cuda.cuda.o \
 	$O/compute_stacey_acoustic_cuda.cuda.o \
 	$O/compute_stacey_elastic_cuda.cuda.o \
+	$O/initialize_cuda.cuda.o \
 	$O/it_update_displacement_cuda.cuda.o \
 	$O/noise_tomography_cuda.cuda.o \
 	$O/prepare_mesh_constants_cuda.cuda.o \

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_kernels.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_kernels.f90	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_kernels.f90	2012-07-19 21:55:39 UTC (rev 20529)
@@ -287,7 +287,7 @@
                        + deltat * dot_product(accelw_poroelastic(:,iglob), b_displw_poroelastic(:,iglob))
 
                   ! kernel for viscous damping, see e.g. Morency et al. (2009),
-                  ! equation (42)      
+                  ! equation (42)
                   eta_kl(i,j,k,ispec) =  eta_kl(i,j,k,ispec) &
                        + deltat * dot_product(velocw_poroelastic(:,iglob), b_displw_poroelastic(:,iglob))
 
@@ -320,7 +320,7 @@
                   B_kl(i,j,k,ispec) = B_kl(i,j,k,ispec) &
                        + deltat * (9 * epsilons_trace_over_3(i,j,k,ispec) &
                        * b_epsilons_trace_over_3(i,j,k,ispec))
-                
+
                   C_kl(i,j,k,ispec) = C_kl(i,j,k,ispec) &
                        + deltat * (9 * epsilons_trace_over_3(i,j,k,ispec) &
                        * b_epsilonw_trace_over_3(i,j,k,ispec) &

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90	2012-07-19 21:55:39 UTC (rev 20529)
@@ -348,6 +348,7 @@
   use specfem_par_acoustic
   use specfem_par_poroelastic
   implicit none
+  ! local parameters
   integer :: ncuda_devices,ncuda_devices_min,ncuda_devices_max
 
   ! GPU_MODE now defined in Par_file
@@ -372,7 +373,7 @@
   endif
 
   ! initializes GPU and outputs info to files for all processes
-  call prepare_cuda_device(myrank,ncuda_devices)
+  call initialize_cuda_device(myrank,ncuda_devices)
 
   ! collects min/max of local devices found for statistics
   call sync_all()

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90	2012-07-19 21:55:39 UTC (rev 20529)
@@ -723,7 +723,7 @@
 !=====================================================================
 
   subroutine it_print_elapsed_time()
-  
+
     use specfem_par
     use specfem_par_elastic
     use specfem_par_acoustic

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90	2012-07-19 21:55:39 UTC (rev 20529)
@@ -33,7 +33,7 @@
   use specfem_par_elastic
   use specfem_par_poroelastic
   use specfem_par_movie
-  implicit none  
+  implicit none
   ! local parameters
   double precision :: tCPU
 
@@ -103,8 +103,8 @@
     !daniel debug: time estimation
     ! elastic elements: time per element t_per_element = 1.40789368e-05 s
     ! total time = nspec * nstep * t_per_element
-  endif  
-  
+  endif
+
   end subroutine prepare_timerun
 
 !
@@ -117,7 +117,7 @@
   use specfem_par_acoustic
   use specfem_par_elastic
   use specfem_par_poroelastic
-  use specfem_par_movie  
+  use specfem_par_movie
   implicit none
 
   ! flag for any movie simulation

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90	2012-07-19 21:55:39 UTC (rev 20529)
@@ -38,7 +38,7 @@
   real(kind=CUSTOM_REAL) :: rhol,mul,kappal
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: weights_kernel
   real(kind=CUSTOM_REAL) :: rhol_s,rhol_f,rhol_bar,phil,tortl
-  real(kind=CUSTOM_REAL) :: mul_s,kappal_s
+  real(kind=CUSTOM_REAL) :: kappal_s ! mul_s
   real(kind=CUSTOM_REAL) :: kappal_f,etal_f
   real(kind=CUSTOM_REAL) :: mul_fr,kappal_fr
   real(kind=CUSTOM_REAL) :: permlxx,permlxy,permlxz,permlyz,permlyy,permlzz

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90	2012-07-19 21:55:39 UTC (rev 20529)
@@ -53,7 +53,7 @@
     write(IMAIN,*)
     write(IMAIN,*) 'found a total of ',nrec_tot_found,' receivers in all the slices'
     if(NSOURCES > 1) write(IMAIN,*) 'Using ',NSOURCES,' point sources'
-    write(IMAIN,*)    
+    write(IMAIN,*)
   endif
 
   end subroutine setup_sources_receivers

Modified: seismo/3D/SPECFEM3D/trunk/utils/create_specfem3D_gpu_cuda_method_stubs.pl
===================================================================
--- seismo/3D/SPECFEM3D/trunk/utils/create_specfem3D_gpu_cuda_method_stubs.pl	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/utils/create_specfem3D_gpu_cuda_method_stubs.pl	2012-07-19 21:55:39 UTC (rev 20529)
@@ -107,7 +107,7 @@
       # function declaration
       if($line =~ /{/){
         # function declaration ends
-        if( $line =~ /PREPARE_CUDA_DEVICE/ ){
+        if( $line =~ /INITIALIZE_CUDA_DEVICE/ ){
           # adds warning
           print IOUT "$line \n$warning\} \n\n";
         }else{

Modified: seismo/3D/SPECFEM3D/trunk/utils/update_headers_change_word_f90.pl
===================================================================
--- seismo/3D/SPECFEM3D/trunk/utils/update_headers_change_word_f90.pl	2012-07-18 22:19:20 UTC (rev 20528)
+++ seismo/3D/SPECFEM3D/trunk/utils/update_headers_change_word_f90.pl	2012-07-19 21:55:39 UTC (rev 20529)
@@ -16,40 +16,40 @@
 # in the code (which is dangerous, but really easier to program...)
 #
 
-      @objects = `ls ../src/*/*.f90 ../src/*/*.F90 ../src/*/*.c ../src/*/*.cu ../src/*/*.h.in ../src/*/*.h`;
+ at objects = `ls ../src/*/*.f90 ../src/*/*.F90 ../src/*/*.c ../src/*/*.cu ../src/*/*.h.in ../src/*/*.h`;
 
-      foreach $name (@objects) {
-            chop $name;
-# change tabs to white spaces
-            system("expand -2 < $name > _____tutu01_____");
-            $f90name = $name;
-            print STDOUT "Changing word in file $name ...\n";
+foreach $name (@objects) {
+  chop $name;
+  # change tabs to white spaces
+  system("expand -2 < $name > _____tutu01_____");
+  $f90name = $name;
+  print STDOUT "Changing word in file $name ...\n";
 
-            open(FILEF77,"<_____tutu01_____");
-            open(FILEF90,">$f90name");
+  open(FILEF77,"<_____tutu01_____");
+  open(FILEF90,">$f90name");
 
-# open the source file
-      while($line = <FILEF77>) {
-            chop $line;
+  # open the source file
+  while($line = <FILEF77>) {
+    chop $line;
 
-# suppress trailing white spaces and carriage return
-      $line =~ s/\s*$//;
+    # suppress trailing white spaces and carriage return
+    $line =~ s/\s*$//;
 
-# change the version number and copyright information
-      $line =~ s#!                            April 2011#!                             July 2012#og;
-      $line =~ s#!    Princeton University, USA and University of Pau / CNRS / INRIA#!    Princeton University, USA and CNRS / INRIA / University of Pau#og;
-      $line =~ s#! \(c\) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA#! \(c\) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau#og;
-#     $line =~ s#rmass_sigma#rmass_time_integral_of_sigma#og;
+    # change the version number and copyright information
+    #$line =~ s#!                            April 2011#!                             July 2012#og;
+    #$line =~ s#!    Princeton University, USA and University of Pau / CNRS / INRIA#!    Princeton University, USA and CNRS / INRIA / University of Pau#og;
+    #$line =~ s#! \(c\) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA#! \(c\) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau#og;
+    #     $line =~ s#rmass_sigma#rmass_time_integral_of_sigma#og;
 
-# write the modified line to the output file
-      print FILEF90 "$line\n";
+    # write the modified line to the output file
+    print FILEF90 "$line\n";
 
   }
 
-            close(FILEF77);
-            close(FILEF90);
+  close(FILEF77);
+  close(FILEF90);
 
-      }
+}
 
-            system("rm -f _____tutu01_____");
+system("rm -f _____tutu01_____");