[cig-commits] r19145 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER: examples/waterlayered_halfspace/in_data_files src/cuda src/generate_databases src/meshfem3D src/shared src/specfem3D

danielpeter at geodynamics.org danielpeter at geodynamics.org
Wed Nov 2 20:11:45 PDT 2011


Author: danielpeter
Date: 2011-11-02 20:11:45 -0700 (Wed, 02 Nov 2011)
New Revision: 19145

Added:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_perm_color.f90
Modified:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/CMTSOLUTION
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/Par_file
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/STATIONS
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/Makefile.in
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_regions_mesh.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/save_arrays_solver.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_VTK_data.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/create_color_image.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/specfem3D_par.f90
Log:
adds mesh coloring option

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/CMTSOLUTION
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/CMTSOLUTION	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/CMTSOLUTION	2011-11-03 03:11:45 UTC (rev 19145)
@@ -3,7 +3,7 @@
 time shift:       0.0000
 half duration:    2.0
 latitude:       67000.0
-longitude:      67000.0
+longitude:      66999.9
 depth:           1.0
 Mrr:       1.000000e+23
 Mtt:       1.000000e+23

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/Par_file	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/Par_file	2011-11-03 03:11:45 UTC (rev 19145)
@@ -13,8 +13,8 @@
 NPROC                           = 4
 
 # time step parameters
-NSTEP                           = 4500
-DT                              = 0.005
+NSTEP                           = 4000
+DT                              = 0.01
 
 # parameters describing the model
 OCEANS                          = .false.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/STATIONS
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/STATIONS	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/waterlayered_halfspace/in_data_files/STATIONS	2011-11-03 03:11:45 UTC (rev 19145)
@@ -1,5 +1,13 @@
 X10 DB 67000.00 10767.86 0.0 100.0
 X20 DB 67000.00 22732.14 0.0 100.0
 X30 DB 67000.00 34696.43 0.0 100.0
+X40 DB 67000.00 46660.71 0.0 100.0
+X50 DB 67000.00 58625.00 0.0 100.0
+X50 DB 67000.00 67000.00 0.0 100.0
+X60 DB 67000.00 75375.00 0.0 100.0
+X70 DB 67000.00 87339.29 0.0 100.0
+X80 DB 67000.00 99303.57 0.0 100.0
+X90 DB 67000.00 111269.86 0.0 100.0
 X1 DB 67000.00 0.0 0.0 100.0
-X2 DB 67000.00 67000.0 0.0 25000.0
\ No newline at end of file
+X2 DB 67000.00 67000.0 0.0 25000.0
+X3 DB 67000.00 134000.0 0.0 100.0
\ No newline at end of file

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu	2011-11-03 03:11:45 UTC (rev 19145)
@@ -239,159 +239,43 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase, int SIMULATION_TYPE);
+//void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase, int SIMULATION_TYPE);
 
-__global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,int* d_phase_ispec_inner_acoustic,
-                                       int num_phase_ispec_acoustic, int d_iphase,
-                                       float* d_potential_acoustic, float* d_potential_dot_dot_acoustic,
-                                       float* d_xix, float* d_xiy, float* d_xiz, float* d_etax, float* d_etay, float* d_etaz,
-                                       float* d_gammax, float* d_gammay, float* d_gammaz,
-                                       float* hprime_xx, float* hprime_yy, float* hprime_zz,
-                                       float* hprimewgll_xx, float* hprimewgll_yy, float* hprimewgll_zz,
-                                       float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
-                                       float* d_rhostore);
+//__global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,int* d_phase_ispec_inner_acoustic,
+//                                       int num_phase_ispec_acoustic, int d_iphase,
+//                                       float* d_potential_acoustic, float* d_potential_dot_dot_acoustic,
+//                                       float* d_xix, float* d_xiy, float* d_xiz, float* d_etax, float* d_etay, float* d_etaz,
+//                                       float* d_gammax, float* d_gammay, float* d_gammaz,
+//                                       float* hprime_xx, float* hprime_yy, float* hprime_zz,
+//                                       float* hprimewgll_xx, float* hprimewgll_yy, float* hprimewgll_zz,
+//                                       float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
+//                                       float* d_rhostore);
 
 
-/* ----------------------------------------------------------------------------------------------- */
 
-// main compute_forces_acoustic CUDA routine
 
 /* ----------------------------------------------------------------------------------------------- */
 
-extern "C"
-void FC_FUNC_(compute_forces_acoustic_cuda,
-              COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
-                                            int* iphase,
-                                            int* nspec_outer_acoustic,
-                                            int* nspec_inner_acoustic,
-                                            int* SIMULATION_TYPE) {
-
-TRACE("compute_forces_acoustic_cuda");
-  //double start_time = get_time();
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-
-  int num_elements;
-
-  if( *iphase == 1 )
-    num_elements = *nspec_outer_acoustic;
-  else
-    num_elements = *nspec_inner_acoustic;
-
-  if( num_elements == 0 ) return;
-
-  //int myrank;
-  /* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
-  /* if(myrank==0) { */
-
-  Kernel_2_acoustic(num_elements, mp, *iphase, *SIMULATION_TYPE);
-
-  //cudaThreadSynchronize();
-
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  /* MPI_Barrier(MPI_COMM_WORLD); */
-  //double end_time = get_time();
-  //printf("Elapsed time: %e\n",end_time-start_time);
-//#endif
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
 /* KERNEL 2 */
 
 /* ----------------------------------------------------------------------------------------------- */
 
-void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase, int SIMULATION_TYPE)
-{
 
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("before acoustic kernel Kernel 2");
-#endif
+__global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,
+                                       int NGLOB, int* d_ibool,
+                                       int* d_phase_ispec_inner_acoustic,
+                                       int num_phase_ispec_acoustic,
+                                       int d_iphase,
+                                       int use_mesh_coloring_gpu,
+                                       float* d_potential_acoustic, float* d_potential_dot_dot_acoustic,
+                                       float* d_xix, float* d_xiy, float* d_xiz,
+                                       float* d_etax, float* d_etay, float* d_etaz,
+                                       float* d_gammax, float* d_gammay, float* d_gammaz,
+                                       float* hprime_xx, float* hprime_yy, float* hprime_zz,
+                                       float* hprimewgll_xx, float* hprimewgll_yy, float* hprimewgll_zz,
+                                       float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
+                                       float* d_rhostore){
 
-    /* if the grid can handle the number of blocks, we let it be 1D */
-    /* grid_2_x = nb_elem_color; */
-    /* nb_elem_color is just how many blocks we are computing now */
-
-    int num_blocks_x = nb_blocks_to_compute;
-    int num_blocks_y = 1;
-    while(num_blocks_x > 65535) {
-      num_blocks_x = ceil(num_blocks_x/2.0);
-      num_blocks_y = num_blocks_y*2;
-    }
-
-    int threads_2 = 128;//BLOCK_SIZE_K2;
-    dim3 grid_2(num_blocks_x,num_blocks_y);
-
-
-    // Cuda timing
-    // cudaEvent_t start, stop;
-    // float time;
-    // cudaEventCreate(&start);
-    // cudaEventCreate(&stop);
-    // cudaEventRecord( start, 0 );
-
-    Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
-                                                           mp->NGLOB_AB, mp->d_ibool,
-                                                           mp->d_phase_ispec_inner_acoustic,
-                                                           mp->num_phase_ispec_acoustic, d_iphase,
-                                                           mp->d_potential_acoustic, mp->d_potential_dot_dot_acoustic,
-                                                           mp->d_xix, mp->d_xiy, mp->d_xiz,
-                                                           mp->d_etax, mp->d_etay, mp->d_etaz,
-                                                           mp->d_gammax, mp->d_gammay, mp->d_gammaz,
-                                                           mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
-                                                           mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
-                                                           mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
-                                                           mp->d_rhostore);
-
-    if(SIMULATION_TYPE == 3) {
-      Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
-                                                            mp->NGLOB_AB, mp->d_ibool,
-                                                            mp->d_phase_ispec_inner_acoustic,
-                                                            mp->num_phase_ispec_acoustic, d_iphase,
-                                                            mp->d_b_potential_acoustic, mp->d_b_potential_dot_dot_acoustic,
-                                                            mp->d_xix, mp->d_xiy, mp->d_xiz,
-                                                            mp->d_etax, mp->d_etay, mp->d_etaz,
-                                                            mp->d_gammax, mp->d_gammay, mp->d_gammaz,
-                                                            mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
-                                                            mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
-                                                            mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
-                                                            mp->d_rhostore);
-    }
-
-    // cudaEventRecord( stop, 0 );
-    // cudaEventSynchronize( stop );
-    // cudaEventElapsedTime( &time, start, stop );
-    // cudaEventDestroy( start );
-    // cudaEventDestroy( stop );
-    // printf("Kernel2 Execution Time: %f ms\n",time);
-
-    /* cudaThreadSynchronize(); */
-    /* TRACE("Kernel 2 finished"); */
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //printf("Tried to start with %dx1 blocks\n",nb_blocks_to_compute);
-  exit_on_cuda_error("kernel Kernel_2");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-/* KERNEL 2 on device*/
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-__global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
-                                      int* d_phase_ispec_inner_acoustic,
-                                      int num_phase_ispec_acoustic, int d_iphase,
-                                      float* d_potential_acoustic, float* d_potential_dot_dot_acoustic,
-                                      float* d_xix, float* d_xiy, float* d_xiz, float* d_etax, float* d_etay, float* d_etaz,
-                                      float* d_gammax, float* d_gammay, float* d_gammaz,
-                                      float* hprime_xx, float* hprime_yy, float* hprime_zz,
-                                      float* hprimewgll_xx, float* hprimewgll_yy, float* hprimewgll_zz,
-                                      float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
-                                      float* d_rhostore){
-
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
   int tx = threadIdx.x;
 
@@ -428,16 +312,27 @@
 // copy from global memory to shared memory
 // each thread writes one of the NGLL^3 = 125 data points
     if (active) {
-      // iphase-1 and working_element-1 for Fortran->C array conventions
-      working_element = d_phase_ispec_inner_acoustic[bx + num_phase_ispec_acoustic*(d_iphase-1)]-1;
+
+#ifdef USE_MESH_COLORING_GPU
+      working_element = bx;
+#else
+      //mesh coloring
+      if( use_mesh_coloring_gpu ){
+        working_element = bx;
+      }else{
+        // iphase-1 and working_element-1 for Fortran->C array conventions
+        working_element = d_phase_ispec_inner_acoustic[bx + num_phase_ispec_acoustic*(d_iphase-1)]-1;
+      }
+#endif
+
       // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
       iglob = d_ibool[working_element*125 + tx]-1;
 
 #ifdef USE_TEXTURES
-        s_dummy_loc[tx] = tex1Dfetch(tex_potential_acoustic, iglob);
+      s_dummy_loc[tx] = tex1Dfetch(tex_potential_acoustic, iglob);
 #else
   // changing iglob indexing to match fortran row changes fast style
-        s_dummy_loc[tx] = d_potential_acoustic[iglob];
+      s_dummy_loc[tx] = d_potential_acoustic[iglob];
 #endif
     }
 
@@ -589,14 +484,25 @@
         d_potential_dot_dot_acoustic[iglob] = tex1Dfetch(tex_potential_dot_dot_acoustic, iglob)
                                               - (fac1*temp1l + fac2*temp2l + fac3*temp3l);
 #else
-  /* OLD version that uses coloring to get around race condition. About 1.6x faster */
-      // d_accel[iglob*3] -= (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
-      // d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
-      // d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
 
+#ifdef USE_MESH_COLORING_GPU
+      // no atomic operation needed, colors don't share global points between elements
+      d_potential_dot_dot_acoustic[iglob] -= (fac1*temp1l + fac2*temp2l + fac3*temp3l);
+#else
+      //mesh coloring
+      if( use_mesh_coloring_gpu ){
+
+        // no atomic operation needed, colors don't share global points between elements
+        d_potential_dot_dot_acoustic[iglob] -= (fac1*temp1l + fac2*temp2l + fac3*temp3l);
+
+      }else{
+
         atomicAdd(&d_potential_dot_dot_acoustic[iglob],-(fac1*temp1l + fac2*temp2l + fac3*temp3l));
 
+      }
 #endif
+
+#endif
     }
 
 #else  // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
@@ -607,6 +513,220 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
+                       int SIMULATION_TYPE,
+                       int* d_ibool,
+                       float* d_xix,
+                       float* d_xiy,
+                       float* d_xiz,
+                       float* d_etax,
+                       float* d_etay,
+                       float* d_etaz,
+                       float* d_gammax,
+                       float* d_gammay,
+                       float* d_gammaz,
+                       float* d_rhostore)
+{
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("before acoustic kernel Kernel 2");
+#endif
+
+  /* if the grid can handle the number of blocks, we let it be 1D */
+  /* grid_2_x = nb_elem_color; */
+  /* nb_elem_color is just how many blocks we are computing now */
+
+  int num_blocks_x = nb_blocks_to_compute;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  int threads_2 = 128;//BLOCK_SIZE_K2;
+  dim3 grid_2(num_blocks_x,num_blocks_y);
+
+
+  // Cuda timing
+  // cudaEvent_t start, stop;
+  // float time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+
+  Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+                                                        mp->NGLOB_AB,
+                                                        d_ibool,
+                                                        mp->d_phase_ispec_inner_acoustic,
+                                                        mp->num_phase_ispec_acoustic,
+                                                        d_iphase,
+                                                        mp->use_mesh_coloring_gpu,
+                                                        mp->d_potential_acoustic, mp->d_potential_dot_dot_acoustic,
+                                                        d_xix, d_xiy, d_xiz,
+                                                        d_etax, d_etay, d_etaz,
+                                                        d_gammax, d_gammay, d_gammaz,
+                                                        mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+                                                        mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+                                                        mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                        d_rhostore);
+
+  if(SIMULATION_TYPE == 3) {
+    Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+                                                          mp->NGLOB_AB,
+                                                          d_ibool,
+                                                          mp->d_phase_ispec_inner_acoustic,
+                                                          mp->num_phase_ispec_acoustic,
+                                                          d_iphase,
+                                                          mp->use_mesh_coloring_gpu,
+                                                          mp->d_b_potential_acoustic, mp->d_b_potential_dot_dot_acoustic,
+                                                          d_xix, d_xiy, d_xiz,
+                                                          d_etax, d_etay, d_etaz,
+                                                          d_gammax, d_gammay, d_gammaz,
+                                                          mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+                                                          mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+                                                          mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                          d_rhostore);
+  }
+
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("Kernel2 Execution Time: %f ms\n",time);
+
+  /* cudaThreadSynchronize(); */
+  /* TRACE("Kernel 2 finished"); */
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("Tried to start with %dx1 blocks\n",nb_blocks_to_compute);
+  exit_on_cuda_error("kernel Kernel_2");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// main compute_forces_acoustic CUDA routine
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_forces_acoustic_cuda,
+              COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+                                            int* iphase,
+                                            int* nspec_outer_acoustic,
+                                            int* nspec_inner_acoustic,
+                                            int* SIMULATION_TYPE) {
+
+  TRACE("compute_forces_acoustic_cuda");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int num_elements;
+
+  if( *iphase == 1 )
+    num_elements = *nspec_outer_acoustic;
+  else
+    num_elements = *nspec_inner_acoustic;
+
+  if( num_elements == 0 ) return;
+
+  //int myrank;
+  /* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
+  /* if(myrank==0) { */
+
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+
+    // note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements
+    //         and followed by acoustic ones.
+    //         acoustic elements also start with outer than inner element ordering
+
+    int nb_colors,nb_blocks_to_compute;
+    int istart;
+    int color_offset,color_offset_nonpadded;
+
+    // sets up color loop
+    if( *iphase == 1 ){
+      // outer elements
+      nb_colors = mp->num_colors_outer_acoustic;
+      istart = 0;
+
+      // array offsets (acoustic elements start after elastic ones)
+      color_offset = mp->nspec_elastic * NGLL3_PADDED;
+      color_offset_nonpadded = mp->nspec_elastic * NGLL3_NONPADDED;
+    }else{
+      // inner element colors (start after outer elements)
+      nb_colors = mp->num_colors_outer_acoustic + mp->num_colors_inner_acoustic;
+      istart = mp->num_colors_outer_acoustic;
+
+      // array offsets (inner elements start after outer ones)
+      color_offset = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_PADDED;
+      color_offset_nonpadded = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_NONPADDED;
+    }
+
+    // loops over colors
+    for(int icolor = istart; icolor < nb_colors; icolor++){
+
+      nb_blocks_to_compute = mp->h_num_elem_colors_acoustic[icolor];
+
+      // checks
+      //if( nb_blocks_to_compute <= 0 ){
+      //  printf("error number of acoustic color blocks: %d -- color = %d \n",nb_blocks_to_compute,icolor);
+      //  exit(EXIT_FAILURE);
+      //}
+
+      Kernel_2_acoustic(nb_blocks_to_compute,mp,*iphase,
+                         *SIMULATION_TYPE,
+                         mp->d_ibool + color_offset_nonpadded,
+                         mp->d_xix + color_offset,
+                         mp->d_xiy + color_offset,
+                         mp->d_xiz + color_offset,
+                         mp->d_etax + color_offset,
+                         mp->d_etay + color_offset,
+                         mp->d_etaz + color_offset,
+                         mp->d_gammax + color_offset,
+                         mp->d_gammay + color_offset,
+                         mp->d_gammaz + color_offset,
+                         mp->d_rhostore + color_offset);
+
+      // for padded and aligned arrays
+      color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+      // for no-aligned arrays
+      color_offset_nonpadded += nb_blocks_to_compute * NGLL3_NONPADDED;
+    }
+
+  }else{
+
+    // no mesh coloring: uses atomic updates
+
+    Kernel_2_acoustic(num_elements, mp, *iphase,
+                      *SIMULATION_TYPE,
+                      mp->d_ibool,
+                      mp->d_xix,
+                      mp->d_xiy,
+                      mp->d_xiz,
+                      mp->d_etax,
+                      mp->d_etay,
+                      mp->d_etaz,
+                      mp->d_gammax,
+                      mp->d_gammay,
+                      mp->d_gammaz,
+                      mp->d_rhostore);
+
+  }
+
+  //cudaThreadSynchronize();
+
+  //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  /* MPI_Barrier(MPI_COMM_WORLD); */
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  //#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
 /* KERNEL 3 */
 
 /* ----------------------------------------------------------------------------------------------- */

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-11-03 03:11:45 UTC (rev 19145)
@@ -51,30 +51,28 @@
 __constant__ float d_wgllwgll_yz[NGLL2];
 
 
-void Kernel_2(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
-        int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION);
-
+//void Kernel_2(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
+//        int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION);
 //__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
 //                            int num_phase_ispec_elastic, int d_iphase, int* d_ibool);
+//__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
+//                              int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,
+//                              float* d_displ, float* d_accel,
+//                              float* d_xix, float* d_xiy, float* d_xiz,
+//                              float* d_etax, float* d_etay, float* d_etaz,
+//                              float* d_gammax, float* d_gammay, float* d_gammaz,
+//                              float* d_kappav, float* d_muv,
+//                              //float* d_debug,
+//                              int COMPUTE_AND_STORE_STRAIN,
+//                              float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
+//                              float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,
+//                              int SIMULATION_TYPE,
+//                              int ATTENUATION,int NSPEC,
+//                              float* one_minus_sum_beta,float* factor_common,
+//                              float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+//                              float* alphaval,float* betaval,float* gammaval);
 
-__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
-                              int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,
-                              float* d_displ, float* d_accel,
-                              float* d_xix, float* d_xiy, float* d_xiz,
-                              float* d_etax, float* d_etay, float* d_etaz,
-                              float* d_gammax, float* d_gammay, float* d_gammaz,
-                              float* d_kappav, float* d_muv,
-                              //float* d_debug,
-                              int COMPUTE_AND_STORE_STRAIN,
-                              float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
-                              float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,
-                              int SIMULATION_TYPE,
-                              int ATTENUATION,int NSPEC,
-                              float* one_minus_sum_beta,float* factor_common,
-                              float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
-                              float* alphaval,float* betaval,float* gammaval);
 
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // prepares a device array with with all inter-element edge-nodes -- this
@@ -173,9 +171,10 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 __global__ void assemble_boundary_accel_on_device(float* d_accel, float* d_send_accel_buffer,
-             int num_interfaces_ext_mesh, int max_nibool_interfaces_ext_mesh,
-             int* d_nibool_interfaces_ext_mesh,
-             int* d_ibool_interfaces_ext_mesh) {
+                                                  int num_interfaces_ext_mesh,
+                                                  int max_nibool_interfaces_ext_mesh,
+                                                  int* d_nibool_interfaces_ext_mesh,
+                                                  int* d_ibool_interfaces_ext_mesh) {
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
   //int bx = blockIdx.y*gridDim.x+blockIdx.x;
@@ -183,7 +182,7 @@
   int iinterface=0;
 
   for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
-    if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
+    if(id < d_nibool_interfaces_ext_mesh[iinterface]) {
 
       // for testing atomic operations against not atomic operations (0.1ms vs. 0.04 ms)
       // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)] +=
@@ -228,7 +227,7 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   cudaMemcpy(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
-             3* *max_nibool_interfaces_ext_mesh* *num_interfaces_ext_mesh*sizeof(realw), cudaMemcpyHostToDevice);
+             3*(*max_nibool_interfaces_ext_mesh)*(*num_interfaces_ext_mesh)*sizeof(realw), cudaMemcpyHostToDevice);
 
   int blocksize = 256;
   int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
@@ -278,49 +277,34 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// KERNEL 2
 
-extern "C"
-void FC_FUNC_(compute_forces_elastic_cuda,
-              COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f,
-                                           int* iphase,
-                                           int* nspec_outer_elastic,
-                                           int* nspec_inner_elastic,
-                                           int* SIMULATION_TYPE,
-                                           int* COMPUTE_AND_STORE_STRAIN,
-                                           int* ATTENUATION) {
+/* ----------------------------------------------------------------------------------------------- */
 
-TRACE("compute_forces_elastic_cuda");
-// EPIK_TRACER("compute_forces_elastic_cuda");
-//printf("Running compute_forces\n");
-  //double start_time = get_time();
+/* ----------------------------------------------------------------------------------------------- */
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+//__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
+//                            int num_phase_ispec_elastic, int d_iphase, int* d_ibool) {
+//  int bx = blockIdx.x;
+//  int tx = threadIdx.x;
+//  int working_element;
+//  //int ispec;
+//  //int NGLL3_ALIGN = 128;
+//  if(tx==0 && bx==0) {
+//
+//    d_debug_output[0] = 420.0;
+//
+//    d_debug_output[2] = num_phase_ispec_elastic;
+//    d_debug_output[3] = d_iphase;
+//    working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
+//    d_debug_output[4] = working_element;
+//    d_debug_output[5] = d_phase_ispec_inner_elastic[0];
+//    /* d_debug_output[1] = d_ibool[working_element*NGLL3_ALIGN + tx]-1; */
+//  }
+//  /* d_debug_output[1+tx+128*bx] = 69.0; */
+//
+//}
 
-  int num_elements;
-
-  if( *iphase == 1 )
-    num_elements = *nspec_outer_elastic;
-  else
-    num_elements = *nspec_inner_elastic;
-
-  if( num_elements == 0 ) return;
-
-  //int myrank;
-  /* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
-  /* if(myrank==0) { */
-
-  Kernel_2(num_elements,mp,*iphase,*COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION);
-
-  //cudaThreadSynchronize();
-
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-/* MPI_Barrier(MPI_COMM_WORLD); */
-  //double end_time = get_time();
-  //printf("Elapsed time: %e\n",end_time-start_time);
-//#endif
-}
-
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // updates stress
@@ -434,168 +418,9 @@
   return;
 }
 
-/* ----------------------------------------------------------------------------------------------- */
 
-// KERNEL 2
-
 /* ----------------------------------------------------------------------------------------------- */
 
-void Kernel_2(int nb_blocks_to_compute,
-              Mesh* mp,
-              int d_iphase,
-              int COMPUTE_AND_STORE_STRAIN,
-              int SIMULATION_TYPE,
-              int ATTENUATION){
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("before kernel Kernel 2");
-#endif
-
-  /* if the grid can handle the number of blocks, we let it be 1D */
-  /* grid_2_x = nb_elem_color; */
-  /* nb_elem_color is just how many blocks we are computing now */
-
-  int num_blocks_x = nb_blocks_to_compute;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  //int threads_2 = 128;//BLOCK_SIZE_K2;
-  //dim3 grid_2(num_blocks_x,num_blocks_y);
-
-  int blocksize = 128;
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-
-  // debugging
-  //printf("Starting with grid %dx%d for %d blocks\n",num_blocks_x,num_blocks_y,nb_blocks_to_compute);
-//  float* d_debug;
-//    float* h_debug;
-//    h_debug = (float*)calloc(128,sizeof(float));
-//    cudaMalloc((void**)&d_debug,128*sizeof(float));
-//    cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-
-  // Cuda timing
-  // cudaEvent_t start, stop;
-  // float time;
-  // cudaEventCreate(&start);
-  // cudaEventCreate(&stop);
-  // cudaEventRecord( start, 0 );
-
-  //Kernel_2_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,mp->NGLOB_AB, mp->d_ibool,
-  Kernel_2_impl<<<grid,threads>>>(nb_blocks_to_compute,mp->NGLOB_AB, mp->d_ibool,
-                                               mp->d_phase_ispec_inner_elastic,
-                                               mp->num_phase_ispec_elastic,
-                                               d_iphase,
-                                               mp->d_displ, mp->d_accel,
-                                               mp->d_xix, mp->d_xiy, mp->d_xiz,
-                                               mp->d_etax, mp->d_etay, mp->d_etaz,
-                                               mp->d_gammax, mp->d_gammay, mp->d_gammaz,
-                                               mp->d_kappav, mp->d_muv,
-                                               //d_debug,
-                                               COMPUTE_AND_STORE_STRAIN,
-                                               mp->d_epsilondev_xx,
-                                               mp->d_epsilondev_yy,
-                                               mp->d_epsilondev_xy,
-                                               mp->d_epsilondev_xz,
-                                               mp->d_epsilondev_yz,
-                                               mp->d_epsilon_trace_over_3,
-                                               SIMULATION_TYPE,
-                                               ATTENUATION,mp->NSPEC_AB,
-                                               mp->d_one_minus_sum_beta,mp->d_factor_common,
-                                               mp->d_R_xx,mp->d_R_yy,mp->d_R_xy,mp->d_R_xz,mp->d_R_yz,
-                                               mp->d_alphaval,mp->d_betaval,mp->d_gammaval
-                                               );
-
-
-  // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
-  // int procid;
-  // MPI_Comm_rank(MPI_COMM_WORLD,&procid);
-  // if(procid==0) {
-  //   for(int i=0;i<17;i++) {
-  //  printf("cudadebug[%d] = %e\n",i,h_debug[i]);
-  //   }
-  // }
-//    free(h_debug);
-//    cudaFree(d_debug);
-// #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//    exit_on_cuda_error("Kernel_2_impl");
-// #endif
-
-  if(SIMULATION_TYPE == 3) {
-    //Kernel_2_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,mp->NGLOB_AB, mp->d_ibool,
-    Kernel_2_impl<<< grid,threads>>>(nb_blocks_to_compute,mp->NGLOB_AB, mp->d_ibool,
-                                                 mp->d_phase_ispec_inner_elastic,
-                                                 mp->num_phase_ispec_elastic,
-                                                 d_iphase,
-                                                 mp->d_b_displ, mp->d_b_accel,
-                                                 mp->d_xix, mp->d_xiy, mp->d_xiz,
-                                                 mp->d_etax, mp->d_etay, mp->d_etaz,
-                                                 mp->d_gammax, mp->d_gammay, mp->d_gammaz,
-                                                 mp->d_kappav, mp->d_muv,
-                                                 //d_debug,
-                                                 COMPUTE_AND_STORE_STRAIN,
-                                                 mp->d_b_epsilondev_xx,
-                                                 mp->d_b_epsilondev_yy,
-                                                 mp->d_b_epsilondev_xy,
-                                                 mp->d_b_epsilondev_xz,
-                                                 mp->d_b_epsilondev_yz,
-                                                 mp->d_b_epsilon_trace_over_3,
-                                                 SIMULATION_TYPE,
-                                                 ATTENUATION,mp->NSPEC_AB,
-                                                 mp->d_one_minus_sum_beta,mp->d_factor_common,
-                                                 mp->d_b_R_xx,mp->d_b_R_yy,mp->d_b_R_xy,mp->d_b_R_xz,mp->d_b_R_yz,
-                                                 mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval
-                                                 );
-  }
-
-  // cudaEventRecord( stop, 0 );
-  // cudaEventSynchronize( stop );
-  // cudaEventElapsedTime( &time, start, stop );
-  // cudaEventDestroy( start );
-  // cudaEventDestroy( stop );
-  // printf("Kernel2 Execution Time: %f ms\n",time);
-
-  // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
-  // for(int i=0;i<10;i++) {
-  // printf("debug[%d]=%e\n",i,h_debug[i]);
-  // }
-
-  /* cudaThreadSynchronize(); */
-  /* LOG("Kernel 2 finished"); */
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("Kernel_2_impl ");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-//__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
-//                            int num_phase_ispec_elastic, int d_iphase, int* d_ibool) {
-//  int bx = blockIdx.x;
-//  int tx = threadIdx.x;
-//  int working_element;
-//  //int ispec;
-//  //int NGLL3_ALIGN = 128;
-//  if(tx==0 && bx==0) {
-//
-//    d_debug_output[0] = 420.0;
-//
-//    d_debug_output[2] = num_phase_ispec_elastic;
-//    d_debug_output[3] = d_iphase;
-//    working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
-//    d_debug_output[4] = working_element;
-//    d_debug_output[5] = d_phase_ispec_inner_elastic[0];
-//    /* d_debug_output[1] = d_ibool[working_element*NGLL3_ALIGN + tx]-1; */
-//  }
-//  /* d_debug_output[1+tx+128*bx] = 69.0; */
-//
-//}
-
-/* ----------------------------------------------------------------------------------------------- */
-
 // double precision temporary variables leads to 10% performance
 // decrease in Kernel_2_impl (not very much..)
 //typedef float reald;
@@ -604,8 +429,12 @@
 // #define MANUALLY_UNROLLED_LOOPS
 
 
-__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
-                              int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,
+__global__ void Kernel_2_impl(int nb_blocks_to_compute,
+                              int NGLOB,
+                              int* d_ibool,
+                              int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
+                              int d_iphase,
+                              int use_mesh_coloring_gpu,
                               float* d_displ, float* d_accel,
                               float* d_xix, float* d_xiy, float* d_xiz,
                               float* d_etax, float* d_etay, float* d_etaz,
@@ -617,7 +446,8 @@
                               float* epsilondev_xz,float* epsilondev_yz,
                               float* epsilon_trace_over_3,
                               int SIMULATION_TYPE,
-                              int ATTENUATION, int NSPEC,
+                              int ATTENUATION,
+                              int NSPEC,
                               float* one_minus_sum_beta,float* factor_common,
                               float* R_xx, float* R_yy, float* R_xy, float* R_xz, float* R_yz,
                               float* alphaval,float* betaval,float* gammaval
@@ -676,8 +506,19 @@
 // copy from global memory to shared memory
 // each thread writes one of the NGLL^3 = 125 data points
     if (active) {
-      // iphase-1 and working_element-1 for Fortran->C array conventions
-      working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
+
+#ifdef USE_MESH_COLORING_GPU
+      working_element = bx;
+#else
+      //mesh coloring
+      if( use_mesh_coloring_gpu ){
+        working_element = bx;
+      }else{
+        // iphase-1 and working_element-1 for Fortran->C array conventions
+        working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
+      }
+#endif
+
       // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
       iglob = d_ibool[working_element*125 + tx]-1;
 
@@ -826,8 +667,6 @@
       duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
       duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
 
-
-
       duxdxl_plus_duydyl = duxdxl + duydyl;
       duxdxl_plus_duzdzl = duxdxl + duzdzl;
       duydyl_plus_duzdzl = duydyl + duzdzl;
@@ -945,14 +784,14 @@
         tempy3l += s_tempy3[offset]*fac3;
         tempz3l += s_tempz3[offset]*fac3;
 
-        if(working_element == 169)
-          if(l==0)
-            if(I+J+K == 0) {
+        //if(working_element == 169)
+        //  if(l==0)
+        //    if(I+J+K == 0) {
               // atomicAdd(&d_debug[0],1.0);
               // d_debug[0] = fac3;
               // d_debug[1] = offset;
               // d_debug[2] = s_tempz3[offset];
-            }
+        //    }
       }
 #else
 
@@ -1022,30 +861,47 @@
       d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
 #else
   /* OLD/To be implemented version that uses coloring to get around race condition. About 1.6x faster */
-      // d_accel[iglob*3] -= (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
-      // d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
-      // d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
 
-      //if(iglob*3+2 == 41153) {
-        // int ot = d_debug[5];
-        // d_debug[0+1+ot] = d_accel[iglob*3+2];
-        // // d_debug[1+1+ot] = fac1*tempz1l;
-        // // d_debug[2+1+ot] = fac2*tempz2l;
-        // // d_debug[3+1+ot] = fac3*tempz3l;
-        // d_debug[1+1+ot] = fac1;
-        // d_debug[2+1+ot] = fac2;
-        // d_debug[3+1+ot] = fac3;
-        // d_debug[4+1+ot] = d_accel[iglob*3+2]-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
-        // atomicAdd(&d_debug[0],1.0);
-        // d_debug[6+ot] = d_displ[iglob*3+2];
-      //}
 
-      atomicAdd(&d_accel[iglob*3],-(fac1*tempx1l + fac2*tempx2l + fac3*tempx3l));
-      atomicAdd(&d_accel[iglob*3+1],-(fac1*tempy1l + fac2*tempy2l + fac3*tempy3l));
-      atomicAdd(&d_accel[iglob*3+2],-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l));
+#ifdef USE_MESH_COLORING_GPU
+      // no atomic operation needed, colors don't share global points between elements
+      d_accel[iglob*3] -= (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+      d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+      d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+#else
+      //mesh coloring
+      if( use_mesh_coloring_gpu ){
 
+       // no atomic operation needed, colors don't share global points between elements
+        d_accel[iglob*3] -= (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+        d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+        d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+        
+
+      }else{
+
+        //if(iglob*3+2 == 41153) {
+          // int ot = d_debug[5];
+          // d_debug[0+1+ot] = d_accel[iglob*3+2];
+          // // d_debug[1+1+ot] = fac1*tempz1l;
+          // // d_debug[2+1+ot] = fac2*tempz2l;
+          // // d_debug[3+1+ot] = fac3*tempz3l;
+          // d_debug[1+1+ot] = fac1;
+          // d_debug[2+1+ot] = fac2;
+          // d_debug[3+1+ot] = fac3;
+          // d_debug[4+1+ot] = d_accel[iglob*3+2]-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+          // atomicAdd(&d_debug[0],1.0);
+          // d_debug[6+ot] = d_displ[iglob*3+2];
+        //}
+
+        atomicAdd(&d_accel[iglob*3],-(fac1*tempx1l + fac2*tempx2l + fac3*tempx3l));
+        atomicAdd(&d_accel[iglob*3+1],-(fac1*tempy1l + fac2*tempy2l + fac3*tempy3l));
+        atomicAdd(&d_accel[iglob*3+2],-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l));
+      }
 #endif
 
+#endif
+
       // update memory variables based upon the Runge-Kutta scheme
       if( ATTENUATION ){
         compute_element_att_memory(tx,working_element,NSPEC,
@@ -1079,6 +935,359 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,
+              int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION,
+              int* d_ibool,
+              float* d_xix,
+              float* d_xiy,
+              float* d_xiz,
+              float* d_etax,
+              float* d_etay,
+              float* d_etaz,
+              float* d_gammax,
+              float* d_gammay,
+              float* d_gammaz,
+              float* d_kappav,
+              float* d_muv,
+              float* d_epsilondev_xx,
+              float* d_epsilondev_yy,
+              float* d_epsilondev_xy,
+              float* d_epsilondev_xz,
+              float* d_epsilondev_yz,
+              float* d_epsilon_trace_over_3,
+              float* d_one_minus_sum_beta,
+              float* d_factor_common,
+              float* d_R_xx,
+              float* d_R_yy,
+              float* d_R_xy,
+              float* d_R_xz,
+              float* d_R_yz,
+              float* d_b_epsilondev_xx,
+              float* d_b_epsilondev_yy,
+              float* d_b_epsilondev_xy,
+              float* d_b_epsilondev_xz,
+              float* d_b_epsilondev_yz,
+              float* d_b_epsilon_trace_over_3,
+              float* d_b_R_xx,
+              float* d_b_R_yy,
+              float* d_b_R_xy,
+              float* d_b_R_xz,
+              float* d_b_R_yz){
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("before kernel Kernel 2");
+#endif
+
+  /* if the grid can handle the number of blocks, we let it be 1D */
+  /* grid_2_x = nb_elem_color; */
+  /* nb_elem_color is just how many blocks we are computing now */
+
+  int num_blocks_x = nb_blocks_to_compute;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  //int threads_2 = 128;//BLOCK_SIZE_K2;
+  //dim3 grid_2(num_blocks_x,num_blocks_y);
+
+  int blocksize = 128;
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // debugging
+  //printf("Starting with grid %dx%d for %d blocks\n",num_blocks_x,num_blocks_y,nb_blocks_to_compute);
+  //  float* d_debug;
+  //    float* h_debug;
+  //    h_debug = (float*)calloc(128,sizeof(float));
+  //    cudaMalloc((void**)&d_debug,128*sizeof(float));
+  //    cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
+
+  // Cuda timing
+  // cudaEvent_t start, stop;
+  // float time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+
+  //Kernel_2_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,mp->NGLOB_AB, mp->d_ibool,
+  Kernel_2_impl<<<grid,threads>>>(nb_blocks_to_compute,
+                                  mp->NGLOB_AB,
+                                  d_ibool,
+                                  mp->d_phase_ispec_inner_elastic,
+                                  mp->num_phase_ispec_elastic,
+                                  d_iphase,
+                                  mp->use_mesh_coloring_gpu,
+                                  mp->d_displ, mp->d_accel,
+                                  d_xix, d_xiy, d_xiz,
+                                  d_etax, d_etay, d_etaz,
+                                  d_gammax, d_gammay, d_gammaz,
+                                  d_kappav, d_muv,
+                                  //d_debug,
+                                  COMPUTE_AND_STORE_STRAIN,
+                                  d_epsilondev_xx,
+                                  d_epsilondev_yy,
+                                  d_epsilondev_xy,
+                                  d_epsilondev_xz,
+                                  d_epsilondev_yz,
+                                  d_epsilon_trace_over_3,
+                                  SIMULATION_TYPE,
+                                  ATTENUATION,mp->NSPEC_AB,
+                                  d_one_minus_sum_beta,
+                                  d_factor_common,
+                                  d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+                                  mp->d_alphaval,mp->d_betaval,mp->d_gammaval
+                                  );
+
+
+  // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
+  // int procid;
+  // MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+  // if(procid==0) {
+  //   for(int i=0;i<17;i++) {
+  //  printf("cudadebug[%d] = %e\n",i,h_debug[i]);
+  //   }
+  // }
+  //    free(h_debug);
+  //    cudaFree(d_debug);
+  // #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //    exit_on_cuda_error("Kernel_2_impl");
+  // #endif
+
+  if(SIMULATION_TYPE == 3) {
+    //Kernel_2_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,mp->NGLOB_AB, mp->d_ibool,
+    Kernel_2_impl<<< grid,threads>>>(nb_blocks_to_compute,
+                                     mp->NGLOB_AB,
+                                     d_ibool,
+                                     mp->d_phase_ispec_inner_elastic,
+                                     mp->num_phase_ispec_elastic,
+                                     d_iphase,
+                                     mp->use_mesh_coloring_gpu,
+                                     mp->d_b_displ, mp->d_b_accel,
+                                     d_xix, d_xiy, d_xiz,
+                                     d_etax, d_etay, d_etaz,
+                                     d_gammax, d_gammay, d_gammaz,
+                                     d_kappav, d_muv,
+                                     //d_debug,
+                                     COMPUTE_AND_STORE_STRAIN,
+                                     d_b_epsilondev_xx,
+                                     d_b_epsilondev_yy,
+                                     d_b_epsilondev_xy,
+                                     d_b_epsilondev_xz,
+                                     d_b_epsilondev_yz,
+                                     d_b_epsilon_trace_over_3,
+                                     SIMULATION_TYPE,
+                                     ATTENUATION,mp->NSPEC_AB,
+                                     d_one_minus_sum_beta,
+                                     d_factor_common,
+                                     d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
+                                     mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval
+                                     );
+  }
+
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("Kernel2 Execution Time: %f ms\n",time);
+
+  // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
+  // for(int i=0;i<10;i++) {
+  // printf("debug[%d]=%e\n",i,h_debug[i]);
+  // }
+
+  /* cudaThreadSynchronize(); */
+  /* LOG("Kernel 2 finished"); */
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("Kernel_2_impl ");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(compute_forces_elastic_cuda,
+              COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f,
+                                           int* iphase,
+                                           int* nspec_outer_elastic,
+                                           int* nspec_inner_elastic,
+                                           int* SIMULATION_TYPE,
+                                           int* COMPUTE_AND_STORE_STRAIN,
+                                           int* ATTENUATION) {
+
+  TRACE("compute_forces_elastic_cuda");
+  // EPIK_TRACER("compute_forces_elastic_cuda");
+  //printf("Running compute_forces\n");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int num_elements;
+
+  if( *iphase == 1 )
+    num_elements = *nspec_outer_elastic;
+  else
+    num_elements = *nspec_inner_elastic;
+
+  // checks if anything to do
+  if( num_elements == 0 ) return;
+
+  //int myrank;
+  /* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
+  /* if(myrank==0) { */
+
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+
+    // note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements
+    //         and followed by acoustic ones.
+    //         elastic elements also start with outer than inner element ordering
+
+    int nb_colors,nb_blocks_to_compute;
+    int istart;
+    int color_offset,color_offset_nonpadded,color_offset_nonpadded_att2;
+
+    // sets up color loop
+    if( *iphase == 1 ){
+      // outer elements
+      nb_colors = mp->num_colors_outer_elastic;
+      istart = 0;
+
+      // array offsets
+      color_offset = 0;
+      color_offset_nonpadded = 0;
+      color_offset_nonpadded_att2 = 0;
+    }else{
+      // inner elements (start after outer elements)
+      nb_colors = mp->num_colors_outer_elastic + mp->num_colors_inner_elastic;
+      istart = mp->num_colors_outer_elastic;
+
+      // array offsets
+      color_offset = (*nspec_outer_elastic) * NGLL3_PADDED;
+      color_offset_nonpadded = (*nspec_outer_elastic) * NGLL3_NONPADDED;
+      color_offset_nonpadded_att2 = (*nspec_outer_elastic) * NGLL3_NONPADDED * N_SLS;
+    }
+
+    // loops over colors
+    for(int icolor = istart; icolor < nb_colors; icolor++){
+
+      nb_blocks_to_compute = mp->h_num_elem_colors_elastic[icolor];
+
+      // checks
+      //if( nb_blocks_to_compute <= 0 ){
+      //  printf("error number of elastic color blocks: %d -- color = %d \n",nb_blocks_to_compute,icolor);
+      //  exit(EXIT_FAILURE);
+      //}
+
+      Kernel_2(nb_blocks_to_compute,mp,*iphase,
+               *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION,
+               mp->d_ibool + color_offset_nonpadded,
+               mp->d_xix + color_offset,
+               mp->d_xiy + color_offset,
+               mp->d_xiz + color_offset,
+               mp->d_etax + color_offset,
+               mp->d_etay + color_offset,
+               mp->d_etaz + color_offset,
+               mp->d_gammax + color_offset,
+               mp->d_gammay + color_offset,
+               mp->d_gammaz + color_offset,
+               mp->d_kappav + color_offset,
+               mp->d_muv + color_offset,
+               mp->d_epsilondev_xx + color_offset_nonpadded,
+               mp->d_epsilondev_yy + color_offset_nonpadded,
+               mp->d_epsilondev_xy + color_offset_nonpadded,
+               mp->d_epsilondev_xz + color_offset_nonpadded,
+               mp->d_epsilondev_yz + color_offset_nonpadded,
+               mp->d_epsilon_trace_over_3 + color_offset_nonpadded,
+               mp->d_one_minus_sum_beta + color_offset_nonpadded,
+               mp->d_factor_common + color_offset_nonpadded_att2,
+               mp->d_R_xx + color_offset_nonpadded,
+               mp->d_R_yy + color_offset_nonpadded,
+               mp->d_R_xy + color_offset_nonpadded,
+               mp->d_R_xz + color_offset_nonpadded,
+               mp->d_R_yz + color_offset_nonpadded,
+               mp->d_b_epsilondev_xx + color_offset_nonpadded,
+               mp->d_b_epsilondev_yy + color_offset_nonpadded,
+               mp->d_b_epsilondev_xy + color_offset_nonpadded,
+               mp->d_b_epsilondev_xz + color_offset_nonpadded,
+               mp->d_b_epsilondev_yz + color_offset_nonpadded,
+               mp->d_b_epsilon_trace_over_3 + color_offset_nonpadded,
+               mp->d_b_R_xx + color_offset_nonpadded,
+               mp->d_b_R_yy + color_offset_nonpadded,
+               mp->d_b_R_xy + color_offset_nonpadded,
+               mp->d_b_R_xz + color_offset_nonpadded,
+               mp->d_b_R_yz + color_offset_nonpadded);
+
+      // for padded and aligned arrays
+      color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+      // for no-aligned arrays
+      color_offset_nonpadded += nb_blocks_to_compute * NGLL3_NONPADDED;
+      // for factor_common array
+      color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3_NONPADDED * N_SLS;
+    }
+
+  }else{
+
+    // no mesh coloring: uses atomic updates
+
+    Kernel_2(num_elements,mp,*iphase,
+             *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION,
+             mp->d_ibool,
+             mp->d_xix,
+             mp->d_xiy,
+             mp->d_xiz,
+             mp->d_etax,
+             mp->d_etay,
+             mp->d_etaz,
+             mp->d_gammax,
+             mp->d_gammay,
+             mp->d_gammaz,
+             mp->d_kappav,
+             mp->d_muv,
+             mp->d_epsilondev_xx,
+             mp->d_epsilondev_yy,
+             mp->d_epsilondev_xy,
+             mp->d_epsilondev_xz,
+             mp->d_epsilondev_yz,
+             mp->d_epsilon_trace_over_3,
+             mp->d_one_minus_sum_beta,
+             mp->d_factor_common,
+             mp->d_R_xx,
+             mp->d_R_yy,
+             mp->d_R_xy,
+             mp->d_R_xz,
+             mp->d_R_yz,
+             mp->d_b_epsilondev_xx,
+             mp->d_b_epsilondev_yy,
+             mp->d_b_epsilondev_xy,
+             mp->d_b_epsilondev_xz,
+             mp->d_b_epsilondev_yz,
+             mp->d_b_epsilon_trace_over_3,
+             mp->d_b_R_xx,
+             mp->d_b_R_yy,
+             mp->d_b_R_xy,
+             mp->d_b_R_xz,
+             mp->d_b_R_yz);
+  }
+
+
+
+  //cudaThreadSynchronize();
+
+  //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  /* MPI_Barrier(MPI_COMM_WORLD); */
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  //#endif
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
 // KERNEL 3
 
 /* ----------------------------------------------------------------------------------------------- */

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2011-11-03 03:11:45 UTC (rev 19145)
@@ -42,13 +42,11 @@
 
 */
 
-
 #ifndef GPU_MESH_
 #define GPU_MESH_
 #include <sys/types.h>
 #include <unistd.h>
 
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // for debugging and benchmarking
@@ -73,27 +71,22 @@
 #define PRINT5(var,offset) // for(i=0;i<10;i++) printf("var(%d)=%f\n",i,var[offset+i]);
 #endif
 
-
-//#undef ENABLE_VERY_SLOW_ERROR_CHECKING
+// error checking after cuda function calls
 #define ENABLE_VERY_SLOW_ERROR_CHECKING
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // indexing
 
 #define INDEX2(xsize,x,y) x + (y)*xsize
-#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
-#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
-#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize
+#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
+#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
+#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*j)))
 #define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) x + xsize*(y + ysize*(z + zsize*(i + isize*(j + jsize*k))))
 
-#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*128
+#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*z) + (i)*128
 
-//daniel: TODO -- check speed of these alternative definitions
-//#define INDEX2(xsize,x,y) x + (y)*xsize
-//#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
-//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
-//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*j)))
 
 /* ----------------------------------------------------------------------------------------------- */
 
@@ -120,6 +113,9 @@
 #define NGLL2 25
 #define N_SLS 3
 
+#define NGLL3_NONPADDED 125
+#define NGLL3_PADDED 128
+
 //typedef float real;   // type of variables passed into function
 typedef float realw;  // type of "working" variables
 
@@ -127,6 +123,10 @@
 // decrease in Kernel_2_impl (not very much..)
 typedef float reald;
 
+// (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h:
+// leads up to ~ 5% performance increase
+//#define USE_MESH_COLORING_GPU
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // mesh pointer wrapper structure
@@ -153,6 +153,9 @@
   // inner / outer elements
   int* d_ispec_is_inner;
 
+  // mesh coloring
+  int use_mesh_coloring_gpu;
+
   // pointers to constant memory arrays
   float* d_hprime_xx; float* d_hprime_yy; float* d_hprime_zz;
   float* d_hprimewgll_xx; float* d_hprimewgll_yy; float* d_hprimewgll_zz;
@@ -174,6 +177,11 @@
   int* d_phase_ispec_inner_elastic;
   int num_phase_ispec_elastic;
 
+  // mesh coloring
+  int* h_num_elem_colors_elastic;
+  int num_colors_outer_elastic,num_colors_inner_elastic;
+  int nspec_elastic;
+
   float* d_rmass;
   float* d_send_accel_buffer;
 
@@ -301,6 +309,11 @@
   int* d_phase_ispec_inner_acoustic;
   int num_phase_ispec_acoustic;
 
+  // mesh coloring
+  int* h_num_elem_colors_acoustic;
+  int num_colors_outer_acoustic,num_colors_inner_acoustic;
+  int nspec_acoustic;
+
   float* d_rhostore;
   float* d_kappastore;
   float* d_rmass_acoustic;

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-11-03 03:11:45 UTC (rev 19145)
@@ -452,6 +452,8 @@
                                         int* nrec_f,
                                         int* nrec_local_f,
                                         int* SIMULATION_TYPE,
+                                        int* USE_MESH_COLORING_GPU_f,
+                                        int* nspec_acoustic,int* nspec_elastic,
                                         int* ncuda_devices) {
 
 TRACE("prepare_constants_device");
@@ -648,6 +650,20 @@
   print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_rec,h_ispec_selected_rec,
                                      nrec*sizeof(int),cudaMemcpyHostToDevice),1514);
 
+#ifdef USE_MESH_COLORING_GPU
+  mp->use_mesh_coloring_gpu = 1;
+  if( ! *USE_MESH_COLORING_GPU_f ) exit_on_error("error with USE_MESH_COLORING_GPU constant; please re-compile\n");
+#else
+  // mesh coloring
+  // note: this here passes the coloring as an option to the kernel routines
+  //          the performance seems to be the same if one uses the pre-processing directives above or not
+  mp->use_mesh_coloring_gpu = *USE_MESH_COLORING_GPU_f;
+#endif
+
+  // number of elements per domain
+  mp->nspec_acoustic = *nspec_acoustic;
+  mp->nspec_elastic = *nspec_elastic;
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_constants_device");
 #endif
@@ -746,8 +762,10 @@
                                               int* coupling_ac_el_ispec,
                                               int* coupling_ac_el_ijk,
                                               float* coupling_ac_el_normal,
-                                              float* coupling_ac_el_jacobian2Dw
-                                              ) {
+                                              float* coupling_ac_el_jacobian2Dw,
+                                              int* num_colors_outer_acoustic,
+                                              int* num_colors_inner_acoustic,
+                                              int* num_elem_colors_acoustic) {
 
   TRACE("prepare_fields_acoustic_device");
 
@@ -850,6 +868,13 @@
 
   }
 
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+    mp->num_colors_outer_acoustic = *num_colors_outer_acoustic;
+    mp->num_colors_inner_acoustic = *num_colors_inner_acoustic;
+    mp->h_num_elem_colors_acoustic = (int*) num_elem_colors_acoustic;
+  }
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_fields_acoustic_device");
 #endif
@@ -937,7 +962,10 @@
                                              int* free_surface_ispec,
                                              int* free_surface_ijk,
                                              int* num_free_surface_faces,
-                                             int* ACOUSTIC_SIMULATION){
+                                             int* ACOUSTIC_SIMULATION,
+                                             int* num_colors_outer_elastic,
+                                             int* num_colors_inner_elastic,
+                                             int* num_elem_colors_elastic){
 
 TRACE("prepare_fields_elastic_device");
 
@@ -1125,6 +1153,13 @@
     }
   }
 
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+    mp->num_colors_outer_elastic = *num_colors_outer_elastic;
+    mp->num_colors_inner_elastic = *num_colors_inner_elastic;
+    mp->h_num_elem_colors_elastic = (int*) num_elem_colors_elastic;
+  }
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_fields_elastic_device");
 #endif
@@ -1467,6 +1502,7 @@
       cudaFree(mp->d_station_seismo_potential);
       free(mp->h_station_seismo_potential);
     }
+
   } // ACOUSTIC_SIMULATION
 
   // ELASTIC arrays

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2011-11-03 03:11:45 UTC (rev 19145)
@@ -356,6 +356,8 @@
                                         int* nrec_f,
                                         int* nrec_local_f,
                                         int* SIMULATION_TYPE,
+                                        int* USE_MESH_COLORING_GPU,
+                                        int* nspec_acoustic,int* nspec_elastic,                                        
                                         int* ncuda_devices)
 {
   fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
@@ -391,8 +393,10 @@
                                               int* coupling_ac_el_ispec,
                                               int* coupling_ac_el_ijk,
                                               float* coupling_ac_el_normal,
-                                              float* coupling_ac_el_jacobian2Dw
-                                              ){}							 
+                                              float* coupling_ac_el_jacobian2Dw,
+                                              int* num_colors_outer_acoustic,
+                                              int* num_colors_inner_acoustic,
+                                              int* num_elem_colors_acoustic){}							 
 
 void FC_FUNC_(prepare_fields_acoustic_adj_dev,
               PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
@@ -426,7 +430,10 @@
                                              int* free_surface_ispec,
                                              int* free_surface_ijk,                                             
                                              int* num_free_surface_faces,
-                                             int* ACOUSTIC_SIMULATION){}
+                                             int* ACOUSTIC_SIMULATION,
+                                             int* num_colors_outer_elastic,
+                                             int* num_colors_inner_elastic,
+                                             int* num_elem_colors_elastic){}
   
 void FC_FUNC_(prepare_fields_elastic_adj_dev,
               PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/Makefile.in	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/Makefile.in	2011-11-03 03:11:45 UTC (rev 19145)
@@ -78,14 +78,15 @@
 	$O/detect_surface.o \
 	$O/exit_mpi.o \
 	$O/get_absorbing_boundary.o \
-	$O/get_coupling_surfaces.o \
-	$O/get_model.o \
-	$O/get_MPI.o \
 	$O/get_attenuation_model.o \
 	$O/get_cmt.o \
+	$O/get_coupling_surfaces.o \
 	$O/get_element_face.o \
 	$O/get_global.o \
 	$O/get_jacobian_boundaries.o \
+	$O/get_model.o \
+	$O/get_MPI.o \
+	$O/get_perm_color.o \
 	$O/get_shape2D.o \
 	$O/get_shape3D.o \
 	$O/get_value_parameters.o \
@@ -319,6 +320,9 @@
 $O/get_MPI.o:  ${SHARED}/constants.h get_MPI.f90
 	${FCCOMPILE_CHECK} -c -o $O/get_MPI.o get_MPI.f90
 
+$O/get_perm_color.o:  ${SHARED}/constants.h get_perm_color.f90
+	${FCCOMPILE_CHECK} -c -o $O/get_perm_color.o get_perm_color.f90
+
 $O/create_name_database.o:  ${SHARED}/constants.h ${SHARED}/create_name_database.f90
 	${FCCOMPILE_CHECK} -c -o $O/create_name_database.o ${SHARED}/create_name_database.f90
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_regions_mesh.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_regions_mesh.f90	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_regions_mesh.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -86,6 +86,14 @@
   integer, dimension(:), allocatable :: coupling_ac_el_ispec
   integer :: num_coupling_ac_el_faces
 
+  ! Moho mesh
+  real(CUSTOM_REAL), dimension(:,:,:),allocatable :: normal_moho_top
+  real(CUSTOM_REAL), dimension(:,:,:),allocatable :: normal_moho_bot
+  integer,dimension(:,:,:),allocatable :: ijk_moho_top, ijk_moho_bot
+  integer,dimension(:),allocatable :: ibelm_moho_top, ibelm_moho_bot
+  integer :: NSPEC2D_MOHO
+  logical, dimension(:),allocatable :: is_moho_top, is_moho_bot
+
 ! for stacey
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: rho_vp,rho_vs
 
@@ -103,6 +111,26 @@
 ! name of the database file
   character(len=256) prname
 
+! inner/outer elements
+  logical,dimension(:),allocatable :: ispec_is_inner
+  integer :: nspec_inner_acoustic,nspec_outer_acoustic
+  integer :: nspec_inner_elastic,nspec_outer_elastic
+
+  integer :: num_phase_ispec_acoustic
+  integer,dimension(:,:),allocatable :: phase_ispec_inner_acoustic
+
+  integer :: num_phase_ispec_elastic
+  integer,dimension(:,:),allocatable :: phase_ispec_inner_elastic
+
+  logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION
+
+  ! mesh coloring
+  integer :: num_colors_outer_acoustic,num_colors_inner_acoustic
+  integer, dimension(:), allocatable :: num_elem_colors_acoustic
+
+  integer :: num_colors_outer_elastic,num_colors_inner_elastic
+  integer, dimension(:), allocatable :: num_elem_colors_elastic
+
 end module create_regions_mesh_ext_par
 
 !
@@ -112,7 +140,8 @@
 ! main routine
 
 subroutine create_regions_mesh_ext(ibool, &
-                        xstore,ystore,zstore,nspec,npointot,myrank,LOCAL_PATH, &
+                        xstore,ystore,zstore,nspec, &
+                        npointot,myrank,LOCAL_PATH, &
                         nnodes_ext_mesh,nelmnts_ext_mesh, &
                         nodes_coords_ext_mesh, elmnts_ext_mesh, &
                         max_static_memory_size, mat_ext_mesh, materials_ext_mesh, &
@@ -126,15 +155,16 @@
                         ibelm_xmin, ibelm_xmax, ibelm_ymin, ibelm_ymax, ibelm_bottom, ibelm_top, &
                         nodes_ibelm_xmin,nodes_ibelm_xmax,nodes_ibelm_ymin,nodes_ibelm_ymax,&
                         nodes_ibelm_bottom,nodes_ibelm_top, &
-                        SAVE_MESH_FILES,nglob, &
+                        SAVE_MESH_FILES, &
+                        nglob, &
                         ANISOTROPY,NPROC,OCEANS,TOPOGRAPHY, &
                         ATTENUATION,USE_OLSEN_ATTENUATION, &
                         UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION,NX_TOPO,NY_TOPO, &
                         ORIG_LAT_TOPO,ORIG_LONG_TOPO,DEGREES_PER_CELL_TOPO, &
-                        itopo_bathy)
+                        itopo_bathy, &
+                        nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho)
 
 ! create the different regions of the mesh
-
   use create_regions_mesh_ext_par
   implicit none
   !include "constants.h"
@@ -209,6 +239,11 @@
   double precision :: ORIG_LAT_TOPO,ORIG_LONG_TOPO,DEGREES_PER_CELL_TOPO
   integer, dimension(NX_TOPO,NY_TOPO) :: itopo_bathy
 
+! moho (optional)
+  integer :: nspec2D_moho_ext
+  integer, dimension(nspec2D_moho_ext)  :: ibelm_moho
+  integer, dimension(4,nspec2D_moho_ext) :: nodes_ibelm_moho
+
 ! local parameters
 ! static memory size needed by the solver
   double precision :: static_memory_size
@@ -289,16 +324,12 @@
   if( myrank == 0) then
     write(IMAIN,*) '  ...determining velocity model'
   endif
-  ! Default model. Using PREM model instead
-  ! call get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
-  ! materials_ext_mesh,nmat_ext_mesh, &
-  ! undef_mat_prop,nundefMat_ext_mesh, &
-  ! ANISOTROPY,LOCAL_PATH)
+  ! calls wrapper function
+  call get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
+                materials_ext_mesh,nmat_ext_mesh, &
+                undef_mat_prop,nundefMat_ext_mesh, &
+                ANISOTROPY,LOCAL_PATH)
 
-  call get_model_PREM(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
-                        materials_ext_mesh,nmat_ext_mesh, &
-                        undef_mat_prop,nundefMat_ext_mesh, &
-                        ANISOTROPY,LOCAL_PATH)
 
 ! sets up absorbing/free surface boundaries
   call sync_all()
@@ -324,6 +355,18 @@
                         num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
                         my_neighbours_ext_mesh)
 
+! sets up up Moho surface
+  NSPEC2D_MOHO = 0
+  if( SAVE_MOHO_MESH ) then
+    call sync_all()
+    if( myrank == 0) then
+      write(IMAIN,*) '  ...setting up Moho surface'
+    endif
+    call crm_setup_moho(myrank,nglob,nspec, &
+                      nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho, &
+                      nodes_coords_ext_mesh,nnodes_ext_mesh,ibool )
+  endif
+
 ! creates mass matrix
   call sync_all()
   if( myrank == 0) then
@@ -341,6 +384,24 @@
                         ORIG_LAT_TOPO,ORIG_LONG_TOPO,DEGREES_PER_CELL_TOPO, &
                         itopo_bathy)
 
+! locates inner and outer elements
+  call sync_all()
+  if( myrank == 0) then
+    write(IMAIN,*) '  ...element inner/outer separation '
+  endif
+  call crm_setup_inner_outer_elemnts(myrank,nspec,nglob, &
+                                    num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
+                                    nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                    ibool,SAVE_MESH_FILES)
+
+! colors mesh if requested
+  call sync_all()
+  if( myrank == 0) then
+    write(IMAIN,*) '  ...element mesh coloring '
+  endif
+  call crm_setup_color_perm(myrank,nspec,nglob,ibool,ANISOTROPY,SAVE_MESH_FILES)
+
+
 ! saves the binary mesh files
   call sync_all()
   if( myrank == 0) then
@@ -370,9 +431,23 @@
                         c22store,c23store,c24store,c25store,c26store,c33store, &
                         c34store,c35store,c36store,c44store,c45store,c46store, &
                         c55store,c56store,c66store, &
-                        ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic)
+                        ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
+                        ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic, &
+                        nspec_outer_acoustic,nspec_outer_elastic, &
+                        num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+                        num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+                        num_colors_outer_acoustic,num_colors_inner_acoustic, &
+                        num_elem_colors_acoustic, &
+                        num_colors_outer_elastic,num_colors_inner_elastic, &
+                        num_elem_colors_elastic)
 
+! saves moho surface
+  if( SAVE_MOHO_MESH ) then
+    call crm_save_moho()
+  endif
+
 ! computes the approximate amount of static memory needed to run the solver
+  call sync_all()
   call memory_eval(nspec,nglob,maxval(nibool_interfaces_ext_mesh),num_interfaces_ext_mesh, &
                   OCEANS,static_memory_size)
   call max_all_dp(static_memory_size, max_static_memory_size)
@@ -782,7 +857,7 @@
 !
 
 
-  subroutine create_regions_mesh_save_moho( myrank,nglob,nspec, &
+  subroutine crm_setup_moho( myrank,nglob,nspec, &
                         nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho, &
                         nodes_coords_ext_mesh,nnodes_ext_mesh,ibool )
 
@@ -802,14 +877,6 @@
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
 
 ! local parameters
-  ! Moho mesh
-  real(CUSTOM_REAL), dimension(:,:,:),allocatable :: normal_moho_top
-  real(CUSTOM_REAL), dimension(:,:,:),allocatable :: normal_moho_bot
-  integer,dimension(:,:,:),allocatable :: ijk_moho_top, ijk_moho_bot
-  integer,dimension(:),allocatable :: ibelm_moho_top, ibelm_moho_bot
-  integer :: NSPEC2D_MOHO
-  logical, dimension(:),allocatable :: is_moho_top, is_moho_bot
-
   real(kind=CUSTOM_REAL),dimension(NGNOD2D) :: xcoord,ycoord,zcoord
   real(kind=CUSTOM_REAL) :: jacobian2Dw_face(NGLLX,NGLLY)
   real(kind=CUSTOM_REAL) :: normal_face(NDIM,NGLLX,NGLLY)
@@ -1074,21 +1141,863 @@
     write(IMAIN,*) '********'
   endif
 
+  deallocate(iglob_is_surface)
+  deallocate(iglob_normals)
+
+  end subroutine crm_setup_moho
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine crm_save_moho()
+
+  use create_regions_mesh_ext_par
+  implicit none
+  ! local parameters
+  integer :: ier
+
   ! saves moho files: total number of elements, corner points, all points
-  open(unit=27,file=prname(1:len_trim(prname))//'ibelm_moho.bin',status='unknown',form='unformatted')
+  open(unit=27,file=prname(1:len_trim(prname))//'ibelm_moho.bin', &
+        status='unknown',form='unformatted',iostat=ier)
+  if( ier /= 0 ) stop 'error opening ibelm_moho.bin file'
   write(27) NSPEC2D_MOHO
   write(27) ibelm_moho_top
   write(27) ibelm_moho_bot
   write(27) ijk_moho_top
   write(27) ijk_moho_bot
   close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'normal_moho.bin',status='unknown',form='unformatted')
+  open(unit=27,file=prname(1:len_trim(prname))//'normal_moho.bin', &
+        status='unknown',form='unformatted',iostat=ier)
+  if( ier /= 0 ) stop 'error opening normal_moho.bin file'
   write(27) normal_moho_top
   write(27) normal_moho_bot
   close(27)
-  open(unit=27,file=prname(1:len_trim(prname))//'is_moho.bin',status='unknown',form='unformatted')
+  open(unit=27,file=prname(1:len_trim(prname))//'is_moho.bin', &
+    status='unknown',form='unformatted',iostat=ier)
+  if( ier /= 0 ) stop 'error opening is_moho.bin file'
   write(27) is_moho_top
   write(27) is_moho_bot
   close(27)
 
-  end subroutine create_regions_mesh_save_moho
+  end subroutine crm_save_moho
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine crm_setup_inner_outer_elemnts(myrank,nspec,nglob, &
+                                  num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
+                                  nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                  ibool,SAVE_MESH_FILES)
+
+! locates inner and outer elements
+
+  use create_regions_mesh_ext_par
+  implicit none
+
+  integer :: myrank,nspec,nglob
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+  ! MPI interfaces
+  integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh
+  integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: &
+    ibool_interfaces_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+
+  logical :: SAVE_MESH_FILES
+
+  ! local parameters
+  integer :: i,j,k,ispec,iglob
+  integer :: iinterface,ier
+  integer :: ispec_inner,ispec_outer
+  real :: percentage_edge
+  character(len=256) :: filename
+  logical,dimension(:),allocatable :: iglob_is_inner
+
+  ! allocates arrays
+  allocate(ispec_is_inner(nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array ispec_is_inner'
+
+  ! temporary array
+  allocate(iglob_is_inner(nglob),stat=ier)
+  if( ier /= 0 ) stop 'error allocating temporary array  iglob_is_inner'
+
+  ! initialize flags
+  ispec_is_inner(:) = .true.
+  iglob_is_inner(:) = .true.
+  do iinterface = 1, num_interfaces_ext_mesh
+    do i = 1, nibool_interfaces_ext_mesh(iinterface)
+      iglob = ibool_interfaces_ext_mesh(i,iinterface)
+      iglob_is_inner(iglob) = .false.
+    enddo
+  enddo
+
+  ! determines flags for inner elements (purely inside the partition)
+  do ispec = 1, nspec
+    do k = 1, NGLLZ
+      do j = 1, NGLLY
+        do i = 1, NGLLX
+          iglob = ibool(i,j,k,ispec)
+          ispec_is_inner(ispec) = ( iglob_is_inner(iglob) .and. ispec_is_inner(ispec) )
+        enddo
+      enddo
+    enddo
+  enddo
+
+  ! frees temporary array
+  deallocate( iglob_is_inner )
+
+  if( SAVE_MESH_FILES ) then
+    filename = prname(1:len_trim(prname))//'ispec_is_inner'
+    call write_VTK_data_elem_l(nspec,nglob, &
+                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                        ispec_is_inner,filename)
+  endif
+
+
+  ! sets up elements for loops in acoustic simulations
+  nspec_inner_acoustic = 0
+  nspec_outer_acoustic = 0
+  call any_all_l( ANY(ispec_is_acoustic), ACOUSTIC_SIMULATION )
+  if( ACOUSTIC_SIMULATION ) then
+    ! counts inner and outer elements
+    do ispec = 1, nspec
+      if( ispec_is_acoustic(ispec) ) then
+        if( ispec_is_inner(ispec) .eqv. .true. ) then
+          nspec_inner_acoustic = nspec_inner_acoustic + 1
+        else
+          nspec_outer_acoustic = nspec_outer_acoustic + 1
+        endif
+      endif
+    enddo
+
+    ! stores indices of inner and outer elements for faster(?) computation
+    num_phase_ispec_acoustic = max(nspec_inner_acoustic,nspec_outer_acoustic)
+    if( num_phase_ispec_acoustic < 0 ) stop 'error acoustic simulation: num_phase_ispec_acoustic is < zero'
+
+    allocate( phase_ispec_inner_acoustic(num_phase_ispec_acoustic,2),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array phase_ispec_inner_acoustic'
+    phase_ispec_inner_acoustic(:,:) = 0
+
+    ispec_inner = 0
+    ispec_outer = 0
+    do ispec = 1, nspec
+      if( ispec_is_acoustic(ispec) ) then
+        if( ispec_is_inner(ispec) .eqv. .true. ) then
+          ispec_inner = ispec_inner + 1
+          phase_ispec_inner_acoustic(ispec_inner,2) = ispec
+        else
+          ispec_outer = ispec_outer + 1
+          phase_ispec_inner_acoustic(ispec_outer,1) = ispec
+        endif
+      endif
+    enddo
+  else
+    ! allocates dummy array
+    num_phase_ispec_acoustic = 0
+    allocate( phase_ispec_inner_acoustic(num_phase_ispec_acoustic,2),stat=ier)
+    if( ier /= 0 ) stop 'error allocating dummy array phase_ispec_inner_acoustic'
+    phase_ispec_inner_acoustic(:,:) = 0
+  endif
+
+  ! sets up elements for loops in acoustic simulations
+  nspec_inner_elastic = 0
+  nspec_outer_elastic = 0
+  call any_all_l( ANY(ispec_is_elastic), ELASTIC_SIMULATION )
+  if( ELASTIC_SIMULATION ) then
+    ! counts inner and outer elements
+    do ispec = 1, nspec
+      if( ispec_is_elastic(ispec) ) then
+        if( ispec_is_inner(ispec) .eqv. .true. ) then
+          nspec_inner_elastic = nspec_inner_elastic + 1
+        else
+          nspec_outer_elastic = nspec_outer_elastic + 1
+        endif
+      endif
+    enddo
+
+    ! stores indices of inner and outer elements for faster(?) computation
+    num_phase_ispec_elastic = max(nspec_inner_elastic,nspec_outer_elastic)
+    if( num_phase_ispec_elastic < 0 ) stop 'error elastic simulation: num_phase_ispec_elastic is < zero'
+
+    allocate( phase_ispec_inner_elastic(num_phase_ispec_elastic,2),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array phase_ispec_inner_elastic'
+    phase_ispec_inner_elastic(:,:) = 0
+
+    ispec_inner = 0
+    ispec_outer = 0
+    do ispec = 1, nspec
+      if( ispec_is_elastic(ispec) ) then
+        if( ispec_is_inner(ispec) .eqv. .true. ) then
+          ispec_inner = ispec_inner + 1
+          phase_ispec_inner_elastic(ispec_inner,2) = ispec
+        else
+          ispec_outer = ispec_outer + 1
+          phase_ispec_inner_elastic(ispec_outer,1) = ispec
+        endif
+      endif
+    enddo
+  else
+    ! allocates dummy array
+    num_phase_ispec_elastic = 0
+    allocate( phase_ispec_inner_elastic(num_phase_ispec_elastic,2),stat=ier)
+    if( ier /= 0 ) stop 'error allocating dummy array phase_ispec_inner_elastic'
+    phase_ispec_inner_elastic(:,:) = 0
+  endif
+
+  ! user output
+  if(myrank == 0) then
+    percentage_edge = 100.*count(ispec_is_inner(:))/real(nspec)
+    write(IMAIN,*) '     for overlapping of communications with calculations:'
+    write(IMAIN,*) '     percentage of   edge elements ',100. -percentage_edge,'%'
+    write(IMAIN,*) '     percentage of volume elements ',percentage_edge,'%'
+  endif
+
+  end subroutine crm_setup_inner_outer_elemnts
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine crm_setup_color_perm(myrank,nspec,nglob,ibool,ANISOTROPY,SAVE_MESH_FILES)
+
+! sets up mesh coloring and permutes elements
+
+  use create_regions_mesh_ext_par
+  implicit none
+
+  integer :: myrank,nspec,nglob
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+  logical :: ANISOTROPY,SAVE_MESH_FILES
+
+  ! local parameters
+  integer, dimension(:), allocatable :: perm
+  integer :: ier
+
+  ! user output
+  if(myrank == 0) then
+    write(IMAIN,*) '     use coloring = ',USE_MESH_COLORING_GPU
+  endif
+
+  ! initializes
+  num_colors_outer_acoustic = 0
+  num_colors_inner_acoustic = 0
+  num_colors_outer_elastic = 0
+  num_colors_inner_elastic = 0
+
+  ! mesh coloring
+  if( USE_MESH_COLORING_GPU ) then
+
+    ! creates coloring of elements
+    allocate(perm(nspec),stat=ier)
+    if( ier /= 0 ) stop 'error allocating temporary perm array'
+    perm(:) = 0
+
+    ! acoustic domains
+    if( ACOUSTIC_SIMULATION ) then
+      if( myrank == 0) then
+        write(IMAIN,*) '     acoustic domains: '
+      endif
+      call crm_setup_color(myrank,nspec,nglob,ibool,perm, &
+                          ispec_is_acoustic,1, &
+                          num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+                          SAVE_MESH_FILES)
+    else
+      ! allocates dummy arrays
+      allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array'
+    endif
+
+    ! elastic domains
+    if( ELASTIC_SIMULATION ) then
+      if( myrank == 0) then
+        write(IMAIN,*) '     elastic domains: '
+      endif
+      call crm_setup_color(myrank,nspec,nglob,ibool,perm, &
+                          ispec_is_elastic,2, &
+                          num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+                          SAVE_MESH_FILES)
+    else
+      allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array'
+    endif
+
+    ! checks: after all domains are done
+    if(minval(perm) /= 1) &
+      call exit_MPI(myrank, 'minval(perm) should be 1')
+    if(maxval(perm) /= max(num_phase_ispec_acoustic,num_phase_ispec_elastic)) &
+      call exit_MPI(myrank, 'maxval(perm) should be max(num_phase_ispec_..)')
+
+    ! sorts array according to permutation
+    call sync_all()
+    if(myrank == 0) then
+      write(IMAIN,*) '     mesh permutation:'
+    endif
+    call crm_setup_permutation(myrank,nspec,nglob,ibool,ANISOTROPY,perm, &
+                              SAVE_MESH_FILES)
+
+    deallocate(perm)
+
+  else
+
+    ! allocates dummy arrays
+    allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier)
+    if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array'
+    allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier)
+    if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array'
+
+  endif ! USE_MESH_COLORING_GPU
+
+  end subroutine crm_setup_color_perm
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine crm_setup_color(myrank,nspec,nglob,ibool,perm, &
+                            ispec_is_d,idomain, &
+                            num_phase_ispec_d,phase_ispec_inner_d, &
+                            SAVE_MESH_FILES)
+
+! sets up mesh coloring
+
+  use create_regions_mesh_ext_par
+  implicit none
+
+  integer :: myrank,nspec,nglob
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+  integer, dimension(nspec) :: perm
+
+  ! wrapper array for ispec is in domain:
+  ! idomain acoustic == 1 or elastic == 2
+  integer :: idomain
+  logical, dimension(nspec) :: ispec_is_d
+  integer :: num_phase_ispec_d
+  integer, dimension(num_phase_ispec_d,2) :: phase_ispec_inner_d
+
+  logical :: SAVE_MESH_FILES
+
+  ! local parameters
+  ! added for color permutation
+  integer :: nb_colors_outer_elements,nb_colors_inner_elements
+  integer, dimension(:), allocatable :: num_of_elems_in_this_color
+  integer, dimension(:), allocatable :: color
+  integer, dimension(:), allocatable :: first_elem_number_in_this_color
+  logical, dimension(:), allocatable :: is_on_a_slice_edge
+
+  integer :: nspec_outer,nspec_inner,nspec_domain
+  integer :: nspec_outer_min_global,nspec_outer_max_global
+  integer :: nb_colors,nb_colors_min,nb_colors_max
+
+  integer :: icolor,ispec,ispec_counter
+  integer :: ispec_inner,ispec_outer
+  integer :: ier
+
+  character(len=2),dimension(2) :: str_domain = (/ "ac", "el" /)
+  character(len=256) :: filename
+
+  !!!! David Michea: detection of the edges, coloring and permutation separately
+
+  ! implement mesh coloring for GPUs if needed, to create subsets of disconnected elements
+  ! to remove dependencies and the need for atomic operations in the sum of
+  ! elemental contributions in the solver
+
+  ! allocates temporary array with colors
+  allocate(color(nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating temporary color array'
+  allocate(first_elem_number_in_this_color(MAX_NUMBER_OF_COLORS + 1),stat=ier)
+  if( ier /= 0 ) stop 'error allocating first_elem_number_in_this_color array'
+
+  ! flags for elements on outer rims
+  ! opposite to what is stored in ispec_is_inner
+  allocate(is_on_a_slice_edge(nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating is_on_a_slice_edge array'
+  do ispec = 1,nspec
+    is_on_a_slice_edge(ispec) = .not. ispec_is_inner(ispec)
+  enddo
+
+  ! fast element coloring scheme
+  call get_perm_color_faster(is_on_a_slice_edge,ispec_is_d, &
+                            ibool,perm,color, &
+                            nspec,nglob, &
+                            nb_colors_outer_elements,nb_colors_inner_elements, &
+                            nspec_outer,nspec_inner,nspec_domain, &
+                            first_elem_number_in_this_color, &
+                            myrank)
+
+  ! for the last color, the next color is fictitious and its first (fictitious) element number is nspec + 1
+  first_elem_number_in_this_color(nb_colors_outer_elements + nb_colors_inner_elements + 1) &
+    = nspec_domain + 1
+
+  allocate(num_of_elems_in_this_color(nb_colors_outer_elements + nb_colors_inner_elements),stat=ier)
+  if( ier /= 0 ) stop 'error allocating num_of_elems_in_this_color array'
+
+  num_of_elems_in_this_color(:) = 0
+  do icolor = 1, nb_colors_outer_elements + nb_colors_inner_elements
+    num_of_elems_in_this_color(icolor) = first_elem_number_in_this_color(icolor+1) - first_elem_number_in_this_color(icolor)
+  enddo
+
+  ! check that the sum of all the numbers of elements found in each color is equal
+  ! to the total number of elements in the mesh
+  if(sum(num_of_elems_in_this_color) /= nspec_domain) then
+    print *,'error number of elements in this color:',idomain
+    print *,'rank: ',myrank,' nspec = ',nspec_domain
+    print *,'  total number of elements in all the colors of the mesh = ', &
+      sum(num_of_elems_in_this_color)
+    call exit_MPI(myrank, 'incorrect total number of elements in all the colors of the mesh')
+  endif
+
+  ! check that the sum of all the numbers of elements found in each color for the outer elements is equal
+  ! to the total number of outer elements found in the mesh
+  if(sum(num_of_elems_in_this_color(1:nb_colors_outer_elements)) /= nspec_outer) then
+    print *,'error number of outer elements in this color:',idomain
+    print *,'rank: ',myrank,' nspec_outer = ',nspec_outer
+    print*,'nb_colors_outer_elements = ',nb_colors_outer_elements
+    print *,'total number of elements in all the colors of the mesh for outer elements = ', &
+      sum(num_of_elems_in_this_color(1:nb_colors_outer_elements))
+    call exit_MPI(myrank, 'incorrect total number of elements in all the colors of the mesh for outer elements')
+  endif
+
+  ! debug: file output
+  if( SAVE_MESH_FILES ) then
+    filename = prname(1:len_trim(prname))//'color_'//str_domain(idomain)
+    call write_VTK_data_elem_i(nspec,nglob, &
+                              xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                              color,filename)
+  endif
+
+  deallocate(first_elem_number_in_this_color)
+  deallocate(is_on_a_slice_edge)
+  deallocate(color)
+
+  ! debug: no mesh coloring, only creates dummy coloring arrays
+  if( .false. ) then
+    nb_colors_outer_elements = 0
+    nb_colors_inner_elements = 0
+    ispec_counter = 0
+
+    ! first generate all the outer elements
+    do ispec = 1,nspec
+      if( ispec_is_d(ispec) ) then
+        if( ispec_is_inner(ispec) .eqv. .false. ) then
+          ispec_counter = ispec_counter + 1
+          perm(ispec) = ispec_counter
+        endif
+      endif
+    enddo
+
+    ! store total number of outer elements
+    nspec_outer = ispec_counter
+
+    ! only single color
+    if(nspec_outer > 0 ) nb_colors_outer_elements = 1
+
+    ! then generate all the inner elements
+    do ispec = 1,nspec
+      if( ispec_is_d(ispec) ) then
+        if( ispec_is_inner(ispec) .eqv. .true. ) then
+          ispec_counter = ispec_counter + 1
+          perm(ispec) = ispec_counter - nspec_outer ! starts again at 1
+        endif
+      endif
+    enddo
+    nspec_inner = ispec_counter - nspec_outer
+
+    ! only single color
+    if(nspec_inner > 0 ) nb_colors_inner_elements = 1
+
+    allocate(num_of_elems_in_this_color(nb_colors_outer_elements + nb_colors_inner_elements),stat=ier)
+    if( ier /= 0 ) stop 'error allocating num_of_elems_in_this_color array'
+
+    if( nspec_outer > 0 ) num_of_elems_in_this_color(1) = nspec_outer
+    if( nspec_inner > 0 ) num_of_elems_in_this_color(2) = nspec_inner
+  endif ! debug
+
+  ! debug: saves mesh coloring numbers into files
+  if( SAVE_MESH_FILES ) then
+    ! debug file output
+    filename = prname(1:len_trim(prname))//'num_of_elems_in_this_color_'//str_domain(idomain)//'.dat'
+    open(unit=99,file=trim(filename),status='unknown',iostat=ier)
+    if( ier /= 0 ) stop 'error opening num_of_elems_in_this_color file'
+    ! number of colors for outer elements
+    write(99,*) nb_colors_outer_elements
+    ! number of colors for inner elements
+    write(99,*) nb_colors_inner_elements
+    ! number of elements in each color
+    ! outer elements
+    do icolor = 1, nb_colors_outer_elements + nb_colors_inner_elements
+      write(99,*) num_of_elems_in_this_color(icolor)
+    enddo
+    close(99)
+  endif
+
+  ! sets up domain coloring arrays
+  select case(idomain)
+  case( 1 )
+    ! acoustic domains
+    num_colors_outer_acoustic = nb_colors_outer_elements
+    num_colors_inner_acoustic = nb_colors_inner_elements
+
+    allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier)
+    if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array'
+
+    num_elem_colors_acoustic(:) = num_of_elems_in_this_color(:)
+
+  case( 2 )
+    ! elastic domains
+    num_colors_outer_elastic = nb_colors_outer_elements
+    num_colors_inner_elastic = nb_colors_inner_elements
+
+    allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier)
+    if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array'
+
+    num_elem_colors_elastic(:) = num_of_elems_in_this_color(:)
+
+  case default
+    stop 'error idomain not recognized'
+  end select
+
+  ! sets up elements for loops in simulations
+  ispec_inner = 0
+  ispec_outer = 0
+  do ispec = 1, nspec
+    ! only elements in this domain
+    if( ispec_is_d(ispec) ) then
+
+      ! sets phase_ispec arrays with ordering of elements
+      if( ispec_is_inner(ispec) .eqv. .false. ) then
+        ! outer elements
+        ispec_outer = perm(ispec)
+
+        ! checks
+        if( ispec_outer < 1 .or. ispec_outer > num_phase_ispec_d ) then
+          print*,'error outer permutation:',idomain
+          print*,'rank:',myrank,'  ispec_inner = ',ispec_outer
+          print*,'num_phase_ispec_d = ',num_phase_ispec_d
+          call exit_MPI(myrank,'error outer acoustic permutation')
+        endif
+
+        phase_ispec_inner_d(ispec_outer,1) = ispec
+
+      else
+        ! inner elements
+        ispec_inner = perm(ispec)
+
+        ! checks
+        if( ispec_inner < 1 .or. ispec_inner > num_phase_ispec_d ) then
+          print*,'error inner permutation:',idomain
+          print*,'rank:',myrank,'  ispec_inner = ',ispec_inner
+          print*,'num_phase_ispec_d = ',num_phase_ispec_d
+          call exit_MPI(myrank,'error inner acoustic permutation')
+        endif
+
+        phase_ispec_inner_d(ispec_inner,2) = ispec
+
+      endif
+    endif
+  enddo
+
+  ! total number of colors
+  nb_colors = nb_colors_inner_elements + nb_colors_outer_elements
+  call min_all_i(nb_colors,nb_colors_min)
+  call max_all_i(nb_colors,nb_colors_max)
+
+  ! user output
+  call min_all_i(nspec_outer,nspec_outer_min_global)
+  call max_all_i(nspec_outer,nspec_outer_max_global)
+  call min_all_i(nspec_outer,nspec_outer_min_global)
+  call max_all_i(nspec_outer,nspec_outer_max_global)
+  if(myrank == 0) then
+    write(IMAIN,*) '       colors min = ',nb_colors_min
+    write(IMAIN,*) '       colors max = ',nb_colors_max
+    write(IMAIN,*) '       outer elements: min = ',nspec_outer_min_global
+    write(IMAIN,*) '       outer elements: max = ',nspec_outer_max_global
+  endif
+
+  ! debug: outputs permutation array as vtk file
+  !if( SAVE_MESH_FILES ) then
+  !  filename = prname(1:len_trim(prname))//'perm_'//str_domain(idomain)
+  !  call write_VTK_data_elem_i(nspec,nglob, &
+  !                      xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+  !                      perm,filename)
+  !endif
+
+  deallocate(num_of_elems_in_this_color)
+
+  end subroutine crm_setup_color
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine crm_setup_permutation(myrank,nspec,nglob,ibool,ANISOTROPY,perm, &
+                                  SAVE_MESH_FILES)
+
+  use create_regions_mesh_ext_par
+  implicit none
+
+  integer :: myrank,nspec,nglob
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+  logical :: ANISOTROPY
+
+  integer, dimension(nspec),intent(inout) :: perm
+
+  logical :: SAVE_MESH_FILES
+
+  ! local parameters
+  ! added for sorting
+  integer, dimension(:,:,:,:), allocatable :: temp_array_int
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: temp_array_real
+  logical, dimension(:), allocatable :: temp_array_logical_1D
+
+  integer, dimension(:), allocatable :: temp_perm_global
+  logical, dimension(:), allocatable :: mask_global
+
+  integer :: icolor,icounter,ispec,ielem,ier,i
+  integer :: iface,old_ispec,new_ispec
+
+  character(len=256) :: filename
+
+  ! sorts array according to permutation
+  allocate(temp_perm_global(nspec),stat=ier)
+  if( ier /= 0 ) stop 'error temp_perm_global array'
+
+  ! global ordering
+  temp_perm_global(:) = 0
+  icounter = 0
+
+  ! fills global permutation array
+  ! starts with elastic elements
+  if( ELASTIC_SIMULATION ) then
+    ! first outer elements coloring
+    ! phase element counter
+    ielem = 0
+    do icolor = 1,num_colors_outer_elastic
+      ! loops through elements
+      do i = 1,num_elem_colors_elastic(icolor)
+        ielem = ielem + 1
+        ispec = phase_ispec_inner_elastic(ielem,1) ! 1 <-- first phase, outer elements
+        ! reorders elements
+        icounter = icounter + 1
+        temp_perm_global(ispec) = icounter
+        ! resets to new order
+        phase_ispec_inner_elastic(ielem,1) = icounter
+      enddo
+    enddo
+    ! inner elements coloring
+    ielem = 0
+    do icolor = num_colors_outer_elastic+1,num_colors_outer_elastic+num_colors_inner_elastic
+      ! loops through elements
+      do i = 1,num_elem_colors_elastic(icolor)
+        ielem = ielem + 1
+        ispec = phase_ispec_inner_elastic(ielem,2) ! 2 <-- second phase, inner elements
+        ! reorders elements
+        icounter = icounter + 1
+        temp_perm_global(ispec) = icounter
+        ! resets to new order
+        phase_ispec_inner_elastic(ielem,2) = icounter
+      enddo
+    enddo
+  endif
+
+  ! continues with acoustic elements
+  if( ACOUSTIC_SIMULATION ) then
+    ! first outer elements coloring
+    ! phase element counter
+    ielem = 0
+    do icolor = 1,num_colors_outer_acoustic
+      ! loops through elements
+      do i = 1,num_elem_colors_acoustic(icolor)
+        ielem = ielem + 1
+        ispec = phase_ispec_inner_acoustic(ielem,1) ! 1 <-- first phase, outer elements
+        ! reorders elements
+        icounter = icounter + 1
+        temp_perm_global(ispec) = icounter
+        ! resets to new order
+        phase_ispec_inner_acoustic(ielem,1) = icounter
+      enddo
+    enddo
+    ! inner elements coloring
+    ielem = 0
+    do icolor = num_colors_outer_acoustic+1,num_colors_outer_acoustic+num_colors_inner_acoustic
+      ! loops through elements
+      do i = 1,num_elem_colors_acoustic(icolor)
+        ielem = ielem + 1
+        ispec = phase_ispec_inner_acoustic(ielem,2) ! 2 <-- second phase, inner elements
+        ! reorders elements
+        icounter = icounter + 1
+        temp_perm_global(ispec) = icounter
+        ! resets to new order
+        phase_ispec_inner_acoustic(ielem,2) = icounter
+      enddo
+    enddo
+  endif
+
+  ! checks
+  if( icounter /= nspec ) then
+    print*,'error temp perm: ',icounter,nspec
+    stop 'error temporary global permutation incomplete'
+  endif
+
+  if(minval(temp_perm_global) /= 1) call exit_MPI(myrank, 'minval(temp_perm_global) should be 1')
+  if(maxval(temp_perm_global) /= nspec) call exit_MPI(myrank, 'maxval(temp_perm_global) should be nspec')
+
+  ! checks if every element was set
+  allocate(mask_global(nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating temporary mask_global'
+  mask_global(:) = .false.
+  icounter = 0 ! counts permutations
+  do ispec = 1, nspec
+    new_ispec = temp_perm_global(ispec)
+    ! checks bounds
+    if( new_ispec < 1 .or. new_ispec > nspec ) call exit_MPI(myrank,'error temp_perm_global ispec bounds')
+    ! checks if already set
+    if( mask_global(new_ispec) ) then
+      print*,'error temp_perm_global:',ispec,new_ispec,'element already set'
+      call exit_MPI(myrank,'error global permutation')
+    else
+      mask_global(new_ispec) = .true.
+    endif
+    ! counts permutations
+    if( new_ispec /= ispec ) icounter = icounter + 1
+  enddo
+
+  ! checks number of set elements
+  if( count(mask_global(:)) /= nspec ) then
+    print*,'error temp_perm_global:',count(mask_global(:)),nspec,'permutation incomplete'
+    call exit_MPI(myrank,'error global permutation incomplete')
+  endif
+  deallocate(mask_global)
+
+  ! user output
+  if(myrank == 0) then
+    write(IMAIN,*) '       number of permutations = ',icounter
+  endif
+
+  ! outputs permutation array as vtk file
+  if( SAVE_MESH_FILES ) then
+    filename = prname(1:len_trim(prname))//'perm_global'
+    call write_VTK_data_elem_i(nspec,nglob, &
+                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                        temp_perm_global,filename)
+  endif
+
+  ! store as new permutation
+  perm(:) = temp_perm_global(:)
+  deallocate(temp_perm_global)
+
+  ! permutes all required mesh arrays according to new ordering
+
+  ! permutation of ibool
+  allocate(temp_array_int(NGLLX,NGLLY,NGLLZ,nspec))
+  call permute_elements_integer(ibool,temp_array_int,perm,nspec)
+  deallocate(temp_array_int)
+
+  ! element domain flags
+  allocate(temp_array_logical_1D(nspec))
+  call permute_elements_logical1D(ispec_is_acoustic,temp_array_logical_1D,perm,nspec)
+  call permute_elements_logical1D(ispec_is_elastic,temp_array_logical_1D,perm,nspec)
+  call permute_elements_logical1D(ispec_is_poroelastic,temp_array_logical_1D,perm,nspec)
+  call permute_elements_logical1D(ispec_is_inner,temp_array_logical_1D,perm,nspec)
+  deallocate(temp_array_logical_1D)
+
+  ! mesh arrays
+  allocate(temp_array_real(NGLLX,NGLLY,NGLLZ,nspec))
+  call permute_elements_real(xixstore,temp_array_real,perm,nspec)
+  call permute_elements_real(xiystore,temp_array_real,perm,nspec)
+  call permute_elements_real(xizstore,temp_array_real,perm,nspec)
+  call permute_elements_real(etaxstore,temp_array_real,perm,nspec)
+  call permute_elements_real(etaystore,temp_array_real,perm,nspec)
+  call permute_elements_real(etazstore,temp_array_real,perm,nspec)
+  call permute_elements_real(gammaxstore,temp_array_real,perm,nspec)
+  call permute_elements_real(gammaystore,temp_array_real,perm,nspec)
+  call permute_elements_real(gammazstore,temp_array_real,perm,nspec)
+  call permute_elements_real(jacobianstore,temp_array_real,perm,nspec)
+
+  call permute_elements_real(kappastore,temp_array_real,perm,nspec)
+  call permute_elements_real(mustore,temp_array_real,perm,nspec)
+
+  ! acoustic arrays
+  if( ACOUSTIC_SIMULATION ) then
+    call permute_elements_real(rhostore,temp_array_real,perm,nspec)
+  endif
+
+  ! elastic arrays
+  if( ELASTIC_SIMULATION ) then
+    call permute_elements_real(rho_vp,temp_array_real,perm,nspec)
+    call permute_elements_real(rho_vs,temp_array_real,perm,nspec)
+    if( ANISOTROPY ) then
+      call permute_elements_real(c11store,temp_array_real,perm,nspec)
+      call permute_elements_real(c12store,temp_array_real,perm,nspec)
+      call permute_elements_real(c13store,temp_array_real,perm,nspec)
+      call permute_elements_real(c14store,temp_array_real,perm,nspec)
+      call permute_elements_real(c15store,temp_array_real,perm,nspec)
+      call permute_elements_real(c16store,temp_array_real,perm,nspec)
+      call permute_elements_real(c22store,temp_array_real,perm,nspec)
+      call permute_elements_real(c23store,temp_array_real,perm,nspec)
+      call permute_elements_real(c24store,temp_array_real,perm,nspec)
+      call permute_elements_real(c25store,temp_array_real,perm,nspec)
+      call permute_elements_real(c33store,temp_array_real,perm,nspec)
+      call permute_elements_real(c34store,temp_array_real,perm,nspec)
+      call permute_elements_real(c35store,temp_array_real,perm,nspec)
+      call permute_elements_real(c36store,temp_array_real,perm,nspec)
+      call permute_elements_real(c44store,temp_array_real,perm,nspec)
+      call permute_elements_real(c45store,temp_array_real,perm,nspec)
+      call permute_elements_real(c46store,temp_array_real,perm,nspec)
+      call permute_elements_real(c55store,temp_array_real,perm,nspec)
+      call permute_elements_real(c56store,temp_array_real,perm,nspec)
+      call permute_elements_real(c66store,temp_array_real,perm,nspec)
+    endif
+  endif
+  deallocate(temp_array_real)
+
+  ! boundary surface
+  if( num_abs_boundary_faces > 0 ) then
+    do iface = 1,num_abs_boundary_faces
+      old_ispec = abs_boundary_ispec(iface)
+      new_ispec = perm(old_ispec)
+      abs_boundary_ispec(iface) = new_ispec
+    enddo
+  endif
+
+  ! free surface
+  if( num_free_surface_faces > 0 ) then
+    do iface = 1,num_free_surface_faces
+      old_ispec = free_surface_ispec(iface)
+      new_ispec = perm(old_ispec)
+      free_surface_ispec(iface) = new_ispec
+    enddo
+  endif
+
+  ! coupling surface
+  if( num_coupling_ac_el_faces > 0 ) then
+    do iface = 1,num_coupling_ac_el_faces
+      old_ispec = coupling_ac_el_ispec(iface)
+      new_ispec = perm(old_ispec)
+      coupling_ac_el_ispec(iface) = new_ispec
+    enddo
+  endif
+
+  ! moho surface
+  if( NSPEC2D_MOHO > 0 ) then
+    allocate(temp_array_logical_1D(nspec))
+    call permute_elements_logical1D(is_moho_top,temp_array_logical_1D,perm,nspec)
+    call permute_elements_logical1D(is_moho_bot,temp_array_logical_1D,perm,nspec)
+    deallocate(temp_array_logical_1D)
+    do iface = 1,NSPEC2D_MOHO
+      ! top
+      old_ispec = ibelm_moho_top(iface)
+      new_ispec = perm(old_ispec)
+      ibelm_moho_top(iface) = new_ispec
+      ! bottom
+      old_ispec = ibelm_moho_bot(iface)
+      new_ispec = perm(old_ispec)
+      ibelm_moho_bot(iface) = new_ispec
+    enddo
+  endif
+
+  end subroutine crm_setup_permutation

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -848,6 +848,11 @@
       write(IMAIN,*) '  moho surfaces: ',num_moho
     endif
     call sync_all()
+  else
+    ! allocate dummy array
+    nspec2D_moho_ext = 0
+    allocate(ibelm_moho(nspec2D_moho_ext),nodes_ibelm_moho(4,nspec2D_moho_ext),stat=ier)
+    if( ier /= 0 ) stop 'error allocating dumy array ibelm_moho etc.'
   endif
 
   close(IIN)
@@ -898,7 +903,8 @@
     write(IMAIN,*) 'create regions: '
   endif
   call create_regions_mesh_ext(ibool, &
-                        xstore, ystore, zstore, nspec, npointot, myrank, LOCAL_PATH, &
+                        xstore, ystore, zstore, nspec, &
+                        npointot, myrank, LOCAL_PATH, &
                         nnodes_ext_mesh, nelmnts_ext_mesh, &
                         nodes_coords_ext_mesh, elmnts_ext_mesh, &
                         max_static_memory_size, mat_ext_mesh, materials_ext_mesh, &
@@ -912,19 +918,22 @@
                         ibelm_xmin, ibelm_xmax, ibelm_ymin, ibelm_ymax, ibelm_bottom, ibelm_top, &
                         nodes_ibelm_xmin,nodes_ibelm_xmax,nodes_ibelm_ymin,nodes_ibelm_ymax, &
                         nodes_ibelm_bottom,nodes_ibelm_top, &
-                        SAVE_MESH_FILES,nglob, &
+                        SAVE_MESH_FILES,&
+                        nglob, &
                         ANISOTROPY,NPROC,OCEANS,TOPOGRAPHY, &
                         ATTENUATION,USE_OLSEN_ATTENUATION, &
                         UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION,NX_TOPO,NY_TOPO, &
                         ORIG_LAT_TOPO,ORIG_LONG_TOPO,DEGREES_PER_CELL_TOPO, &
-                        itopo_bathy)
+                        itopo_bathy, &
+                        nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho)
 
+! now done inside create_regions_mesh_ext routine...
 ! Moho boundary parameters, 2-D jacobians and normals
-  if( SAVE_MOHO_MESH ) then
-    call create_regions_mesh_save_moho(myrank,nglob,nspec, &
-                        nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho, &
-                        nodes_coords_ext_mesh,nnodes_ext_mesh,ibool )
-  endif
+!  if( SAVE_MOHO_MESH ) then
+!    call create_regions_mesh_save_moho(myrank,nglob,nspec, &
+!                        nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho, &
+!                        nodes_coords_ext_mesh,nnodes_ext_mesh,ibool )
+!  endif
 
 ! defines global number of nodes in model
   NGLOB_AB = nglob

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -24,6 +24,9 @@
 !
 !=====================================================================
 
+
+! wrapper function
+
   subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
                         materials_ext_mesh,nmat_ext_mesh, &
                         undef_mat_prop,nundefMat_ext_mesh, &
@@ -34,7 +37,59 @@
 
   ! number of spectral elements in each block
   integer :: myrank,nspec
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+  ! external mesh
+  integer :: nelmnts_ext_mesh
+  integer :: nmat_ext_mesh,nundefMat_ext_mesh
+  integer, dimension(2,nelmnts_ext_mesh) :: mat_ext_mesh
+  double precision, dimension(6,nmat_ext_mesh) :: materials_ext_mesh
+  character (len=30), dimension(6,nundefMat_ext_mesh):: undef_mat_prop
+  ! anisotropy
+  logical :: ANISOTROPY
+  character(len=256) LOCAL_PATH
 
+  !----------------------------------------------------------
+  ! USER Parameter
+  ! daniel: TODO -- uses Piero's get_model_PREM routine rather than default one
+  logical,parameter :: USE_PIERO_MODEL = .true.
+
+  !----------------------------------------------------------
+
+  if( USE_PIERO_MODEL ) then
+    ! Using PREM model instead
+    call get_model_PREM(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
+                        materials_ext_mesh,nmat_ext_mesh, &
+                        undef_mat_prop,nundefMat_ext_mesh, &
+                        ANISOTROPY,LOCAL_PATH)
+
+  else
+    ! DEFAULT routine
+    call get_model_d(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
+                    materials_ext_mesh,nmat_ext_mesh, &
+                    undef_mat_prop,nundefMat_ext_mesh, &
+                    ANISOTROPY,LOCAL_PATH)
+  endif
+
+  end subroutine get_model
+
+  !
+  !-----------------------------------------------------------------------------------------------
+  !
+
+! default get model routine
+
+  subroutine get_model_d(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
+                        materials_ext_mesh,nmat_ext_mesh, &
+                        undef_mat_prop,nundefMat_ext_mesh, &
+                        ANISOTROPY,LOCAL_PATH)
+
+
+  use create_regions_mesh_ext_par
+  implicit none
+
+  ! number of spectral elements in each block
+  integer :: myrank,nspec
+
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
 
   ! external mesh
@@ -360,8 +415,11 @@
 
   endif ! USE_EXTERNAL_FILES
 
-  end subroutine get_model
+  end subroutine get_model_d
 
+!
+!-------------------------------------------------------------------------------------------------
+!
   subroutine get_model_PREM(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
                         materials_ext_mesh,nmat_ext_mesh, &
                         undef_mat_prop,nundefMat_ext_mesh, &

Added: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_perm_color.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_perm_color.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_perm_color.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -0,0 +1,1048 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+
+! define sets of colors that contain disconnected elements for the CUDA solver.
+! also split the elements into two subsets: inner and outer elements, in order
+! to be able to compute the outer elements first in the solver and then
+! start non-blocking MPI calls and overlap them with the calculation of the inner elements
+! (which works fine because there are always far more inner elements than outer elements)
+
+!*********************************************************************************************************
+! Mila
+
+! daniel: modified routines to use element domain flags given in ispec_is_d, thus
+!             coloring only acoustic or elastic (or..) elements in one run, then repeat run for other domains.
+!             also, the permutation re-starts at 1 for outer and for inner elements,
+!             making it usable for the phase_ispec_inner_** arrays for acoustic and elastic elements.
+
+  subroutine get_perm_color_faster(is_on_a_slice_edge,ispec_is_d, &
+                                  ibool,perm,color, &
+                                  nspec,nglob, &
+                                  nb_colors_outer_elements,nb_colors_inner_elements, &
+                                  nspec_outer,nspec_inner,nspec_domain, &
+                                  first_elem_number_in_this_color, &
+                                  myrank)
+
+  implicit none
+
+  include "constants.h"
+
+  integer, intent(in) :: nspec, nglob
+  logical, dimension(nspec), intent(in) :: is_on_a_slice_edge
+  logical, dimension(nspec), intent(in) :: ispec_is_d
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec), intent(in) :: ibool
+  integer, dimension(nspec),intent(inout) :: perm
+
+  integer, dimension(nspec),intent(inout) :: color
+  integer, dimension(MAX_NUMBER_OF_COLORS+1),intent(inout) :: first_elem_number_in_this_color
+  integer, intent(out) :: nb_colors_outer_elements,nb_colors_inner_elements
+
+  integer, intent(out) :: nspec_outer,nspec_inner,nspec_domain
+  integer, intent(in) :: myrank
+
+  ! local variables
+  integer :: nb_colors
+
+  ! coloring algorithm
+  call get_color_faster(ibool, is_on_a_slice_edge, ispec_is_d, &
+                        myrank, nspec, nglob, &
+                        color, nb_colors_outer_elements, nb_colors_inner_elements, &
+                        nspec_outer,nspec_inner,nspec_domain)
+
+  !debug
+  !if(myrank == 0) then
+  !  print*, 'rank :',myrank,' - colors:'
+  !  print*, '   number of colors for inner elements = ',nb_colors_inner_elements
+  !  print*, '   number of colors for outer elements = ',nb_colors_outer_elements
+  !  print*, '   total number of colors (sum of both) = ', nb_colors_inner_elements + nb_colors_outer_elements
+  !  print*, 'rank :',myrank,' - elements:'
+  !  print*, '   number of elements for outer elements  = ',nspec_outer
+  !  print*, '   number of elements for inner elements  = ',nspec_inner
+  !  print*, '   total number of elements for domain elements  = ',nspec_domain
+  !endif
+  !if(myrank == 0) print*, '  generating the final colors'
+
+  ! total number of colors used
+  nb_colors = nb_colors_inner_elements+nb_colors_outer_elements
+  first_elem_number_in_this_color(:) = 0
+
+  ! gets element permutation depending on colors
+  call get_final_perm(color,perm,first_elem_number_in_this_color(1:nb_colors), &
+                     nspec,nb_colors,nb_colors_outer_elements, &
+                     ispec_is_d,nspec_domain)
+
+  !debug
+  !if(myrank == 0) print*, '  done with mesh coloring and inner/outer element splitting'
+
+  end subroutine get_perm_color_faster
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine get_color_faster(ibool, is_on_a_slice_edge, ispec_is_d, &
+                             myrank, nspec, nglob, &
+                             color, nb_colors_outer_elements, nb_colors_inner_elements, &
+                             nspec_outer,nspec_inner,nspec_domain)
+
+  implicit none
+
+  include "constants.h"
+
+  integer nspec,nglob
+  logical, dimension(nspec) :: is_on_a_slice_edge,ispec_is_d
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+  integer, dimension(nspec) :: color
+  integer :: nb_colors_outer_elements,nb_colors_inner_elements,myrank
+
+  integer :: nspec_outer,nspec_inner,nspec_domain
+
+  ! local variables
+  integer :: ispec
+  !! DK DK for mesh coloring GPU Joseph Fourier
+  logical, dimension(:), allocatable :: mask_ibool
+  integer :: icolor, nb_already_colored
+  integer :: iglob1,iglob2,iglob3,iglob4,iglob5,iglob6,iglob7,iglob8
+  logical :: conflict_found_need_new_color
+
+  ! user output
+  if( myrank == 0 ) then
+    write(IMAIN,*) '     fast coloring mesh algorithm'
+  endif
+
+  ! counts number of elements for inner, outer and total domain
+  nspec_outer = 0
+  nspec_inner = 0
+  nspec_domain = 0
+  do ispec=1,nspec
+    if(ispec_is_d(ispec)) then
+      if(is_on_a_slice_edge(ispec)) then
+        nspec_outer=nspec_outer+1
+      else
+        nspec_inner=nspec_inner+1
+      endif
+      nspec_domain=nspec_domain+1
+    endif
+  enddo
+
+  !! DK DK start mesh coloring (new Apr 2010 version by DK for GPU Joseph Fourier)
+  allocate(mask_ibool(nglob))
+
+  !! DK DK ----------------------------------
+  !! DK DK color the mesh in the crust_mantle
+  !! DK DK ----------------------------------
+
+  ! debug
+  !if(myrank == 0) then
+  !  print *
+  !  print *,'----------------------------------'
+  !  print *,'coloring the mesh'
+  !  print *,'----------------------------------'
+  !  print *
+  !endif
+
+  ! first set color of all elements to 0
+  color(:) = 0
+  icolor = 0
+  nb_already_colored = 0
+
+  ! colors outer elements
+  do while( nb_already_colored < nspec_outer )
+    icolor = icolor + 1
+
+    333 continue
+
+    ! debug: user output
+    !if(myrank == 0) then
+    !  print *,'  analyzing color ',icolor,' - outer elements'
+    !endif
+
+    ! resets flags
+    mask_ibool(:) = .false.
+    conflict_found_need_new_color = .false.
+
+    ! finds un-colored elements
+    do ispec = 1,nspec
+      ! domain elements only
+      if( ispec_is_d(ispec) ) then
+        ! outer elements
+        if( is_on_a_slice_edge(ispec) ) then
+          if(color(ispec) == 0) then
+            ! the eight corners of the current element
+            iglob1=ibool(1,1,1,ispec)
+            iglob2=ibool(NGLLX,1,1,ispec)
+            iglob3=ibool(NGLLX,NGLLY,1,ispec)
+            iglob4=ibool(1,NGLLY,1,ispec)
+            iglob5=ibool(1,1,NGLLZ,ispec)
+            iglob6=ibool(NGLLX,1,NGLLZ,ispec)
+            iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+            iglob8=ibool(1,NGLLY,NGLLZ,ispec)
+
+            if(mask_ibool(iglob1) .or. mask_ibool(iglob2) .or. mask_ibool(iglob3) .or. mask_ibool(iglob4) .or. &
+               mask_ibool(iglob5) .or. mask_ibool(iglob6) .or. mask_ibool(iglob7) .or. mask_ibool(iglob8)) then
+              ! if element of this color has a common point with another element of that same color
+              ! then we need to create a new color, i.e., increment the color of the current element
+              conflict_found_need_new_color = .true.
+            else
+              color(ispec) = icolor
+              nb_already_colored = nb_already_colored + 1
+              mask_ibool(iglob1) = .true.
+              mask_ibool(iglob2) = .true.
+              mask_ibool(iglob3) = .true.
+              mask_ibool(iglob4) = .true.
+              mask_ibool(iglob5) = .true.
+              mask_ibool(iglob6) = .true.
+              mask_ibool(iglob7) = .true.
+              mask_ibool(iglob8) = .true.
+            endif
+          endif
+        endif
+      endif
+    enddo
+
+    ! debug: user output
+    !if(myrank == 0) then
+    !  print *,'  done ',(100.0*nb_already_colored)/nspec_domain,'% of ',nspec_domain,'elements'
+    !endif
+
+    if(conflict_found_need_new_color) then
+      icolor = icolor + 1
+      if( icolor >= MAX_NUMBER_OF_COLORS ) stop 'error MAX_NUMBER_OF_COLORS too small'
+      goto 333
+    endif
+  enddo
+
+  nb_colors_outer_elements = icolor
+
+  ! colors inner elements
+  do while(nb_already_colored < nspec_domain)
+    icolor = icolor + 1
+
+    334 continue
+    !if(myrank == 0) print *,'analyzing color ',icolor,' for all the elements of the mesh'
+
+    ! debug: user output
+    !if(myrank == 0) then
+    !  print *,'  analyzing color ',icolor,' - inner elements'
+    !endif
+
+    ! resets flags
+    mask_ibool(:) = .false.
+    conflict_found_need_new_color = .false.
+
+    do ispec = 1,nspec
+      ! domain elements only
+      if(ispec_is_d(ispec)) then
+        ! inner elements
+        if (.not. is_on_a_slice_edge(ispec)) then
+          if(color(ispec) == 0) then
+            ! the eight corners of the current element
+            iglob1=ibool(1,1,1,ispec)
+            iglob2=ibool(NGLLX,1,1,ispec)
+            iglob3=ibool(NGLLX,NGLLY,1,ispec)
+            iglob4=ibool(1,NGLLY,1,ispec)
+            iglob5=ibool(1,1,NGLLZ,ispec)
+            iglob6=ibool(NGLLX,1,NGLLZ,ispec)
+            iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+            iglob8=ibool(1,NGLLY,NGLLZ,ispec)
+
+            if(mask_ibool(iglob1) .or. mask_ibool(iglob2) .or. mask_ibool(iglob3) .or. mask_ibool(iglob4) .or. &
+               mask_ibool(iglob5) .or. mask_ibool(iglob6) .or. mask_ibool(iglob7) .or. mask_ibool(iglob8)) then
+              ! if element of this color has a common point with another element of that same color
+              ! then we need to create a new color, i.e., increment the color of the current element
+              conflict_found_need_new_color = .true.
+            else
+              color(ispec) = icolor
+              nb_already_colored = nb_already_colored + 1
+              mask_ibool(iglob1) = .true.
+              mask_ibool(iglob2) = .true.
+              mask_ibool(iglob3) = .true.
+              mask_ibool(iglob4) = .true.
+              mask_ibool(iglob5) = .true.
+              mask_ibool(iglob6) = .true.
+              mask_ibool(iglob7) = .true.
+              mask_ibool(iglob8) = .true.
+            endif
+          endif
+        endif
+      endif
+    enddo
+
+    ! debug user output
+    !if(myrank == 0) then
+    !  print *,'  done ',(100.0*nb_already_colored)/nspec_domain,'% of ',nspec_domain,'elements'
+    !endif
+
+    if(conflict_found_need_new_color) then
+      icolor = icolor + 1
+      if( icolor >= MAX_NUMBER_OF_COLORS ) stop 'error MAX_NUMBER_OF_COLORS too small'
+      goto 334
+    endif
+  enddo
+
+  nb_colors_inner_elements = icolor - nb_colors_outer_elements
+
+  ! debug
+  !if(myrank == 0) then
+  !  print *
+  !  print *,'  created a total of ',maxval(color),' colors in this domain' ! 'for all the domain elements of the mesh'
+  !  print *
+  !endif
+
+  !!!!!!!! DK DK now check that all the color sets are independent
+  do icolor = 1,maxval(color)
+    mask_ibool(:) = .false.
+    do ispec = 1,nspec
+      ! domain elements only
+      if(ispec_is_d(ispec)) then
+        if(color(ispec) == icolor ) then
+          ! the eight corners of the current element
+          iglob1=ibool(1,1,1,ispec)
+          iglob2=ibool(NGLLX,1,1,ispec)
+          iglob3=ibool(NGLLX,NGLLY,1,ispec)
+          iglob4=ibool(1,NGLLY,1,ispec)
+          iglob5=ibool(1,1,NGLLZ,ispec)
+          iglob6=ibool(NGLLX,1,NGLLZ,ispec)
+          iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+          iglob8=ibool(1,NGLLY,NGLLZ,ispec)
+
+          if(mask_ibool(iglob1) .or. mask_ibool(iglob2) .or. mask_ibool(iglob3) .or. mask_ibool(iglob4) .or. &
+             mask_ibool(iglob5) .or. mask_ibool(iglob6) .or. mask_ibool(iglob7) .or. mask_ibool(iglob8)) then
+            ! if element of this color has a common point with another element of that same color
+            ! then there is a problem, the color set is not correct
+            print*,'error check color:',icolor
+            stop 'error detected: found a common point inside a color set'
+          else
+            mask_ibool(iglob1) = .true.
+            mask_ibool(iglob2) = .true.
+            mask_ibool(iglob3) = .true.
+            mask_ibool(iglob4) = .true.
+            mask_ibool(iglob5) = .true.
+            mask_ibool(iglob6) = .true.
+            mask_ibool(iglob7) = .true.
+            mask_ibool(iglob8) = .true.
+          endif
+        endif
+      endif
+    enddo
+
+    !debug
+    !if(myrank == 0) print *,'  color ',icolor,' has disjoint elements only and is therefore OK'
+    !if(myrank == 0) print *,'  color ',icolor,' contains ',count(color == icolor),' elements'
+  enddo
+
+  ! debug
+  !if(myrank == 0) then
+  !  print *,'  the ',maxval(color),' color sets are OK'
+  !  print *
+  !endif
+
+  deallocate(mask_ibool)
+
+  end subroutine get_color_faster
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine get_final_perm(color,perm,first_elem_number_in_this_color, &
+                            nspec,nb_colors,nb_colors_outer_elements, &
+                            ispec_is_d,nspec_domain)
+
+  integer, intent(in) :: nspec,nb_colors
+
+  integer,dimension(nspec), intent(in) :: color
+  integer,dimension(nspec), intent(inout) :: perm
+
+  integer, intent(inout) :: first_elem_number_in_this_color(nb_colors)
+
+  logical,dimension(nspec),intent(in) :: ispec_is_d
+
+  integer,intent(in) :: nb_colors_outer_elements,nspec_domain
+
+  ! local parameters
+  integer :: ispec,icolor,icounter,counter_outer
+
+  ! note: permutations are only valid within each domain
+  !          also, the counters start at 1 for each inner/outer element range
+
+  ! outer elements first ( note: inner / outer order sensitive)
+  icounter = 1
+  do icolor = 1, nb_colors_outer_elements
+    first_elem_number_in_this_color(icolor) = icounter
+    do ispec = 1, nspec
+      ! elements in this domain only
+      if( ispec_is_d(ispec) ) then
+        if(color(ispec) == icolor) then
+          perm(ispec) = icounter
+          icounter = icounter + 1
+        endif
+      endif
+    enddo
+  enddo
+  counter_outer = icounter - 1
+
+  ! inner elements second
+  icounter = 1
+  do icolor = nb_colors_outer_elements+1, nb_colors
+    first_elem_number_in_this_color(icolor) = icounter + counter_outer
+    do ispec = 1, nspec
+      ! elements in this domain only
+      if( ispec_is_d(ispec) ) then
+        ! outer elements
+        if(color(ispec) == icolor) then
+          perm(ispec) = icounter
+          icounter = icounter + 1
+        endif
+      endif
+    enddo
+  enddo
+
+  ! checks
+  if( counter_outer + icounter -1 /= nspec_domain ) then
+    print*,'error: perm: ',nspec_domain,counter_outer,icounter,counter_outer+icounter-1
+    stop 'error get_final_perm: counter incomplete'
+  endif
+
+  end subroutine get_final_perm
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! unused...
+!  subroutine get_perm_color(is_on_a_slice_edge,ispec_is_d, &
+!                            ibool,perm,nspec,nglob, &
+!                            nb_colors_outer_elements,nb_colors_inner_elements, &
+!                            nspec_outer,first_elem_number_in_this_color,myrank)
+!
+!  implicit none
+!
+!  include "constants.h"
+!  integer, parameter :: NGNOD_HEXAHEDRA = 8
+!
+!  logical, dimension(nspec) :: is_on_a_slice_edge,ispec_is_d
+!
+!  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+!  integer, dimension(nspec) :: perm
+!  integer, dimension(nspec) :: color
+!  integer, dimension(MAX_NUMBER_OF_COLORS+1) :: first_elem_number_in_this_color
+!  integer :: nb_colors_outer_elements,nb_colors_inner_elements,nspec_outer,myrank
+!
+!  ! local variables
+!  integer nspec,nglob_GLL_full
+!
+!  ! a neighbor of a hexahedral node is a hexahedron that shares a face with it -> max degree of a node = 6
+!  integer, parameter :: MAX_NUMBER_OF_NEIGHBORS = 100
+!
+!  ! global corner numbers that need to be created
+!  integer, dimension(nglob) :: global_corner_number
+!
+!  integer mn(nspec*NGNOD_HEXAHEDRA),mp(nspec+1)
+!  integer, dimension(:), allocatable :: ne,np,adj
+!  integer xadj(nspec+1)
+!
+!  !logical maskel(nspec)
+!
+!  integer i,istart,istop,number_of_neighbors
+!
+!  integer nglob_eight_corners_only,nglob
+!
+!  ! only count the total size of the array that will be created, or actually create it
+!  logical count_only
+!  integer total_size_ne,total_size_adj
+!
+!
+!  ! total number of points in the mesh
+!  nglob_GLL_full = nglob
+!
+!  !---- call Charbel Farhat's routines
+!  if(myrank == 0) &
+!    write(IMAIN,*) 'calling form_elt_connectivity_foelco to perform mesh coloring and inner/outer element splitting'
+!  call form_elt_connectivity_foelco(mn,mp,nspec,global_corner_number,nglob_GLL_full,ibool,nglob_eight_corners_only)
+!  do i=1,nspec
+!    istart = mp(i)
+!    istop = mp(i+1) - 1
+!  enddo
+!
+!  ! count only, to determine the size needed for the array
+!  allocate(np(nglob_eight_corners_only+1))
+!  count_only = .true.
+!  total_size_ne = 1
+!  if(myrank == 0) write(IMAIN,*) 'calling form_node_connectivity_fonoco to determine the size of the table'
+!  allocate(ne(total_size_ne))
+!  call form_node_connectivity_fonoco(mn,mp,ne,np,nglob_eight_corners_only,nspec,count_only,total_size_ne)
+!  deallocate(ne)
+!
+!  !print *, 'nglob_eight_corners_only'
+!  !print *, nglob_eight_corners_only
+!
+!  ! allocate the array with the right size
+!  allocate(ne(total_size_ne))
+!
+!  ! now actually generate the array
+!  count_only = .false.
+!  if(myrank == 0) write(IMAIN,*) 'calling form_node_connectivity_fonoco to actually create the table'
+!  call form_node_connectivity_fonoco(mn,mp,ne,np,nglob_eight_corners_only,nspec,count_only,total_size_ne)
+!  do i=1,nglob_eight_corners_only
+!    istart = np(i)
+!    istop = np(i+1) - 1
+!  enddo
+!
+!  !print *, 'total_size_ne'
+!  !print *, total_size_ne
+!
+!  ! count only, to determine the size needed for the array
+!  count_only = .true.
+!  total_size_adj = 1
+!  if(myrank == 0) write(IMAIN,*) 'calling create_adjacency_table_adjncy to determine the size of the table'
+!  allocate(adj(total_size_adj))
+!  !call create_adjacency_table_adjncy(mn,mp,ne,np,adj,xadj,maskel,nspec,nglob_eight_corners_only,&
+!  !count_only,total_size_ne,total_size_adj,.false.)
+!  call create_adjacency_table_adjncy(mn,mp,ne,np,adj,xadj,nspec,nglob_eight_corners_only,&
+!  count_only,total_size_ne,total_size_adj,.false.)
+!  deallocate(adj)
+!
+!  ! allocate the array with the right size
+!  allocate(adj(total_size_adj))
+!
+!  ! now actually generate the array
+!  count_only = .false.
+!  if(myrank == 0) write(IMAIN,*) 'calling create_adjacency_table_adjncy again to actually create the table'
+!  !call create_adjacency_table_adjncy(mn,mp,ne,np,adj,xadj,maskel,nspec,nglob_eight_corners_only,&
+!  !count_only,total_size_ne,total_size_adj,.false.)
+!  call create_adjacency_table_adjncy(mn,mp,ne,np,adj,xadj,nspec,nglob_eight_corners_only,&
+!  count_only,total_size_ne,total_size_adj,.false.)
+!
+!  do i=1,nspec
+!    istart = xadj(i)
+!    istop = xadj(i+1) - 1
+!    number_of_neighbors = istop-istart+1
+!    if(number_of_neighbors < 1 .or. number_of_neighbors > MAX_NUMBER_OF_NEIGHBORS) stop 'incorrect number of neighbors'
+!  enddo
+!
+!  deallocate(ne,np)
+!
+!  call get_color(adj,xadj,color,nspec,total_size_adj, &
+!                is_on_a_slice_edge,ispec_is_d, &
+!                nb_colors_outer_elements,nb_colors_inner_elements,nspec_outer)
+!
+!  if(myrank == 0) then
+!    write(IMAIN,*) '  number of colors of the graph for inner elements = ',nb_colors_inner_elements
+!    write(IMAIN,*) '  number of colors of the graph for outer elements = ',nb_colors_outer_elements
+!    write(IMAIN,*) '  total number of colors of the graph (sum of both) = ', nb_colors_inner_elements + nb_colors_outer_elements
+!    write(IMAIN,*) '  number of elements of the graph for outer elements = ',nspec_outer
+!  endif
+!
+!  deallocate(adj)
+!
+!  if(myrank == 0) write(IMAIN,*) 'generating the final colors'
+!  first_elem_number_in_this_color(:) = -1
+!  call get_final_perm(color,perm,first_elem_number_in_this_color,nspec,nb_colors_inner_elements+nb_colors_outer_elements)
+!
+!  if(myrank == 0) write(IMAIN,*) 'done with mesh coloring and inner/outer element splitting'
+!
+!  end subroutine get_perm_color
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+!unused...
+!  subroutine get_color(adj,xadj,color,nspec,total_size_adj, &
+!                      is_on_a_slice_edge,ispec_is_d, &
+!                      nb_colors_outer_elements,nb_colors_inner_elements,nspec_outer)
+!
+!  integer, intent(in) :: nspec,total_size_adj
+!  integer, intent(in) :: adj(total_size_adj),xadj(nspec+1)
+!  integer :: color(nspec)
+!  integer :: this_color,nb_already_colored,ispec,ixadj,ok
+!  logical, dimension(nspec) :: is_on_a_slice_edge,ispec_is_d
+!  integer :: nb_colors_outer_elements,nb_colors_inner_elements,nspec_outer
+!  logical :: is_outer_element(nspec)
+!
+!  nspec_outer = 0
+!
+!  is_outer_element(:) = .false.
+!
+!  do ispec=1,nspec
+!    if(ispec_is_d(ispec)) then
+!      if (is_on_a_slice_edge(ispec)) then
+!        is_outer_element(ispec) = .true.
+!        nspec_outer=nspec_outer+1
+!      endif
+!    endif
+!  enddo
+!
+!  ! outer elements
+!  color(:) = 0
+!  this_color = 0
+!  nb_already_colored = 0
+!  do while(nb_already_colored<nspec_outer)
+!    this_color = this_color + 1
+!    do ispec = 1, nspec
+!      if(ispec_is_d(ispec)) then
+!        if (is_outer_element(ispec)) then
+!          if (color(ispec) == 0) then
+!            ok = 1
+!            do ixadj = xadj(ispec), (xadj(ispec+1)-1)
+!              if (is_outer_element(adj(ixadj)) .and. color(adj(ixadj)) == this_color) ok = 0
+!            enddo
+!            if (ok /= 0) then
+!              color(ispec) = this_color
+!              nb_already_colored = nb_already_colored + 1
+!            endif
+!          endif
+!        endif
+!      endif
+!    enddo
+!  enddo
+!  nb_colors_outer_elements = this_color
+!
+!  ! inner elements
+!  do while(nb_already_colored<nspec)
+!    this_color = this_color + 1
+!    do ispec = 1, nspec
+!      if(ispec_is_d(ispec)) then
+!        if (.not. is_outer_element(ispec)) then
+!          if (color(ispec) == 0) then
+!            ok = 1
+!            do ixadj = xadj(ispec), (xadj(ispec+1)-1)
+!              if (.not. is_outer_element(adj(ixadj)) .and. color(adj(ixadj)) == this_color) ok = 0
+!            enddo
+!            if (ok /= 0) then
+!              color(ispec) = this_color
+!              nb_already_colored = nb_already_colored + 1
+!            endif
+!          endif
+!        endif
+!      endif
+!    enddo
+!  enddo
+!
+!  nb_colors_inner_elements = this_color - nb_colors_outer_elements
+!
+!end subroutine get_color
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+!unused...
+!
+!!=======================================================================
+!!
+!!  Charbel Farhat's FEM topology routines
+!!
+!!  Dimitri Komatitsch, February 1996 - Code based on Farhat's original version from 1987
+!!
+!!  modified and adapted by Dimitri Komatitsch, May 2006
+!!
+!!=======================================================================
+!
+!  subroutine form_elt_connectivity_foelco(mn,mp,nspec,global_corner_number,&
+!                                          nglob_GLL_full,ibool,nglob_eight_corners_only)
+!
+!!-----------------------------------------------------------------------
+!!
+!!   Forms the MN and MP arrays
+!!
+!!     Input :
+!!     -------
+!!           ibool    Array needed to build the element connectivity table
+!!           nspec    Number of elements in the domain
+!!           NGNOD_HEXAHEDRA    number of nodes per hexahedron (brick with 8 corners)
+!!
+!!     Output :
+!!     --------
+!!           MN, MP   This is the element connectivity array pair.
+!!                    Array MN contains the list of the element
+!!                    connectivity, that is, the nodes contained in each
+!!                    element. They are stored in a stacked fashion.
+!!
+!!                    Pointer array MP stores the location of each
+!!                    element list. Its length is equal to the number
+!!                    of elements plus one.
+!!
+!!-----------------------------------------------------------------------
+!
+!  implicit none
+!
+!  include "constants.h"
+!  integer, parameter :: NGNOD_HEXAHEDRA = 8
+!
+!  integer nspec,nglob_GLL_full
+!
+!  ! arrays with mesh parameters per slice
+!  integer, intent(in), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+!
+!  ! global corner numbers that need to be created
+!  integer, intent(out), dimension(nglob_GLL_full) :: global_corner_number
+!  integer, intent(out) :: mn(nspec*NGNOD_HEXAHEDRA),mp(nspec+1)
+!  integer, intent(out) :: nglob_eight_corners_only
+!
+!  integer ninter,nsum,ispec,node,k,inumcorner,ix,iy,iz
+!
+!  ninter = 1
+!  nsum = 1
+!  mp(1) = 1
+!
+!  !---- define topology of the elements in the mesh
+!  !---- we need to define adjacent numbers from the sub-mesh consisting of the corners only
+!  nglob_eight_corners_only = 0
+!  global_corner_number(:) = -1
+!
+!  do ispec=1,nspec
+!
+!    inumcorner = 0
+!    do iz = 1,NGLLZ,NGLLZ-1
+!      do iy = 1,NGLLY,NGLLY-1
+!        do ix = 1,NGLLX,NGLLX-1
+!
+!          inumcorner = inumcorner + 1
+!          if(inumcorner > NGNOD_HEXAHEDRA) stop 'corner number too large'
+!
+!          ! check if this point was already assigned a number previously, otherwise create one and store it
+!          if(global_corner_number(ibool(ix,iy,iz,ispec)) == -1) then
+!            nglob_eight_corners_only = nglob_eight_corners_only + 1
+!            global_corner_number(ibool(ix,iy,iz,ispec)) = nglob_eight_corners_only
+!          endif
+!
+!          node = global_corner_number(ibool(ix,iy,iz,ispec))
+!            do k=nsum,ninter-1
+!              if(node == mn(k)) goto 200
+!            enddo
+!
+!            mn(ninter) = node
+!            ninter = ninter + 1
+!  200 continue
+!
+!        enddo
+!      enddo
+!    enddo
+!
+!      nsum = ninter
+!      mp(ispec + 1) = nsum
+!
+!  enddo
+!
+!  end subroutine form_elt_connectivity_foelco
+!
+!!
+!!-------------------------------------------------------------------------------------------------
+!!
+!
+!  subroutine form_node_connectivity_fonoco(mn,mp,ne,np,nglob_eight_corners_only,&
+!                                           nspec,count_only,total_size_ne)
+!
+!!-----------------------------------------------------------------------
+!!
+!!   Forms the NE and NP arrays
+!!
+!!     Input :
+!!     -------
+!!           MN, MP, nspec
+!!           nglob_eight_corners_only    Number of nodes in the domain
+!!
+!!     Output :
+!!     --------
+!!           NE, NP   This is the node-connected element array pair.
+!!                    Integer array NE contains a list of the
+!!                    elements connected to each node, stored in stacked fashion.
+!!
+!!                    Array NP is the pointer array for the
+!!                    location of a node's element list in the NE array.
+!!                    Its length is equal to the number of points plus one.
+!!
+!!-----------------------------------------------------------------------
+!
+!  implicit none
+!
+!  include "constants.h"
+!  integer, parameter :: NGNOD_HEXAHEDRA = 8
+!
+!  ! only count the total size of the array that will be created, or actually create it
+!  logical count_only
+!  integer total_size_ne
+!
+!  integer nglob_eight_corners_only,nspec
+!
+!  integer, intent(in) ::  mn(nspec*NGNOD_HEXAHEDRA),mp(nspec+1)
+!
+!  integer, intent(out) ::  ne(total_size_ne),np(nglob_eight_corners_only+1)
+!
+!  integer nsum,inode,ispec,j
+!
+!  nsum = 1
+!  np(1) = 1
+!
+!  do inode=1,nglob_eight_corners_only
+!      do 200 ispec=1,nspec
+!
+!            do j=mp(ispec),mp(ispec + 1) - 1
+!                  if (mn(j) == inode) then
+!                        if(count_only) then
+!                          total_size_ne = nsum
+!                        else
+!                          ne(nsum) = ispec
+!                        endif
+!                        nsum = nsum + 1
+!                        goto 200
+!                  endif
+!            enddo
+!  200 continue
+!
+!      np(inode + 1) = nsum
+!
+!  enddo
+!
+!  end subroutine form_node_connectivity_fonoco
+!
+!!
+!!-------------------------------------------------------------------------------------------------
+!!
+!
+!  !subroutine create_adjacency_table_adjncy(mn,mp,ne,np,adj,xadj,maskel,nspec,nglob_eight_corners_only,&
+!  !                                         count_only,total_size_ne,total_size_adj,face)
+!
+!  subroutine create_adjacency_table_adjncy(mn,mp,ne,np,adj,xadj,nspec,nglob_eight_corners_only,&
+!                                           count_only,total_size_ne,total_size_adj,face)
+!
+!!-----------------------------------------------------------------------
+!!
+!!   Establishes the element adjacency information of the mesh
+!!   Two elements are considered adjacent if they share a face.
+!!
+!!     Input :
+!!     -------
+!!           MN, MP, NE, NP, nspec
+!!           MASKEL    logical mask (length = nspec)
+!!
+!!     Output :
+!!     --------
+!!           ADJ, XADJ This is the element adjacency array pair. Array
+!!                     ADJ contains the list of the elements adjacent to
+!!                     element i. They are stored in a stacked fashion.
+!!                     Pointer array XADJ stores the location of each element list.
+!!
+!!-----------------------------------------------------------------------
+!
+!  implicit none
+!
+!  include "constants.h"
+!  integer, parameter :: NGNOD_HEXAHEDRA = 8
+!
+!  ! only count the total size of the array that will be created, or actually create it
+!  logical count_only,face
+!  integer total_size_ne,total_size_adj
+!
+!  integer nglob_eight_corners_only
+!
+!  integer, intent(in) :: mn(nspec*NGNOD_HEXAHEDRA),mp(nspec+1),ne(total_size_ne),np(nglob_eight_corners_only+1)
+!
+!  integer, intent(out) :: adj(total_size_adj),xadj(nspec+1)
+!
+!  logical maskel(nspec)
+!  integer countel(nspec)
+!
+!  integer nspec,iad,ispec,istart,istop,ino,node,jstart,jstop,nelem,jel
+!
+!  xadj(1) = 1
+!  iad = 1
+!
+!  do ispec=1,nspec
+!
+!  ! reset mask
+!  maskel(:) = .false.
+!
+!  ! mask current element
+!  maskel(ispec) = .true.
+!  if (face) countel(:) = 0
+!
+!  istart = mp(ispec)
+!  istop = mp(ispec+1) - 1
+!    do ino=istart,istop
+!      node = mn(ino)
+!      jstart = np(node)
+!      jstop = np(node + 1) - 1
+!        do 120 jel=jstart,jstop
+!            nelem = ne(jel)
+!            if(maskel(nelem)) goto 120
+!            if (face) then
+!              ! if 2 elements share at least 3 corners, therefore they share a face
+!              countel(nelem) = countel(nelem) + 1
+!              if (countel(nelem)>=3) then
+!                if(count_only) then
+!                  total_size_adj = iad
+!                else
+!                  adj(iad) = nelem
+!                endif
+!                maskel(nelem) = .true.
+!                iad = iad + 1
+!              endif
+!            else
+!              if(count_only) then
+!                total_size_adj = iad
+!              else
+!                adj(iad) = nelem
+!              endif
+!              maskel(nelem) = .true.
+!              iad = iad + 1
+!            endif
+!  120   continue
+!    enddo
+!
+!    xadj(ispec+1) = iad
+!
+!  enddo
+!
+!  end subroutine create_adjacency_table_adjncy
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+! PERMUTATIONS
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! implement permutation of elements for arrays of real (CUSTOM_REAL) type
+
+  subroutine permute_elements_real(array_to_permute,temp_array,perm,nspec)
+
+  implicit none
+
+  include "constants.h"
+
+  integer, intent(in) :: nspec
+  integer, intent(in), dimension(nspec) :: perm
+
+  real(kind=CUSTOM_REAL), intent(inout), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: &
+    array_to_permute,temp_array
+
+  integer old_ispec,new_ispec
+
+  ! copy the original array
+  temp_array(:,:,:,:) = array_to_permute(:,:,:,:)
+
+  do old_ispec = 1,nspec
+    new_ispec = perm(old_ispec)
+    array_to_permute(:,:,:,new_ispec) = temp_array(:,:,:,old_ispec)
+  enddo
+
+  end subroutine permute_elements_real
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! implement permutation of elements for arrays of integer type
+
+  subroutine permute_elements_integer(array_to_permute,temp_array,perm,nspec)
+
+  implicit none
+
+  include "constants.h"
+
+  integer, intent(in) :: nspec
+  integer, intent(in), dimension(nspec) :: perm
+
+  integer, intent(inout), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: &
+    array_to_permute,temp_array
+
+  integer old_ispec,new_ispec
+
+  ! copy the original array
+  temp_array(:,:,:,:) = array_to_permute(:,:,:,:)
+
+  do old_ispec = 1,nspec
+    new_ispec = perm(old_ispec)
+    array_to_permute(:,:,:,new_ispec) = temp_array(:,:,:,old_ispec)
+  enddo
+
+  end subroutine permute_elements_integer
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! implement permutation of elements for arrays of double precision type
+
+  subroutine permute_elements_dble(array_to_permute,temp_array,perm,nspec)
+
+  implicit none
+
+  include "constants.h"
+
+  integer, intent(in) :: nspec
+  integer, intent(in), dimension(nspec) :: perm
+
+  double precision, intent(inout), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: &
+    array_to_permute,temp_array
+
+  integer old_ispec,new_ispec
+
+  ! copy the original array
+  temp_array(:,:,:,:) = array_to_permute(:,:,:,:)
+
+  do old_ispec = 1,nspec
+    new_ispec = perm(old_ispec)
+    array_to_permute(:,:,:,new_ispec) = temp_array(:,:,:,old_ispec)
+  enddo
+
+  end subroutine permute_elements_dble
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! implement permutation of elements for arrays of double precision type
+
+  subroutine permute_elements_logical1D(array_to_permute,temp_array,perm,nspec)
+
+  implicit none
+
+  include "constants.h"
+
+  integer, intent(in) :: nspec
+  integer, intent(in), dimension(nspec) :: perm
+
+  logical, intent(inout), dimension(nspec) :: array_to_permute,temp_array
+
+  integer old_ispec,new_ispec
+
+  ! copy the original array
+  temp_array(:) = array_to_permute(:)
+
+  do old_ispec = 1,nspec
+    new_ispec = perm(old_ispec)
+    array_to_permute(new_ispec) = temp_array(old_ispec)
+  enddo
+
+  end subroutine permute_elements_logical1D

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/save_arrays_solver.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/save_arrays_solver.f90	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/save_arrays_solver.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -53,7 +53,15 @@
                     c22store,c23store,c24store,c25store,c26store,c33store, &
                     c34store,c35store,c36store,c44store,c45store,c46store, &
                     c55store,c56store,c66store, &
-                    ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic)
+                    ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
+                    ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic, &
+                    nspec_outer_acoustic,nspec_outer_elastic, &
+                    num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+                    num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+                    num_colors_outer_acoustic,num_colors_inner_acoustic, &
+                    num_elem_colors_acoustic, &
+                    num_colors_outer_elastic,num_colors_inner_elastic, &
+                    num_elem_colors_elastic)
 
   implicit none
 
@@ -127,6 +135,23 @@
 ! material domain flags
   logical, dimension(nspec) :: ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic
 
+! inner/outer elements
+  logical,dimension(nspec) :: ispec_is_inner
+  integer :: nspec_inner_acoustic,nspec_outer_acoustic
+  integer :: nspec_inner_elastic,nspec_outer_elastic
+  integer :: num_phase_ispec_acoustic
+  integer,dimension(num_phase_ispec_acoustic,2) :: phase_ispec_inner_acoustic
+  integer :: num_phase_ispec_elastic
+  integer,dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+  ! mesh coloring
+  integer :: num_colors_outer_acoustic,num_colors_inner_acoustic
+  integer, dimension(num_colors_outer_acoustic + num_colors_inner_acoustic) :: &
+    num_elem_colors_acoustic
+  integer :: num_colors_outer_elastic,num_colors_inner_elastic
+  integer, dimension(num_colors_outer_elastic + num_colors_inner_elastic) :: &
+    num_elem_colors_elastic
+
 ! local parameters
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: v_tmp
   integer,dimension(:),allocatable :: v_tmp_i
@@ -272,6 +297,33 @@
     write(IOUT) c66store
   endif
 
+! inner/outer elements
+  write(IOUT) ispec_is_inner
+
+  if( ACOUSTIC_SIMULATION ) then
+    write(IOUT) nspec_inner_acoustic,nspec_outer_acoustic
+    write(IOUT) num_phase_ispec_acoustic
+    if(num_phase_ispec_acoustic > 0 ) write(IOUT) phase_ispec_inner_acoustic
+  endif
+
+  if( ELASTIC_SIMULATION ) then
+    write(IOUT) nspec_inner_elastic,nspec_outer_elastic
+    write(IOUT) num_phase_ispec_elastic
+    if(num_phase_ispec_elastic > 0 ) write(IOUT) phase_ispec_inner_elastic
+  endif
+
+! mesh coloring
+  if( USE_MESH_COLORING_GPU ) then
+    if( ACOUSTIC_SIMULATION ) then
+      write(IOUT) num_colors_outer_acoustic,num_colors_inner_acoustic
+      write(IOUT) num_elem_colors_acoustic
+    endif
+    if( ELASTIC_SIMULATION ) then
+      write(IOUT) num_colors_outer_elastic,num_colors_inner_elastic
+      write(IOUT) num_elem_colors_elastic
+    endif
+  endif
+
   close(IOUT)
 
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -83,60 +83,66 @@
   integer, dimension(8) ::  nspec_interface
 
 
-  open(unit=15,file=prname(1:len_trim(prname))//'Database',status='unknown',action='write',form='formatted')
+  !open(unit=15,file=prname(1:len_trim(prname))//'Database',status='unknown',action='write',form='formatted')
+  open(unit=15,file=prname(1:len_trim(prname))//'Database', &
+        status='unknown',action='write',form='unformatted')
 
-  write(15,*) nglob
+  write(15) nglob
   do iglob=1,nglob
-     write(15,*) iglob,nodes_coords(iglob,1),nodes_coords(iglob,2),nodes_coords(iglob,3)
+     write(15) iglob,nodes_coords(iglob,1),nodes_coords(iglob,2),nodes_coords(iglob,3)
   end do
 
 
 ! Materials properties
-   write(15,*) NMATERIALS, 0
+   write(15) NMATERIALS, 0
    do idoubl = 1,NMATERIALS
       !write(15,*) material_properties(idoubl,:)
       matpropl(:) = material_properties(idoubl,:)
-      write(15,*) matpropl
+      write(15) matpropl
    end do
 
 
-  write(15,*) nspec
+  write(15) nspec
   do ispec=1,nspec
-      write(15,'(11i14)') ispec,true_material_num(ispec),1,ibool(1,1,1,ispec),ibool(2,1,1,ispec),&
+      !write(15,'(11i14)') ispec,true_material_num(ispec),1,ibool(1,1,1,ispec),ibool(2,1,1,ispec),&
+      !     ibool(2,2,1,ispec),ibool(1,2,1,ispec),ibool(1,1,2,ispec),&
+      !     ibool(2,1,2,ispec),ibool(2,2,2,ispec),ibool(1,2,2,ispec)
+      write(15) ispec,true_material_num(ispec),1,ibool(1,1,1,ispec),ibool(2,1,1,ispec),&
            ibool(2,2,1,ispec),ibool(1,2,1,ispec),ibool(1,1,2,ispec),&
            ibool(2,1,2,ispec),ibool(2,2,2,ispec),ibool(1,2,2,ispec)
+
   end do
 
   ! Boundaries
-  write(15,*) 1,nspec2D_xmin
-  write(15,*) 2,nspec2D_xmax
-  write(15,*) 3,nspec2D_ymin
-  write(15,*) 4,nspec2D_ymax
-  write(15,*) 5,NSPEC2D_BOTTOM
-  write(15,*) 6,NSPEC2D_TOP
+  write(15) 1,nspec2D_xmin
+  write(15) 2,nspec2D_xmax
+  write(15) 3,nspec2D_ymin
+  write(15) 4,nspec2D_ymax
+  write(15) 5,NSPEC2D_BOTTOM
+  write(15) 6,NSPEC2D_TOP
 
   do i=1,nspec2D_xmin
-     write(15,*) ibelm_xmin(i),ibool(1,1,1,ibelm_xmin(i)),ibool(1,NGLLY,1,ibelm_xmin(i)),&
+     write(15) ibelm_xmin(i),ibool(1,1,1,ibelm_xmin(i)),ibool(1,NGLLY,1,ibelm_xmin(i)),&
           ibool(1,1,NGLLZ,ibelm_xmin(i)),ibool(1,NGLLY,NGLLZ,ibelm_xmin(i))
   end do
   do i=1,nspec2D_xmax
-     write(15,*) ibelm_xmax(i),ibool(NGLLX,1,1,ibelm_xmax(i)),ibool(NGLLX,NGLLY,1,ibelm_xmax(i)), &
+     write(15) ibelm_xmax(i),ibool(NGLLX,1,1,ibelm_xmax(i)),ibool(NGLLX,NGLLY,1,ibelm_xmax(i)), &
           ibool(NGLLX,1,NGLLZ,ibelm_xmax(i)),ibool(NGLLX,NGLLY,NGLLZ,ibelm_xmax(i))
   end do
   do i=1,nspec2D_ymin
-     write(15,*) ibelm_ymin(i),ibool(1,1,1,ibelm_ymin(i)),ibool(NGLLX,1,1,ibelm_ymin(i)),&
+     write(15) ibelm_ymin(i),ibool(1,1,1,ibelm_ymin(i)),ibool(NGLLX,1,1,ibelm_ymin(i)),&
           ibool(1,1,NGLLZ,ibelm_ymin(i)),ibool(NGLLX,1,NGLLZ,ibelm_ymin(i))
   end do
   do i=1,nspec2D_ymax
-     write(15,*) ibelm_ymax(i),ibool(NGLLX,NGLLY,1,ibelm_ymax(i)),ibool(1,NGLLY,1,ibelm_ymax(i)), &
+     write(15) ibelm_ymax(i),ibool(NGLLX,NGLLY,1,ibelm_ymax(i)),ibool(1,NGLLY,1,ibelm_ymax(i)), &
           ibool(NGLLX,NGLLY,NGLLZ,ibelm_ymax(i)),ibool(1,NGLLY,NGLLZ,ibelm_ymax(i))
   end do
   do i=1,NSPEC2D_BOTTOM
-     write(15,*) ibelm_bottom(i),ibool(1,1,1,ibelm_bottom(i)),ibool(NGLLX,1,1,ibelm_bottom(i)), &
+     write(15) ibelm_bottom(i),ibool(1,1,1,ibelm_bottom(i)),ibool(NGLLX,1,1,ibelm_bottom(i)), &
           ibool(NGLLX,NGLLY,1,ibelm_bottom(i)),ibool(1,NGLLY,1,ibelm_bottom(i))
   end do
   do i=1,NSPEC2D_TOP
-     write(15,*) ibelm_top(i),ibool(1,1,NGLLZ,ibelm_top(i)),ibool(NGLLX,1,NGLLZ,ibelm_top(i)), &
+     write(15) ibelm_top(i),ibool(1,1,NGLLZ,ibelm_top(i)),ibool(NGLLX,1,NGLLZ,ibelm_top(i)), &
           ibool(NGLLX,NGLLY,NGLLZ,ibelm_top(i)),ibool(1,NGLLY,NGLLZ,ibelm_top(i))
   end do
 
@@ -194,79 +200,79 @@
 
   nspec_interfaces_max = maxval(nspec_interface)
 
-  write(15,*) nb_interfaces,nspec_interfaces_max
+  write(15) nb_interfaces,nspec_interfaces_max
 
   if(interfaces(W)) then
-     write(15,*) addressing(iproc_xi-1,iproc_eta),nspec_interface(W)
+     write(15) addressing(iproc_xi-1,iproc_eta),nspec_interface(W)
      do ispec = 1,nspec
-        if(iMPIcut_xi(1,ispec))  write(15,*) ispec,4,ibool(1,1,1,ispec),ibool(1,2,1,ispec), &
+        if(iMPIcut_xi(1,ispec))  write(15) ispec,4,ibool(1,1,1,ispec),ibool(1,2,1,ispec), &
              ibool(1,1,2,ispec),ibool(1,2,2,ispec)
      end do
   end if
 
   if(interfaces(E)) then
-     write(15,*) addressing(iproc_xi+1,iproc_eta),nspec_interface(E)
+     write(15) addressing(iproc_xi+1,iproc_eta),nspec_interface(E)
      do ispec = 1,nspec
-        if(iMPIcut_xi(2,ispec))  write(15,*) ispec,4,ibool(2,1,1,ispec),ibool(2,2,1,ispec), &
+        if(iMPIcut_xi(2,ispec))  write(15) ispec,4,ibool(2,1,1,ispec),ibool(2,2,1,ispec), &
              ibool(2,1,2,ispec),ibool(2,2,2,ispec)
      end do
   end if
 
    if(interfaces(S)) then
-     write(15,*) addressing(iproc_xi,iproc_eta-1),nspec_interface(S)
+     write(15) addressing(iproc_xi,iproc_eta-1),nspec_interface(S)
      do ispec = 1,nspec
-        if(iMPIcut_eta(1,ispec))  write(15,*) ispec,4,ibool(1,1,1,ispec),ibool(2,1,1,ispec), &
+        if(iMPIcut_eta(1,ispec))  write(15) ispec,4,ibool(1,1,1,ispec),ibool(2,1,1,ispec), &
              ibool(1,1,2,ispec),ibool(2,1,2,ispec)
      end do
   end if
 
   if(interfaces(N)) then
-     write(15,*) addressing(iproc_xi,iproc_eta+1),nspec_interface(N)
+     write(15) addressing(iproc_xi,iproc_eta+1),nspec_interface(N)
      do ispec = 1,nspec
-        if(iMPIcut_eta(2,ispec))  write(15,*) ispec,4,ibool(2,2,1,ispec),ibool(1,2,1,ispec), &
+        if(iMPIcut_eta(2,ispec))  write(15) ispec,4,ibool(2,2,1,ispec),ibool(1,2,1,ispec), &
              ibool(2,2,2,ispec),ibool(1,2,2,ispec)
      end do
   end if
 
   if(interfaces(NW)) then
-     write(15,*) addressing(iproc_xi-1,iproc_eta+1),nspec_interface(NW)
+     write(15) addressing(iproc_xi-1,iproc_eta+1),nspec_interface(NW)
      do ispec = 1,nspec
         if((iMPIcut_xi(1,ispec) .eqv. .true.) .and. (iMPIcut_eta(2,ispec) .eqv. .true.))  then
-           write(15,*) ispec,2,ibool(1,2,1,ispec),ibool(1,2,2,ispec),-1,-1
+           write(15) ispec,2,ibool(1,2,1,ispec),ibool(1,2,2,ispec),-1,-1
         end if
      end do
   end if
 
   if(interfaces(NE)) then
-     write(15,*) addressing(iproc_xi+1,iproc_eta+1),nspec_interface(NE)
+     write(15) addressing(iproc_xi+1,iproc_eta+1),nspec_interface(NE)
      do ispec = 1,nspec
         if((iMPIcut_xi(2,ispec) .eqv. .true.) .and. (iMPIcut_eta(2,ispec) .eqv. .true.))  then
-           write(15,*) ispec,2,ibool(2,2,1,ispec),ibool(2,2,2,ispec),-1,-1
+           write(15) ispec,2,ibool(2,2,1,ispec),ibool(2,2,2,ispec),-1,-1
         end if
      end do
   end if
 
   if(interfaces(SE)) then
-     write(15,*) addressing(iproc_xi+1,iproc_eta-1),nspec_interface(SE)
+     write(15) addressing(iproc_xi+1,iproc_eta-1),nspec_interface(SE)
      do ispec = 1,nspec
         if((iMPIcut_xi(2,ispec) .eqv. .true.) .and. (iMPIcut_eta(1,ispec) .eqv. .true.))  then
-           write(15,*) ispec,2,ibool(2,1,1,ispec),ibool(2,1,2,ispec),-1,-1
+           write(15) ispec,2,ibool(2,1,1,ispec),ibool(2,1,2,ispec),-1,-1
         end if
      end do
   end if
 
   if(interfaces(SW)) then
-     write(15,*) addressing(iproc_xi-1,iproc_eta-1),nspec_interface(SW)
+     write(15) addressing(iproc_xi-1,iproc_eta-1),nspec_interface(SW)
      do ispec = 1,nspec
         if((iMPIcut_xi(1,ispec) .eqv. .true.) .and. (iMPIcut_eta(1,ispec) .eqv. .true.))  then
-           write(15,*) ispec,2,ibool(1,1,1,ispec),ibool(1,1,2,ispec),-1,-1
+           write(15) ispec,2,ibool(1,1,1,ispec),ibool(1,1,2,ispec),-1,-1
         end if
      end do
   end if
 
   else
 
-     write(15,*) 0,0
+     write(15) 0,0
 
   end if
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in	2011-11-03 03:11:45 UTC (rev 19145)
@@ -256,7 +256,6 @@
 ! add mesh coloring for the GPU + MPI implementation
   logical, parameter :: USE_MESH_COLORING_GPU = .false.
   integer, parameter :: MAX_NUMBER_OF_COLORS = 10000
-  integer, parameter :: NGNOD_HEXAHEDRA = 8
 
 !------------------------------------------------------
 !----------- do not modify anything below -------------

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_VTK_data.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_VTK_data.f90	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_VTK_data.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -38,17 +38,20 @@
 
   integer :: nspec,nglob
 
-! global coordinates
+  ! global coordinates
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
   real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy
 
-! element flag array
+  ! element flag array
   integer, dimension(nspec) :: elem_flag
+
+  ! file name
+  character(len=256) :: prname_file
+  !character(len=2), optional, intent(in) :: str_id
+
+  ! local parameters
   integer :: ispec,i
 
-! file name
-  character(len=256) prname_file
-
 ! write source and receiver VTK files for Paraview
   !debug
   !write(IMAIN,*) '  vtk file: '
@@ -79,6 +82,11 @@
   write(IOVTK,*) ""
 
   write(IOVTK,'(a,i12)') "CELL_DATA ",nspec
+  !if( present( str_id ) ) then
+  !  write(IOVTK,'(a)') "SCALARS elem_flag_"//str_id//" integer"
+  !else
+  !  write(IOVTK,'(a)') "SCALARS elem_flag integer"
+  !endif
   write(IOVTK,'(a)') "SCALARS elem_flag integer"
   write(IOVTK,'(a)') "LOOKUP_TABLE default"
   do ispec = 1,nspec

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -65,7 +65,7 @@
   logical:: phase_is_inner
 
 
-! enforces free surface (zeroes potentials at free surface)
+  ! enforces free surface (zeroes potentials at free surface)
   if(.NOT. GPU_MODE) then
     ! on CPU
     call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_AB, &
@@ -110,17 +110,17 @@
                               potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
   endif
 
-! distinguishes two runs: for points on MPI interfaces, and points within the partitions
+  ! distinguishes two runs: for elements on MPI interfaces, and elements within the partitions
   do iphase=1,2
 
-    !first for points on MPI interfaces
+    !first for points on MPI interfaces, thus outer elements
     if( iphase == 1 ) then
       phase_is_inner = .false.
     else
       phase_is_inner = .true.
     endif
 
-! acoustic pressure term
+    ! acoustic pressure term
     if(.NOT. GPU_MODE) then
       ! on CPU
       call compute_forces_acoustic_pot( iphase, NSPEC_AB,NGLOB_AB, &
@@ -184,7 +184,7 @@
 
     endif ! PML
 
-! absorbing boundaries
+    ! absorbing boundaries
     if(ABSORBING_CONDITIONS) then
       if( PML .and. PML_USE_SOMMERFELD ) then
         ! adds a Sommerfeld condition on the domain's absorbing boundaries
@@ -215,7 +215,7 @@
       endif
     endif
 
-! elastic coupling
+    ! elastic coupling
     if(ELASTIC_SIMULATION ) then
       if( .NOT. GPU_MODE ) then
         call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, &
@@ -243,13 +243,13 @@
       endif
     endif
 
-! poroelastic coupling
-! not implemented yet
+    ! poroelastic coupling
+    ! not implemented yet
     if(POROELASTIC_SIMULATION ) &
       !  call compute_coupling_acoustic_poro()
       call exit_MPI(myrank,'poroelastic coupling with acoustic domain not implemented yet!')
 
-! sources
+    ! sources
     call compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acoustic, &
                         ibool,ispec_is_inner,phase_is_inner, &
                         NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,&
@@ -262,7 +262,7 @@
                         NTSTEP_BETWEEN_READ_ADJSRC, &
                         GPU_MODE, Mesh_pointer)
 
-! assemble all the contributions between slices using MPI
+    ! assemble all the contributions between slices using MPI
     if( phase_is_inner .eqv. .false. ) then
       ! sends potential_dot_dot_acoustic values to corresponding MPI interface neighbors (non-blocking)
       if(.NOT. GPU_MODE) then

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/create_color_image.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/create_color_image.f90	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/create_color_image.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -37,7 +37,7 @@
   !   type = 2 : velocity V_y component
   !   type = 3 : velocity V_z component
   !   type = 4 : velocity V norm
-  integer,parameter:: IMAGE_TYPE = 2
+  integer,parameter:: IMAGE_TYPE = 4
 
   ! cross-section surface
   ! cross-section origin point

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -778,7 +778,9 @@
                                   sourcearrays, islice_selected_source, ispec_selected_source, &
                                   number_receiver_global, ispec_selected_rec, &
                                   nrec, nrec_local, &
-                                  SIMULATION_TYPE,ncuda_devices)
+                                  SIMULATION_TYPE, &
+                                  USE_MESH_COLORING_GPU,nspec_acoustic,nspec_elastic, &
+                                  ncuda_devices)
 
   call min_all_i(ncuda_devices,ncuda_devices_min)
   call max_all_i(ncuda_devices,ncuda_devices_max)
@@ -793,7 +795,9 @@
                                   ABSORBING_CONDITIONS,b_reclen_potential,b_absorb_potential, &
                                   ELASTIC_SIMULATION, num_coupling_ac_el_faces, &
                                   coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                                  coupling_ac_el_normal,coupling_ac_el_jacobian2Dw)
+                                  coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
+                                  num_colors_outer_acoustic,num_colors_inner_acoustic, &
+                                  num_elem_colors_acoustic)
 
     if( SIMULATION_TYPE == 3 ) &
       call prepare_fields_acoustic_adj_dev(Mesh_pointer, &
@@ -822,7 +826,9 @@
                                   NOISE_TOMOGRAPHY, &
                                   free_surface_normal,free_surface_ispec,free_surface_ijk, &
                                   num_free_surface_faces, &
-                                  ACOUSTIC_SIMULATION)
+                                  ACOUSTIC_SIMULATION, &
+                                  num_colors_outer_elastic,num_colors_inner_elastic, &
+                                  num_elem_colors_elastic)
 
     if( SIMULATION_TYPE == 3 ) &
       call prepare_fields_elastic_adj_dev(Mesh_pointer, NDIM*NGLOB_AB, &

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -76,6 +76,8 @@
   read(27) ispec_is_poroelastic
 
   ! acoustic
+  ! number of acoustic elements in this partition
+  nspec_acoustic = count(ispec_is_acoustic(:))
   ! all processes will have acoustic_simulation set if any flag is .true.
   call any_all_l( ANY(ispec_is_acoustic), ACOUSTIC_SIMULATION )
   if( ACOUSTIC_SIMULATION ) then
@@ -98,6 +100,9 @@
   endif
 
   ! elastic
+  ! number of elastic elements in this partition
+  nspec_elastic = count(ispec_is_elastic(:))
+  ! elastic simulation
   call any_all_l( ANY(ispec_is_elastic), ELASTIC_SIMULATION )
   if( ELASTIC_SIMULATION ) then
     ! displacement,velocity,acceleration
@@ -292,8 +297,68 @@
     read(27) c66store
   endif
 
+! inner / outer elements
+  allocate(ispec_is_inner(NSPEC_AB),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array ispec_is_inner'
+  read(27) ispec_is_inner
+
+  if( ACOUSTIC_SIMULATION ) then
+    read(27) nspec_inner_acoustic,nspec_outer_acoustic
+    read(27) num_phase_ispec_acoustic
+    if( num_phase_ispec_acoustic < 0 ) stop 'error acoustic simulation: num_phase_ispec_acoustic is < zero'
+    allocate( phase_ispec_inner_acoustic(num_phase_ispec_acoustic,2),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array phase_ispec_inner_acoustic'
+    if(num_phase_ispec_acoustic > 0 ) read(27) phase_ispec_inner_acoustic
+  endif
+
+  if( ELASTIC_SIMULATION ) then
+    read(27) nspec_inner_elastic,nspec_outer_elastic
+    read(27) num_phase_ispec_elastic
+    if( num_phase_ispec_elastic < 0 ) stop 'error elastic simulation: num_phase_ispec_elastic is < zero'
+    allocate( phase_ispec_inner_elastic(num_phase_ispec_elastic,2),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array phase_ispec_inner_elastic'
+    if(num_phase_ispec_elastic > 0 ) read(27) phase_ispec_inner_elastic
+  endif
+
+! mesh coloring for GPUs
+  if( USE_MESH_COLORING_GPU ) then
+    ! acoustic domain colors
+    if( ACOUSTIC_SIMULATION ) then
+      read(27) num_colors_outer_acoustic,num_colors_inner_acoustic
+
+      allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array'
+
+      read(27) num_elem_colors_acoustic
+    endif
+    ! elastic domain colors
+    if( ELASTIC_SIMULATION ) then
+      read(27) num_colors_outer_elastic,num_colors_inner_elastic
+
+      allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array'
+
+      read(27) num_elem_colors_elastic
+    endif
+  else
+    ! allocates dummy arrays
+    if( ACOUSTIC_SIMULATION ) then
+      num_colors_outer_acoustic = 0
+      num_colors_inner_acoustic = 0
+      allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array'
+    endif
+    if( ELASTIC_SIMULATION ) then
+      num_colors_outer_elastic = 0
+      num_colors_inner_elastic = 0
+      allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array'
+    endif
+  endif
+
   close(27)
 
+  ! outputs total element numbers
   call sum_all_i(count(ispec_is_acoustic(:)),inum)
   if( myrank == 0 ) then
     write(IMAIN,*) 'total acoustic elements:',inum
@@ -320,13 +385,7 @@
     request_recv_scalar_ext_mesh(num_interfaces_ext_mesh),stat=ier)
   if( ier /= 0 ) stop 'error allocating array buffer_send_vector_ext_mesh etc.'
 
-! locate inner and outer elements
-  call rmd_setup_inner_outer_elemnts()
-
-!daniel: TODO -- mesh coloring
-!  call rmd_setup_color_perm()
-
-! gets model dimensions
+  ! gets model dimensions
   minl = minval( xstore )
   maxl = maxval( xstore )
   call min_all_all_cr(minl,min_all)
@@ -341,7 +400,7 @@
   LATITUDE_MIN = min_all
   LATITUDE_MAX = max_all
 
-! check courant criteria on mesh
+  ! checks courant criteria on mesh
   if( ELASTIC_SIMULATION ) then
     call check_mesh_resolution(myrank,NSPEC_AB,NGLOB_AB, &
                               ibool,xstore,ystore,zstore, &
@@ -366,437 +425,11 @@
 
   end subroutine read_mesh_databases
 
-!
-!-------------------------------------------------------------------------------------------------
-!
-  subroutine rmd_setup_inner_outer_elemnts()
 
-  use specfem_par
-  use specfem_par_elastic
-  use specfem_par_acoustic
-  implicit none
-  ! local parameters
-  integer :: i,j,k,ispec,iglob
-  integer :: iinterface,ier
-  character(len=256) :: filename
-  logical,dimension(:),allocatable :: iglob_is_inner
-  real :: percentage_edge
-
-  ! allocates arrays
-  allocate(ispec_is_inner(NSPEC_AB),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array ispec_is_inner'
-  allocate(iglob_is_inner(NGLOB_AB),stat=ier)
-  if( ier /= 0 ) stop 'error allocating temporary array  iglob_is_inner'
-
-  ! initialize flags
-  ispec_is_inner(:) = .true.
-  iglob_is_inner(:) = .true.
-  do iinterface = 1, num_interfaces_ext_mesh
-    do i = 1, nibool_interfaces_ext_mesh(iinterface)
-      iglob = ibool_interfaces_ext_mesh(i,iinterface)
-      iglob_is_inner(iglob) = .false.
-    enddo
-  enddo
-
-  ! determines flags for inner elements (purely inside the partition)
-  do ispec = 1, NSPEC_AB
-    do k = 1, NGLLZ
-      do j = 1, NGLLY
-        do i = 1, NGLLX
-          iglob = ibool(i,j,k,ispec)
-          ispec_is_inner(ispec) = ( iglob_is_inner(iglob) .and. ispec_is_inner(ispec) )
-        enddo
-      enddo
-    enddo
-  enddo
-
-  ! frees temporary array
-  deallocate( iglob_is_inner )
-
-  if( SAVE_MESH_FILES ) then
-    filename = prname(1:len_trim(prname))//'ispec_is_inner'
-    call write_VTK_data_elem_l(NSPEC_AB,NGLOB_AB, &
-                        xstore,ystore,zstore,ibool, &
-                        ispec_is_inner,filename)
-  endif
-
-  ! sets up elements for loops in acoustic simulations
-  if( ACOUSTIC_SIMULATION ) then
-    ! counts inner and outer elements
-    nspec_inner_acoustic = 0
-    nspec_outer_acoustic = 0
-    do ispec = 1, NSPEC_AB
-      if( ispec_is_acoustic(ispec) ) then
-        if( ispec_is_inner(ispec) .eqv. .true. ) then
-          nspec_inner_acoustic = nspec_inner_acoustic + 1
-        else
-          nspec_outer_acoustic = nspec_outer_acoustic + 1
-        endif
-      endif
-    enddo
-
-    ! stores indices of inner and outer elements for faster(?) computation
-    num_phase_ispec_acoustic = max(nspec_inner_acoustic,nspec_outer_acoustic)
-    allocate( phase_ispec_inner_acoustic(num_phase_ispec_acoustic,2),stat=ier)
-    phase_ispec_inner_acoustic(:,:) = 0
-
-    if( ier /= 0 ) stop 'error allocating array phase_ispec_inner_acoustic'
-    nspec_inner_acoustic = 0
-    nspec_outer_acoustic = 0
-    do ispec = 1, NSPEC_AB
-      if( ispec_is_acoustic(ispec) ) then
-        if( ispec_is_inner(ispec) .eqv. .true. ) then
-          nspec_inner_acoustic = nspec_inner_acoustic + 1
-          phase_ispec_inner_acoustic(nspec_inner_acoustic,2) = ispec
-        else
-          nspec_outer_acoustic = nspec_outer_acoustic + 1
-          phase_ispec_inner_acoustic(nspec_outer_acoustic,1) = ispec
-        endif
-      endif
-    enddo
-    !print *,'rank ',myrank,' acoustic inner spec: ',nspec_inner_acoustic
-    !print *,'rank ',myrank,' acoustic outer spec: ',nspec_outer_acoustic
-  endif
-
-  ! sets up elements for loops in acoustic simulations
-  if( ELASTIC_SIMULATION ) then
-    ! counts inner and outer elements
-    nspec_inner_elastic = 0
-    nspec_outer_elastic = 0
-    do ispec = 1, NSPEC_AB
-      if( ispec_is_elastic(ispec) ) then
-        if( ispec_is_inner(ispec) .eqv. .true. ) then
-          nspec_inner_elastic = nspec_inner_elastic + 1
-        else
-          nspec_outer_elastic = nspec_outer_elastic + 1
-        endif
-      endif
-    enddo
-
-    ! stores indices of inner and outer elements for faster(?) computation
-    num_phase_ispec_elastic = max(nspec_inner_elastic,nspec_outer_elastic)
-    if( num_phase_ispec_elastic <= 0 ) stop 'error elastic simulation: num_phase_ispec_elastic is zero'
-
-    allocate( phase_ispec_inner_elastic(num_phase_ispec_elastic,2),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array phase_ispec_inner_elastic'
-    phase_ispec_inner_elastic(:,:) = 0
-
-    nspec_inner_elastic = 0
-    nspec_outer_elastic = 0
-    do ispec = 1, NSPEC_AB
-      if( ispec_is_elastic(ispec) ) then
-        if( ispec_is_inner(ispec) .eqv. .true. ) then
-          nspec_inner_elastic = nspec_inner_elastic + 1
-          phase_ispec_inner_elastic(nspec_inner_elastic,2) = ispec
-        else
-          nspec_outer_elastic = nspec_outer_elastic + 1
-          phase_ispec_inner_elastic(nspec_outer_elastic,1) = ispec
-        endif
-      endif
-    enddo
-    !print *,'rank ',myrank,' elastic inner spec: ',nspec_inner_elastic
-    !print *,'rank ',myrank,' elastic outer spec: ',nspec_outer_elastic
-  endif
-
-  if(myrank == 0) then
-    percentage_edge = 100.*count(ispec_is_inner(:))/real(NSPEC_AB)
-    write(IMAIN,*) 'for overlapping of communications with calculations:'
-    write(IMAIN,*) '  percentage of   edge elements ',100. -percentage_edge,'%'
-    write(IMAIN,*) '  percentage of volume elements ',percentage_edge,'%'
-    write(IMAIN,*)
-  endif
-
-
-  end subroutine rmd_setup_inner_outer_elemnts
-
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-!daniel: TODO -- mesh coloring
-  subroutine rmd_setup_color_perm()
-
-  use specfem_par
-  use specfem_par_elastic
-  use specfem_par_acoustic
-  implicit none
-  ! local parameters
-  ! added for color permutation
-  integer :: nb_colors_outer_elements,nb_colors_inner_elements
-  integer, dimension(:), allocatable :: perm
-  integer, dimension(:), allocatable :: first_elem_number_in_this_color
-  integer, dimension(:), allocatable :: num_of_elems_in_this_color
-
-  integer :: icolor,ispec,ispec_counter
-  integer :: nspec_outer,nspec_outer_min_global,nspec_outer_max_global
-
-  integer :: ispec_inner_acoustic,ispec_outer_acoustic
-  integer :: ispec_inner_elastic,ispec_outer_elastic
-  integer :: nspec,nglob
-
-  ! added for sorting
-  integer, dimension(:,:,:,:), allocatable :: temp_array_int
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: temp_array_real
-  logical, dimension(:), allocatable :: temp_array_logical_1D
-  !integer, dimension(:), allocatable :: temp_array_int_1D
-
-  nspec = NSPEC_AB
-  nglob = NGLOB_AB
-
-  !!!! David Michea: detection of the edges, coloring and permutation separately
-  allocate(perm(nspec))
-
-  ! implement mesh coloring for GPUs if needed, to create subsets of disconnected elements
-  ! to remove dependencies and the need for atomic operations in the sum of
-  ! elemental contributions in the solver
-  if(USE_MESH_COLORING_GPU) then
-
-    allocate(first_elem_number_in_this_color(MAX_NUMBER_OF_COLORS + 1))
-
-    call get_perm_color_faster(ispec_is_inner,ibool,perm,nspec,nglob, &
-                              nb_colors_outer_elements,nb_colors_inner_elements, &
-                              nspec_outer,first_elem_number_in_this_color,myrank)
-
-    ! for the last color, the next color is fictitious and its first (fictitious) element number is nspec + 1
-    first_elem_number_in_this_color(nb_colors_outer_elements + nb_colors_inner_elements + 1) = nspec + 1
-
-    allocate(num_of_elems_in_this_color(nb_colors_outer_elements + nb_colors_inner_elements))
-
-    ! save mesh coloring
-    open(unit=99,file=prname(1:len_trim(prname))//'num_of_elems_in_this_color.dat',status='unknown')
-
-    ! number of colors for outer elements
-    write(99,*) nb_colors_outer_elements
-
-    ! number of colors for inner elements
-    write(99,*) nb_colors_inner_elements
-
-    ! number of elements in each color
-    do icolor = 1, nb_colors_outer_elements + nb_colors_inner_elements
-      num_of_elems_in_this_color(icolor) = first_elem_number_in_this_color(icolor+1) - first_elem_number_in_this_color(icolor)
-      write(99,*) num_of_elems_in_this_color(icolor)
-    enddo
-    close(99)
-
-    ! check that the sum of all the numbers of elements found in each color is equal
-    ! to the total number of elements in the mesh
-    if(sum(num_of_elems_in_this_color) /= nspec) then
-      print *,'nspec = ',nspec
-      print *,'total number of elements in all the colors of the mesh = ',sum(num_of_elems_in_this_color)
-      call exit_MPI(myrank, 'incorrect total number of elements in all the colors of the mesh')
-    endif
-
-    ! check that the sum of all the numbers of elements found in each color for the outer elements is equal
-    ! to the total number of outer elements found in the mesh
-    if(sum(num_of_elems_in_this_color(1:nb_colors_outer_elements)) /= nspec_outer) then
-      print *,'nspec_outer = ',nspec_outer
-      print *,'total number of elements in all the colors of the mesh for outer elements = ',sum(num_of_elems_in_this_color)
-      call exit_MPI(myrank, 'incorrect total number of elements in all the colors of the mesh for outer elements')
-    endif
-
-    deallocate(first_elem_number_in_this_color)
-    deallocate(num_of_elems_in_this_color)
-
-  else
-
-!! DK DK for regular C + MPI version for CPUs: do not use colors but nonetheless put all the outer elements
-!! DK DK first in order to be able to overlap non-blocking MPI communications with calculations
-
-!! DK DK nov 2010, for Rosa Badia / StarSs:
-!! no need for mesh coloring, but need to implement inner/outer subsets for non blocking MPI for StarSs
-    ispec_counter = 0
-    perm(:) = 0
-
-    ! first generate all the outer elements
-    do ispec = 1,nspec
-      if( ispec_is_inner(ispec) .eqv. .false.) then
-        ispec_counter = ispec_counter + 1
-        perm(ispec) = ispec_counter
-      endif
-    enddo
-
-    ! make sure we have detected some outer elements
-    !if(ispec_counter <= 0) call exit_MPI(myrank, 'fatal error: no outer elements detected!')
-
-    ! store total number of outer elements
-    nspec_outer = ispec_counter
-
-    ! then generate all the inner elements
-    do ispec = 1,nspec
-      if( ispec_is_inner(ispec) .eqv. .true.) then
-        ispec_counter = ispec_counter + 1
-        perm(ispec) = ispec_counter - nspec_outer ! starts again at 1
-      endif
-    enddo
-
-    ! test that all the elements have been used once and only once
-    if(ispec_counter /= nspec) call exit_MPI(myrank, 'fatal error: ispec_counter not equal to nspec')
-
-    ! do basic checks
-    if(minval(perm) /= 1) call exit_MPI(myrank, 'minval(perm) should be 1')
-    !if(maxval(perm) /= nspec) call exit_MPI(myrank, 'maxval(perm) should be nspec')
-
-
-  endif
-
-  ! sets up elements for loops in acoustic simulations
-  if( ACOUSTIC_SIMULATION ) then
-    ispec_inner_acoustic = 0
-    ispec_outer_acoustic = 0
-    do ispec = 1, NSPEC_AB
-      if( ispec_is_acoustic(ispec) ) then
-        if( ispec_is_inner(ispec) .eqv. .true. ) then
-          !ispec_inner_acoustic = ispec_inner_acoustic + 1
-          ispec_inner_acoustic = perm(ispec)
-          if( ispec_inner_acoustic < 1 .or. ispec_inner_acoustic > num_phase_ispec_acoustic ) &
-            call exit_MPI(myrank,'error inner acoustic permutation')
-
-          phase_ispec_inner_acoustic(ispec_inner_acoustic,2) = ispec
-
-        else
-          !ispec_outer_acoustic = ispec_outer_acoustic + 1
-          ispec_outer_acoustic = perm(ispec)
-          if( ispec_outer_acoustic < 1 .or. ispec_outer_acoustic > num_phase_ispec_acoustic ) &
-            call exit_MPI(myrank,'error outer acoustic permutation')
-
-          phase_ispec_inner_acoustic(ispec_outer_acoustic,1) = ispec
-
-        endif
-      endif
-    enddo
-  endif
-
-  ! sets up elements for loops in elastic simulations
-  if( ELASTIC_SIMULATION ) then
-    ispec_inner_elastic = 0
-    ispec_outer_elastic = 0
-    do ispec = 1, NSPEC_AB
-      if( ispec_is_elastic(ispec) ) then
-        if( ispec_is_inner(ispec) .eqv. .true. ) then
-          !ispec_inner_elastic = ispec_inner_elastic + 1
-          ispec_inner_elastic = perm(ispec)
-          if( ispec_inner_elastic < 1 .or. ispec_inner_elastic > num_phase_ispec_elastic ) then
-            print*,'error: inner elastic permutation',ispec_inner_elastic,num_phase_ispec_elastic,nspec_outer
-            call exit_MPI(myrank,'error inner elastic permutation')
-          endif
-          phase_ispec_inner_elastic(ispec_inner_elastic,2) = ispec
-
-        else
-          !ispec_outer_elastic = ispec_outer_elastic + 1
-          ispec_outer_elastic = perm(ispec)
-          if( ispec_outer_elastic < 1 .or. ispec_outer_elastic > num_phase_ispec_elastic ) then
-            print*,'error: outer elastic permutation',ispec_outer_elastic,num_phase_ispec_elastic,nspec_outer
-            call exit_MPI(myrank,'error outer elastic permutation')
-          endif
-          phase_ispec_inner_elastic(ispec_outer_elastic,1) = ispec
-
-        endif
-      endif
-    enddo
-  endif
-
-  ! sorts array according to permutation
-  ! SORT_MESH_INNER_OUTER
-!daniel: TODO -- mesh permutation
-  if( .false. ) then
-
-    ! permutation of ibool
-    allocate(temp_array_int(NGLLX,NGLLY,NGLLZ,nspec))
-    call permute_elements_integer(ibool,temp_array_int,perm,nspec)
-    deallocate(temp_array_int)
-
-    ! element domain flags
-    allocate(temp_array_logical_1D(nglob))
-    temp_array_logical_1D(:) = ispec_is_acoustic
-    do ispec = 1, nspec
-      ispec_is_acoustic(perm(ispec)) = temp_array_logical_1D(ispec)
-    enddo
-    temp_array_logical_1D(:) = ispec_is_elastic
-    do ispec = 1, nspec
-      ispec_is_elastic(perm(ispec)) = temp_array_logical_1D(ispec)
-    enddo
-    !temp_array_logical_1D(:) = ispec_is_poroelastic
-    !do ispec = 1, nspec
-    !  ispec_is_poroelastic(perm(ispec)) = temp_array_logical_1D(ispec)
-    !enddo
-    deallocate(temp_array_logical_1D)
-
-    ! mesh arrays
-    allocate(temp_array_real(NGLLX,NGLLY,NGLLZ,nspec))
-    call permute_elements_real(xix,temp_array_real,perm,nspec)
-    call permute_elements_real(xiy,temp_array_real,perm,nspec)
-    call permute_elements_real(xiz,temp_array_real,perm,nspec)
-    call permute_elements_real(etax,temp_array_real,perm,nspec)
-    call permute_elements_real(etay,temp_array_real,perm,nspec)
-    call permute_elements_real(etaz,temp_array_real,perm,nspec)
-    call permute_elements_real(gammax,temp_array_real,perm,nspec)
-    call permute_elements_real(gammay,temp_array_real,perm,nspec)
-    call permute_elements_real(gammaz,temp_array_real,perm,nspec)
-    call permute_elements_real(jacobian,temp_array_real,perm,nspec)
-
-    call permute_elements_real(kappastore,temp_array_real,perm,nspec)
-    call permute_elements_real(mustore,temp_array_real,perm,nspec)
-
-    ! acoustic arrays
-    if( ACOUSTIC_SIMULATION ) then
-      call permute_elements_real(rhostore,temp_array_real,perm,nspec)
-    endif
-
-    ! elastic arrays
-    if( ELASTIC_SIMULATION ) then
-      call permute_elements_real(rho_vp,temp_array_real,perm,nspec)
-      call permute_elements_real(rho_vs,temp_array_real,perm,nspec)
-      if( ANISOTROPY ) then
-        call permute_elements_real(c11store,temp_array_real,perm,nspec)
-        call permute_elements_real(c12store,temp_array_real,perm,nspec)
-        call permute_elements_real(c13store,temp_array_real,perm,nspec)
-        call permute_elements_real(c14store,temp_array_real,perm,nspec)
-        call permute_elements_real(c15store,temp_array_real,perm,nspec)
-        call permute_elements_real(c16store,temp_array_real,perm,nspec)
-        call permute_elements_real(c22store,temp_array_real,perm,nspec)
-        call permute_elements_real(c23store,temp_array_real,perm,nspec)
-        call permute_elements_real(c24store,temp_array_real,perm,nspec)
-        call permute_elements_real(c25store,temp_array_real,perm,nspec)
-        call permute_elements_real(c33store,temp_array_real,perm,nspec)
-        call permute_elements_real(c34store,temp_array_real,perm,nspec)
-        call permute_elements_real(c35store,temp_array_real,perm,nspec)
-        call permute_elements_real(c36store,temp_array_real,perm,nspec)
-        call permute_elements_real(c44store,temp_array_real,perm,nspec)
-        call permute_elements_real(c45store,temp_array_real,perm,nspec)
-        call permute_elements_real(c46store,temp_array_real,perm,nspec)
-        call permute_elements_real(c55store,temp_array_real,perm,nspec)
-        call permute_elements_real(c56store,temp_array_real,perm,nspec)
-        call permute_elements_real(c66store,temp_array_real,perm,nspec)
-      endif
-    endif
-
-    deallocate(temp_array_real)
-  endif
-
-  ! user output
-  !call MPI_ALLREDUCE(nspec_outer,nspec_outer_min_global,1,MPI_INTEGER,MPI_MIN,MPI_COMM_WORLD,ier)
-  !call MPI_ALLREDUCE(nspec_outer,nspec_outer_max_global,1,MPI_INTEGER,MPI_MAX,MPI_COMM_WORLD,ier)
-  call min_all_i(nspec_outer,nspec_outer_min_global)
-  call max_all_i(nspec_outer,nspec_outer_max_global)
-  call min_all_i(nspec_outer,nspec_outer_min_global)
-  call max_all_i(nspec_outer,nspec_outer_max_global)
-
-  if(myrank == 0) then
-    write(IMAIN,*) 'mesh coloring:'
-    write(IMAIN,*) '  permutation   : use coloring = ',USE_MESH_COLORING_GPU
-    write(IMAIN,*) '  outer elements: min = ',nspec_outer_min_global
-    write(IMAIN,*) '                  max = ',nspec_outer_max_global
-    write(IMAIN,*)
-  endif
-
-  deallocate(perm)
-
-  end subroutine rmd_setup_color_perm
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
   subroutine read_mesh_databases_adjoint()
 
 ! reads in moho meshes

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/specfem3D_par.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/specfem3D_par.f90	2011-11-03 01:09:17 UTC (rev 19144)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/specfem3D_par.f90	2011-11-03 03:11:45 UTC (rev 19145)
@@ -301,9 +301,13 @@
   integer, dimension(:,:), allocatable :: phase_ispec_inner_elastic
   integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
 
+! mesh coloring
+  integer :: num_colors_outer_elastic,num_colors_inner_elastic
+  integer, dimension(:), allocatable :: num_elem_colors_elastic
+  integer :: nspec_elastic
+
   logical :: ELASTIC_SIMULATION
 
-
 ! ADJOINT elastic
 
   ! (backward/reconstructed) wavefields
@@ -374,6 +378,11 @@
   integer, dimension(:,:), allocatable :: phase_ispec_inner_acoustic
   integer :: num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic
 
+! mesh coloring
+  integer :: num_colors_outer_acoustic,num_colors_inner_acoustic
+  integer, dimension(:), allocatable :: num_elem_colors_acoustic
+  integer :: nspec_acoustic
+
   logical :: ACOUSTIC_SIMULATION
 
 ! ADJOINT acoustic



More information about the CIG-COMMITS mailing list