[cig-commits] [commit] master: Silence linear solver fails from all but processor zero. (7f05769)

cig_noreply at geodynamics.org
Fri Jun 6 13:26:48 PDT 2014


Repository : https://github.com/geodynamics/aspect

On branch  : master
Link       : https://github.com/geodynamics/aspect/compare/cbbfca824374d9d154fcfb17bfe73fe1bd7db9c3...7f5bb2052fa860100b4201c16c844b5987ce4647

>---------------------------------------------------------------

commit 7f0576920ce239c4e742545f25f1b1e3d630684e
Author: Wolfgang Bangerth <bangerth at math.tamu.edu>
Date:   Fri Jun 6 09:56:21 2014 -0500

    Silence linear solver fails from all but processor zero.
    
    When a linear solver fails to converge in a parallel program, every
    processor used to output the same error message, producing large
    amounts of interleaved output. This has now been resolved: every
    processor still aborts, but only processor 0 reports the error.


>---------------------------------------------------------------

7f0576920ce239c4e742545f25f1b1e3d630684e
 doc/modules/changes.h      |   7 +++
 include/aspect/global.h    |  17 +++++-
 source/main.cc             |  11 +---
 source/simulator/solver.cc | 135 ++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 141 insertions(+), 29 deletions(-)
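
The change wraps each solver call in the same try/catch pattern; condensed, it
looks like the sketch below. This is an illustration only, not code from the
patch: the helper name solve_or_fail_quietly and its callable argument are
hypothetical, while SolverControl, Assert/ExcMessage and Utilities::MPI come
from deal.II and aspect::QuietException is the class added to
include/aspect/global.h below.

    #include <deal.II/base/exceptions.h>
    #include <deal.II/base/mpi.h>
    #include <deal.II/lac/solver_control.h>

    #include <aspect/global.h>       // declares aspect::QuietException

    #include <string>

    using namespace dealii;

    // Run a single solver call; if it does not converge, let only MPI rank 0
    // produce the error message while every other rank aborts through the
    // quiet exception that main() catches and ignores.
    template <typename SolveCallable>
    void solve_or_fail_quietly (const SolveCallable &do_solve,
                                const MPI_Comm       mpi_communicator)
    {
      try
        {
          do_solve ();   // e.g. solver.solve (matrix, solution, rhs, preconditioner);
        }
      catch (const SolverControl::NoConvergence &exc)
        {
          if (Utilities::MPI::this_mpi_process (mpi_communicator) == 0)
            {
              // rank 0 reports the failure (Assert is active in debug mode,
              // just as in the patch below)
              Assert (false,
                      ExcMessage (std::string ("The linear solver did not "
                                               "converge:\n\n")
                                  + exc.what ()));
            }
          else
            // all other ranks fail without writing anything to the screen
            throw aspect::QuietException ();
        }
    }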

diff --git a/doc/modules/changes.h b/doc/modules/changes.h
index 1afc129..769fa02 100644
--- a/doc/modules/changes.h
+++ b/doc/modules/changes.h
@@ -7,6 +7,13 @@
  *
  * <ol>
  *
+ * <li> Fixed: When a linear solver fails to converge in a parallel
+ * program, every processor used to output the same error message,
+ * producing large amounts of interleaved output. This has now been
+ * resolved: every processor still aborts, but only processor 0 reports
+ * the error.
+ * <br>
+ * (Wolfgang Bangerth, 2014/06/06)
+ *
  * <li> Fixed: When setting "Use years in output instead of seconds" the
  * velocity solution is now exported in m/year instead of m/s in visualization
  * files.
diff --git a/include/aspect/global.h b/include/aspect/global.h
index 6cde988..0b7845d 100644
--- a/include/aspect/global.h
+++ b/include/aspect/global.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (C) 2011, 2012 by the authors of the ASPECT code.
+  Copyright (C) 2011, 2012, 2014 by the authors of the ASPECT code.
 
   This file is part of ASPECT.
 
@@ -71,6 +71,21 @@ namespace aspect
   typedef boost::archive::binary_oarchive oarchive;
 
   /**
+   * A class whose objects we throw as exceptions in parallel jobs and
+   * that we can treat silently in main(). We do this, for example, in
+   * read_parameters(), where each processor would otherwise throw the
+   * same exception and produce a tangle of output that is impenetrable
+   * in large parallel jobs. The same situation arises when a linear
+   * solver fails. Instead, processor 0 throws the real exception and
+   * every other processor converts the exception it would have thrown
+   * into an object of the current type -- which is caught in main()
+   * but does not produce any output (because processor 0 already
+   * produces it).
+   */
+  class QuietException {};
+
+  
+  /**
    * A namespace that contains typedefs for classes used in the linear algebra
    * description.
    */
diff --git a/source/main.cc b/source/main.cc
index d8e3406..e34ec17 100644
--- a/source/main.cc
+++ b/source/main.cc
@@ -211,11 +211,6 @@ expand_backslashes (const std::string &filename)
 
 
 /**
- * An exception that we can silently treat in main(). Used in read_parameters().
- */
-class QuietException {};
-
-/**
  * Let ParameterHandler parse the input file, here given as a string.
  * Since ParameterHandler unconditionally writes to the screen when it
  * finds something it doesn't like, we get massive amounts of output
@@ -226,7 +221,7 @@ class QuietException {};
  *
  * In case of an error, we need to abort all processors without them
  * having read their data. This is done by throwing an exception of the
- * special class QuietException that we can catch in main() and terminate
+ * special class aspect::QuietException that we can catch in main() and terminate
  * the program quietly without generating other output.
  */
 void
@@ -256,7 +251,7 @@ parse_parameters (const std::string &input_as_string,
           AssertThrow(false, dealii::ExcMessage ("Invalid input parameter file."));
         }
       else
-        throw QuietException();
+        throw aspect::QuietException();
     }
 
   // otherwise, processor 0 was ok reading the data, so we can expect the
@@ -371,7 +366,7 @@ int main (int argc, char *argv[])
 
       return 1;
     }
-  catch (QuietException &)
+  catch (aspect::QuietException &)
     {
       // quietly treat an exception used on processors other than
       // root when we already know that processor 0 will generate
diff --git a/source/simulator/solver.cc b/source/simulator/solver.cc
index c0f9cce..9316fa5 100644
--- a/source/simulator/solver.cc
+++ b/source/simulator/solver.cc
@@ -280,10 +280,27 @@ namespace aspect
         // solving in this case.
         if (src.block(1).l2_norm() > 1e-50 || dst.block(1).l2_norm() > 1e-50)
           {
-            solver.solve(stokes_preconditioner_matrix.block(1,1),
-                         dst.block(1), src.block(1),
-                         mp_preconditioner);
-            n_iterations_S_ += solver_control.last_step();
+	    try
+	      {
+		solver.solve(stokes_preconditioner_matrix.block(1,1),
+			     dst.block(1), src.block(1),
+			     mp_preconditioner);
+		n_iterations_S_ += solver_control.last_step();
+	      }
+	    // if the solver fails, report the error from processor 0 with some additional
+	    // information about its location, and throw a quiet exception on all other
+	    // processors
+	    catch (const SolverControl::NoConvergence &exc)
+	      {
+		if (Utilities::MPI::this_mpi_process(src.block(0).get_mpi_communicator()) == 0)
+		  Assert (false,
+			  ExcMessage (std::string("The iterative solver in BlockSchurPreconditioner::vmult "
+						  "did not converge. It reported the following error:\n\n")
+				      +
+				      exc.what()))
+		else
+		  throw QuietException();
+	      }
           }
 
         dst.block(1) *= -1.0;
@@ -303,9 +320,26 @@ namespace aspect
 #else
           TrilinosWrappers::SolverCG solver(solver_control);
 #endif
-          solver.solve(stokes_matrix.block(0,0), dst.block(0), utmp,
-                       a_preconditioner);
-          n_iterations_A_ += solver_control.last_step();
+	  try
+	    {
+	      solver.solve(stokes_matrix.block(0,0), dst.block(0), utmp,
+			   a_preconditioner);
+	      n_iterations_A_ += solver_control.last_step();
+	    }
+	    // if the solver fails, report the error from processor 0 with some additional
+	    // information about its location, and throw a quiet exception on all other
+	    // processors
+	    catch (const SolverControl::NoConvergence &exc)
+	      {
+		if (Utilities::MPI::this_mpi_process(src.block(0).get_mpi_communicator()) == 0)
+		  Assert (false,
+			  ExcMessage (std::string("The iterative solver in BlockSchurPreconditioner::vmult "
+						  "did not converge. It reported the following error:\n\n")
+				      +
+				      exc.what()))
+		else
+		  throw QuietException();
+	      }	  
         }
       else
         {
@@ -367,14 +401,31 @@ namespace aspect
 
     // solve the linear system:
     current_constraints.set_zero(distributed_solution);
-    solver.solve (system_matrix.block(block_idx,block_idx),
-                  distributed_solution.block(block_idx),
-                  system_rhs.block(block_idx),
-                  (advection_field.is_temperature()
-                   ?
-                   *T_preconditioner
-                   :
-                   *C_preconditioner));
+    try
+      {
+	solver.solve (system_matrix.block(block_idx,block_idx),
+		      distributed_solution.block(block_idx),
+		      system_rhs.block(block_idx),
+		      (advection_field.is_temperature()
+		       ?
+		       *T_preconditioner
+		       :
+		       *C_preconditioner));
+      }
+    // if the solver fails, report the error from processor 0 with some additional
+    // information about its location, and throw a quiet exception on all other
+    // processors
+    catch (const SolverControl::NoConvergence &exc)
+      {
+	if (Utilities::MPI::this_mpi_process(mpi_communicator) == 0)
+	  Assert (false,
+		  ExcMessage (std::string("The iterative advection solver "
+					  "did not converge. It reported the following error:\n\n")
+			      +
+			      exc.what()))
+	else
+	  throw QuietException();
+      }
     
     current_constraints.distribute (distributed_solution);
     solution.block(block_idx) = distributed_solution.block(block_idx);
@@ -423,7 +474,27 @@ namespace aspect
 #else
         TrilinosWrappers::SolverDirect solver(cn);
 #endif
-        solver.solve(system_matrix.block(0,0), distributed_stokes_solution.block(0), system_rhs.block(0));
+	try
+	  {
+	    solver.solve(system_matrix.block(0,0),
+			 distributed_stokes_solution.block(0),
+			 system_rhs.block(0));
+	  }
+	// if the solver fails, report the error from processor 0 with some additional
+	// information about its location, and throw a quiet exception on all other
+	// processors
+	catch (const std::exception &exc)
+	  {
+	    if (Utilities::MPI::this_mpi_process(mpi_communicator) == 0)
+	      Assert (false,
+		      ExcMessage (std::string("The direct Stokes solver "
+					      "did not succeed. It reported the following error:\n\n")
+				  +
+				  exc.what()))
+	    else
+	      throw QuietException();
+	  }
+	
 
         current_constraints.distribute (distributed_stokes_solution);
 
@@ -549,8 +620,10 @@ namespace aspect
         solver(solver_control_cheap, mem,
                SolverFGMRES<LinearAlgebra::BlockVector>::
                AdditionalData(30, true));
-        solver.solve(stokes_block, distributed_stokes_solution,
-                     distributed_stokes_rhs, preconditioner);
+	solver.solve (stokes_block,
+		      distributed_stokes_solution,
+		      distributed_stokes_rhs,
+		      preconditioner);
 
         its_A += preconditioner.n_iterations_A();
         its_S += preconditioner.n_iterations_S();
@@ -570,8 +643,30 @@ namespace aspect
         solver(solver_control_expensive, mem,
                SolverFGMRES<LinearAlgebra::BlockVector>::
                AdditionalData(50, true));
-        solver.solve(stokes_block, distributed_stokes_solution,
-                     distributed_stokes_rhs, preconditioner);
+
+	try
+	  {
+	    solver.solve(stokes_block,
+			 distributed_stokes_solution,
+			 distributed_stokes_rhs,
+			 preconditioner);
+	  }
+	// if the solver fails, report the error from processor 0 with some additional
+	// information about its location, and throw a quiet exception on all other
+	// processors
+	catch (const SolverControl::NoConvergence &exc)
+	  {
+	    if (Utilities::MPI::this_mpi_process(mpi_communicator) == 0)
+	      Assert (false,
+		      ExcMessage (std::string("The iterative Stokes solver "
+					      "did not converge. It reported the following error:\n\n")
+				  +
+				  exc.what()))
+	    else
+	      throw QuietException();
+	  }
+	
+	
         its_A += preconditioner.n_iterations_A();
         its_S += preconditioner.n_iterations_S();
       }
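
For reference, the catching side of the pattern (already present in
source/main.cc and merely moved to the aspect namespace by this commit)
condenses to roughly the following. This is a simplified sketch, not the real
main(): MPI initialization, parameter handling, and the remaining catch
clauses are elided.

    #include <aspect/global.h>

    #include <exception>
    #include <iostream>

    int main (int argc, char *argv[])
    {
      (void)argc;       // the real main() passes these on to MPI and the
      (void)argv;       // parameter reader; omitted in this sketch

      try
        {
          // ... initialize MPI, read the parameter file, run the simulator ...
        }
      catch (aspect::QuietException &)
        {
          // thrown on processors other than zero: processor 0 has already
          // printed the real error message, so stay silent and only signal
          // failure through the exit code
          return 1;
        }
      catch (std::exception &exc)
        {
          std::cerr << "Exception: " << exc.what () << std::endl;
          return 1;
        }

      return 0;
    }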


