[cig-commits] [commit] master: Silence linear solver fails from all but processor zero. (7f05769)
cig_noreply at geodynamics.org
cig_noreply at geodynamics.org
Fri Jun 6 13:26:48 PDT 2014
Repository : https://github.com/geodynamics/aspect
On branch : master
Link : https://github.com/geodynamics/aspect/compare/cbbfca824374d9d154fcfb17bfe73fe1bd7db9c3...7f5bb2052fa860100b4201c16c844b5987ce4647
>---------------------------------------------------------------
commit 7f0576920ce239c4e742545f25f1b1e3d630684e
Author: Wolfgang Bangerth <bangerth at math.tamu.edu>
Date: Fri Jun 6 09:56:21 2014 -0500
Silence linear solver fails from all but processor zero.
When a linear solver fails to converge in a parallel program, every processor would output the same error message -- leading to incredible amounts of entangled error messages. This has now been resolved: every processor still fails, but only processor 0 reports the error.
>---------------------------------------------------------------
7f0576920ce239c4e742545f25f1b1e3d630684e
doc/modules/changes.h | 7 +++
include/aspect/global.h | 17 +++++-
source/main.cc | 11 +---
source/simulator/solver.cc | 135 ++++++++++++++++++++++++++++++++++++++-------
4 files changed, 141 insertions(+), 29 deletions(-)
diff --git a/doc/modules/changes.h b/doc/modules/changes.h
index 1afc129..769fa02 100644
--- a/doc/modules/changes.h
+++ b/doc/modules/changes.h
@@ -7,6 +7,13 @@
*
* <ol>
*
+ * <li> Fixed: When a linear solver fails to converge in a parallel program,
+ * every processor would output the same error message -- leading to incredible
+ * amounts of entangled error messages. This has now been resolved: every processor
+ * still fails, but only processor 0 reports the error.
+ * <br>
+ * (Wolfgang Bangerth, 2014/06/06)
+ *
* <li> Fixed: When setting "Use years in output instead of seconds" the
* velocity solution is now exported in m/year instead of m/s in visualization
* files.
diff --git a/include/aspect/global.h b/include/aspect/global.h
index 6cde988..0b7845d 100644
--- a/include/aspect/global.h
+++ b/include/aspect/global.h
@@ -1,5 +1,5 @@
/*
- Copyright (C) 2011, 2012 by the authors of the ASPECT code.
+ Copyright (C) 2011, 2012, 2014 by the authors of the ASPECT code.
This file is part of ASPECT.
@@ -71,6 +71,21 @@ namespace aspect
typedef boost::archive::binary_oarchive oarchive;
/**
+ * A class we throw in exceptions in parallel jobs and that we can
+ * silently treat in main(). We do this, for example, in
+ * read_parameters() where each processor would otherwise throw the
+ * same exception and every processor would produce a tangle of
+ * output that is impenetrable in large parallel jobs. The same
+ * situation happens if a linear solver fails. Rather, we make
+ * processor 0 throw the real exception and every other processor
+ * converts the exception it wants to throw to an object of the
+ * current type -- which is caught in main() but doesn't produce any
+ * output (because processor 0 will already produce the output).
+ */
+ class QuietException {};
+
+
+ /**
* A namespace that contains typedefs for classes used in the linear algebra
* description.
*/
diff --git a/source/main.cc b/source/main.cc
index d8e3406..e34ec17 100644
--- a/source/main.cc
+++ b/source/main.cc
@@ -211,11 +211,6 @@ expand_backslashes (const std::string &filename)
/**
- * An exception that we can silently treat in main(). Used in read_parameters().
- */
-class QuietException {};
-
-/**
* Let ParameterHandler parse the input file, here given as a string.
* Since ParameterHandler unconditionally writes to the screen when it
* finds something it doesn't like, we get massive amounts of output
@@ -226,7 +221,7 @@ class QuietException {};
*
* In case of an error, we need to abort all processors without them
* having read their data. This is done by throwing an exception of the
- * special class QuietException that we can catch in main() and terminate
+ * special class aspect::QuietException that we can catch in main() and terminate
* the program quietly without generating other output.
*/
void
@@ -256,7 +251,7 @@ parse_parameters (const std::string &input_as_string,
AssertThrow(false, dealii::ExcMessage ("Invalid input parameter file."));
}
else
- throw QuietException();
+ throw aspect::QuietException();
}
// otherwise, processor 0 was ok reading the data, so we can expect the
@@ -371,7 +366,7 @@ int main (int argc, char *argv[])
return 1;
}
- catch (QuietException &)
+ catch (aspect::QuietException &)
{
// quietly treat an exception used on processors other than
// root when we already know that processor 0 will generate
diff --git a/source/simulator/solver.cc b/source/simulator/solver.cc
index c0f9cce..9316fa5 100644
--- a/source/simulator/solver.cc
+++ b/source/simulator/solver.cc
@@ -280,10 +280,27 @@ namespace aspect
// solving in this case.
if (src.block(1).l2_norm() > 1e-50 || dst.block(1).l2_norm() > 1e-50)
{
- solver.solve(stokes_preconditioner_matrix.block(1,1),
- dst.block(1), src.block(1),
- mp_preconditioner);
- n_iterations_S_ += solver_control.last_step();
+ try
+ {
+ solver.solve(stokes_preconditioner_matrix.block(1,1),
+ dst.block(1), src.block(1),
+ mp_preconditioner);
+ n_iterations_S_ += solver_control.last_step();
+ }
+ // if the solver fails, report the error from processor 0 with some additional
+ // information about its location, and throw a quiet exception on all other
+ // processors
+ catch (const SolverControl::NoConvergence &exc)
+ {
+ if (Utilities::MPI::this_mpi_process(src.block(0).get_mpi_communicator()) == 0)
+ Assert (false,
+ ExcMessage (std::string("The iterative solver in BlockSchurPreconditioner::vmult "
+ "did not converge. It reported the following error:\n\n")
+ +
+ exc.what()))
+ else
+ throw QuietException();
+ }
}
dst.block(1) *= -1.0;
@@ -303,9 +320,26 @@ namespace aspect
#else
TrilinosWrappers::SolverCG solver(solver_control);
#endif
- solver.solve(stokes_matrix.block(0,0), dst.block(0), utmp,
- a_preconditioner);
- n_iterations_A_ += solver_control.last_step();
+ try
+ {
+ solver.solve(stokes_matrix.block(0,0), dst.block(0), utmp,
+ a_preconditioner);
+ n_iterations_A_ += solver_control.last_step();
+ }
+ // if the solver fails, report the error from processor 0 with some additional
+ // information about its location, and throw a quiet exception on all other
+ // processors
+ catch (const SolverControl::NoConvergence &exc)
+ {
+ if (Utilities::MPI::this_mpi_process(src.block(0).get_mpi_communicator()) == 0)
+ Assert (false,
+ ExcMessage (std::string("The iterative solver in BlockSchurPreconditioner::vmult "
+ "did not converge. It reported the following error:\n\n")
+ +
+ exc.what()))
+ else
+ throw QuietException();
+ }
}
else
{
@@ -367,14 +401,31 @@ namespace aspect
// solve the linear system:
current_constraints.set_zero(distributed_solution);
- solver.solve (system_matrix.block(block_idx,block_idx),
- distributed_solution.block(block_idx),
- system_rhs.block(block_idx),
- (advection_field.is_temperature()
- ?
- *T_preconditioner
- :
- *C_preconditioner));
+ try
+ {
+ solver.solve (system_matrix.block(block_idx,block_idx),
+ distributed_solution.block(block_idx),
+ system_rhs.block(block_idx),
+ (advection_field.is_temperature()
+ ?
+ *T_preconditioner
+ :
+ *C_preconditioner));
+ }
+ // if the solver fails, report the error from processor 0 with some additional
+ // information about its location, and throw a quiet exception on all other
+ // processors
+ catch (const SolverControl::NoConvergence &exc)
+ {
+ if (Utilities::MPI::this_mpi_process(mpi_communicator) == 0)
+ Assert (false,
+ ExcMessage (std::string("The iterative advection solver "
+ "did not converge. It reported the following error:\n\n")
+ +
+ exc.what()))
+ else
+ throw QuietException();
+ }
current_constraints.distribute (distributed_solution);
solution.block(block_idx) = distributed_solution.block(block_idx);
@@ -423,7 +474,27 @@ namespace aspect
#else
TrilinosWrappers::SolverDirect solver(cn);
#endif
- solver.solve(system_matrix.block(0,0), distributed_stokes_solution.block(0), system_rhs.block(0));
+ try
+ {
+ solver.solve(system_matrix.block(0,0),
+ distributed_stokes_solution.block(0),
+ system_rhs.block(0));
+ }
+ // if the solver fails, report the error from processor 0 with some additional
+ // information about its location, and throw a quiet exception on all other
+ // processors
+ catch (const std::exception &exc)
+ {
+ if (Utilities::MPI::this_mpi_process(mpi_communicator) == 0)
+ Assert (false,
+ ExcMessage (std::string("The direct Stokes solver "
+ "did not succeed. It reported the following error:\n\n")
+ +
+ exc.what()))
+ else
+ throw QuietException();
+ }
+
current_constraints.distribute (distributed_stokes_solution);
@@ -549,8 +620,10 @@ namespace aspect
solver(solver_control_cheap, mem,
SolverFGMRES<LinearAlgebra::BlockVector>::
AdditionalData(30, true));
- solver.solve(stokes_block, distributed_stokes_solution,
- distributed_stokes_rhs, preconditioner);
+ solver.solve (stokes_block,
+ distributed_stokes_solution,
+ distributed_stokes_rhs,
+ preconditioner);
its_A += preconditioner.n_iterations_A();
its_S += preconditioner.n_iterations_S();
@@ -570,8 +643,30 @@ namespace aspect
solver(solver_control_expensive, mem,
SolverFGMRES<LinearAlgebra::BlockVector>::
AdditionalData(50, true));
- solver.solve(stokes_block, distributed_stokes_solution,
- distributed_stokes_rhs, preconditioner);
+
+ try
+ {
+ solver.solve(stokes_block,
+ distributed_stokes_solution,
+ distributed_stokes_rhs,
+ preconditioner);
+ }
+ // if the solver fails, report the error from processor 0 with some additional
+ // information about its location, and throw a quiet exception on all other
+ // processors
+ catch (const SolverControl::NoConvergence &exc)
+ {
+ if (Utilities::MPI::this_mpi_process(mpi_communicator) == 0)
+ Assert (false,
+ ExcMessage (std::string("The iterative Stokes solver "
+ "did not converge. It reported the following error:\n\n")
+ +
+ exc.what()))
+ else
+ throw QuietException();
+ }
+
+
its_A += preconditioner.n_iterations_A();
its_S += preconditioner.n_iterations_S();
}
More information about the CIG-COMMITS
mailing list