[cig-commits] r1347 - in branches/s-wang/for_deal.II: . examples examples/step-32

s-wang at dealii.org s-wang at dealii.org
Tue Nov 6 12:29:14 PST 2012


Author: s-wang
Date: 2012-11-06 13:29:13 -0700 (Tue, 06 Nov 2012)
New Revision: 1347

Added:
   branches/s-wang/for_deal.II/examples/
   branches/s-wang/for_deal.II/examples/step-32/
   branches/s-wang/for_deal.II/examples/step-32/test-step-32.cc
Log:
test-step-32.cc has been written to replace Trilinos with PETSc (based on step-32.cc).


Added: branches/s-wang/for_deal.II/examples/step-32/test-step-32.cc
===================================================================
--- branches/s-wang/for_deal.II/examples/step-32/test-step-32.cc	                        (rev 0)
+++ branches/s-wang/for_deal.II/examples/step-32/test-step-32.cc	2012-11-06 20:29:13 UTC (rev 1347)
@@ -0,0 +1,4646 @@
+/* Author: Martin Kronbichler, Uppsala University,
+           Wolfgang Bangerth, Texas A&M University,
+           Timo Heister, University of Goettingen, 2008-2011 */
+/*                                                                */
+/*    Copyright (C) 2008, 2009, 2010, 2011, 2012 by the deal.II authors */
+/*                                                                */
+/*    This file is subject to QPL and may not be  distributed     */
+/*    without copyright and license information. Please refer     */
+/*    to the file deal.II/doc/license.html for the  text  and     */
+/*    further information on this license.                        */
+
+				 // @sect3{Include files}
+
+				 // The first task, as usual, is to
+				 // include the functionality of these
+				 // well-known deal.II library files
+				 // and some C++ header files.
+#include <deal.II/base/quadrature_lib.h>
+#include <deal.II/base/logstream.h>
+#include <deal.II/base/function.h>
+#include <deal.II/base/utilities.h>
+#include <deal.II/base/conditional_ostream.h>
+#include <deal.II/base/work_stream.h>
+#include <deal.II/base/timer.h>
+#include <deal.II/base/parameter_handler.h>
+
+#include <deal.II/lac/full_matrix.h>
+#include <deal.II/lac/solver_bicgstab.h>
+#include <deal.II/lac/solver_cg.h>
+#include <deal.II/lac/solver_gmres.h>
+#include <deal.II/lac/constraint_matrix.h>
+#include <deal.II/lac/block_sparsity_pattern.h>
+#include <deal.II/lac/petsc_parallel_vector.h>
+#include <deal.II/lac/petsc_block_vector.h>
+#include <deal.II/lac/petsc_parallel_block_vector.h>
+#include <deal.II/lac/petsc_sparse_matrix.h>
+#include <deal.II/lac/petsc_parallel_sparse_matrix.h>
+#include <deal.II/lac/petsc_parallel_block_sparse_matrix.h>
+#include <deal.II/lac/petsc_precondition.h>
+#include <deal.II/lac/petsc_solver.h>
+
+#include <deal.II/grid/tria.h>
+#include <deal.II/grid/grid_generator.h>
+#include <deal.II/grid/tria_accessor.h>
+#include <deal.II/grid/tria_iterator.h>
+#include <deal.II/grid/filtered_iterator.h>
+#include <deal.II/grid/tria_boundary_lib.h>
+#include <deal.II/grid/grid_tools.h>
+#include <deal.II/grid/grid_refinement.h>
+
+#include <deal.II/dofs/dof_handler.h>
+#include <deal.II/dofs/dof_renumbering.h>
+#include <deal.II/dofs/dof_accessor.h>
+#include <deal.II/dofs/dof_tools.h>
+
+#include <deal.II/fe/fe_q.h>
+#include <deal.II/fe/fe_dgq.h>
+#include <deal.II/fe/fe_dgp.h>
+#include <deal.II/fe/fe_system.h>
+#include <deal.II/fe/fe_values.h>
+#include <deal.II/fe/mapping_q.h>
+
+#include <deal.II/numerics/vector_tools.h>
+#include <deal.II/numerics/matrix_tools.h>
+#include <deal.II/numerics/data_out.h>
+#include <deal.II/numerics/error_estimator.h>
+#include <deal.II/numerics/solution_transfer.h>
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <limits>
+#include <locale>
+#include <string>
+
+				 // This is the only include file that
+				 // is new: It introduces the
+				 // parallel::distributed::SolutionTransfer
+				 // equivalent of the
+				 // dealii::SolutionTransfer class to
+				 // take a solution from one mesh to
+				 // the next one upon mesh refinement,
+				 // but in the case of parallel
+				 // distributed triangulations:
+#include <deal.II/distributed/solution_transfer.h>
+
+				 // The following classes are used in
+				 // parallel distributed computations
+				 // and have all already been
+				 // introduced in step-40:
+#include <deal.II/base/index_set.h>
+#include <deal.II/distributed/tria.h>
+#include <deal.II/distributed/grid_refinement.h>
+
+
+/**
+ * Utilities to replace Trilinos objects with their PETSc counterparts.
+ */
+namespace CIG
+{
+/**
+ *	Convert a block partitioning as used for Trilinos into the block
+ *	sizes and local (per-process) sizes needed for the PETSc objects below.
+ *	It is assumed that block_partition.size()==2.
+ */
+void convert_block_partitioning(
+		const std::vector<dealii::IndexSet> &block_partition,
+		int n_u, int n_p,
+		std::vector<unsigned int> &block_sizes,
+		std::vector<unsigned int> &local_sizes)
+{
+	Assert(block_partition.size()==2, dealii::ExcMessage("logic error"));
+
+	// init,
+	block_sizes.clear();
+	local_sizes.clear();
+
+	// block_sizes
+	block_sizes.push_back(n_u);
+	block_sizes.push_back(n_p);
+
+	// local_sizes
+	local_sizes.push_back(block_partition[0].n_elements());
+	local_sizes.push_back(block_partition[1].n_elements());
+}
+
+
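+/**
+ *	Set up a 2x2 parallel PETSc block sparse matrix: each block is
+ *	distributed across MPI_COMM_WORLD with the given global block sizes
+ *	and per-process local sizes, and max_coupling_between_dofs is used
+ *	as the number of nonzero entries to preallocate per row.
+ */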
+void setup_petsc_matrix(
+		std::vector<unsigned int> &block_sizes,
+		std::vector<unsigned int> &local_sizes,
+		int max_coupling_between_dofs,
+		dealii::PETScWrappers::MPI::BlockSparseMatrix  &matrix)
+{
+	Assert(block_sizes.size()==2, dealii::ExcMessage("logic error"));
+
+	int n_u = block_sizes[0];
+	int n_p = block_sizes[1];
+	matrix.reinit(2,2);
+	matrix.block(0,0).reinit(
+			MPI_COMM_WORLD,n_u,n_u,local_sizes[0],local_sizes[0],max_coupling_between_dofs);
+	matrix.block(0,1).reinit(
+				MPI_COMM_WORLD,n_u,n_p,local_sizes[0],local_sizes[1],max_coupling_between_dofs);
+	matrix.block(1,0).reinit(
+				MPI_COMM_WORLD,n_p,n_u,local_sizes[1],local_sizes[0],max_coupling_between_dofs);
+	matrix.block(1,1).reinit(
+				MPI_COMM_WORLD,n_p,n_p,local_sizes[1],local_sizes[1],max_coupling_between_dofs);
+	matrix.collect_sizes();
+}
+
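+/**
+ *	Set up a parallel PETSc block vector with two blocks. Each block is
+ *	distributed according to the locally owned index sets in
+ *	`partitioning' and additionally stores the ghost entries described
+ *	by `relevant_partitioning'.
+ */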
+void setup_petsc_vector(
+		std::vector<unsigned int> &block_sizes,
+		 std::vector<dealii::IndexSet> &partitioning,
+		 std::vector<dealii::IndexSet> &relevant_partitioning,
+		 dealii::PETScWrappers::MPI::BlockVector &vector)
+{
+	Assert(block_sizes.size()==2, dealii::ExcMessage("logic error"));
+
+	vector.reinit(block_sizes,MPI_COMM_WORLD);
+	vector.block(0).reinit(MPI_COMM_WORLD,partitioning[0],relevant_partitioning[0]);
+	vector.block(1).reinit(MPI_COMM_WORLD,partitioning[1],relevant_partitioning[1]);
+}
+
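+/**
+ *	Round every locally owned entry of the given vector down to the
+ *	nearest integer and compress the result. (Presumably a debugging aid
+ *	for comparing Trilinos and PETSc results at reduced precision.)
+ */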
+template <class VectorType>
+void reduce_accuracy(VectorType &vector)
+{
+	std::pair<unsigned int,unsigned int> range = vector.local_range();
+	for(unsigned int i=range.first; i<range.second; i++)
+		vector[i] = std::floor(vector[i]);
+	vector.compress();
+}
+
+}
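+
+				 // The helper functions above are
+				 // meant to be used together when
+				 // setting up the linear algebra
+				 // objects. A minimal sketch (the
+				 // variable names here are purely
+				 // illustrative; the real calls
+				 // appear in
+				 // BoussinesqFlowProblem::setup_dofs()
+				 // below) could look like this:
+				 //
+				 // @code
+				 // std::vector<unsigned int> block_sizes, local_sizes;
+				 // CIG::convert_block_partitioning (stokes_partitioning, n_u, n_p,
+				 //                                  block_sizes, local_sizes);
+				 // CIG::setup_petsc_matrix (block_sizes, local_sizes,
+				 //                          stokes_dof_handler.max_couplings_between_dofs(),
+				 //                          stokes_matrix);
+				 // CIG::setup_petsc_vector (block_sizes,
+				 //                          stokes_partitioning,
+				 //                          stokes_relevant_partitioning,
+				 //                          stokes_solution);
+				 // @endcode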
+
+				 // The next step is like in all
+				 // previous tutorial programs: We put
+				 // everything into a namespace of its
+				 // own and then import the deal.II
+				 // classes and functions into it:
+namespace Step32
+{
+  using namespace dealii;
+
+				   // @sect3{Equation data}
+
+				   // In the following namespace, we
+				   // define the various pieces of
+				   // equation data that describe the
+				   // problem. This corresponds to the
+				   // various aspects of making the
+				   // problem at least slightly
+				   // realistic and that were
+				   // exhaustively discussed in the
+				   // description of the testcase in
+				   // the introduction.
+				   //
+				   // We start with a few coefficients
+				   // that have constant values (the
+				   // comment after the value
+				   // indicates its physical units):
+  namespace EquationData
+  {
+    const double eta                   = 1e21;    /* Pa s       */
+    const double kappa                 = 1e-6;    /* m^2 / s    */
+    const double reference_density     = 3300;    /* kg / m^3   */
+    const double reference_temperature = 293;     /* K          */
+    const double expansion_coefficient = 2e-5;    /* 1/K        */
+    const double specific_heat         = 1250;    /* J / K / kg */
+    const double radiogenic_heating    = 7.4e-12; /* W / kg     */
+
+
+    const double R0      = 6371000.-2890000.;     /* m          */
+    const double R1      = 6371000.-  35000.;     /* m          */
+
+    const double T0      = 4000+273;              /* K          */
+    const double T1      =  700+273;              /* K          */
+
+
+				     // The next set of definitions
+				     // are for functions that encode
+				     // the density as a function of
+				     // temperature, the gravity
+				     // vector, and the initial values
+				     // for the temperature. Again,
+				     // all of these (along with the
+				     // values they compute) are
+				     // discussed in the introduction:
+    double density (const double temperature)
+    {
+      return (reference_density *
+	      (1 - expansion_coefficient * (temperature -
+					    reference_temperature)));
+    }
+
+
+    template <int dim>
+    Tensor<1,dim> gravity_vector (const Point<dim> &p)
+    {
+      const double r = p.norm();
+      return -(1.245e-6 * r + 7.714e13/r/r) * p / r;
+    }
+
+
+
+    template <int dim>
+    class TemperatureInitialValues : public Function<dim>
+    {
+      public:
+	TemperatureInitialValues () : Function<dim>(1) {}
+
+	virtual double value (const Point<dim>   &p,
+			      const unsigned int  component = 0) const;
+
+	virtual void vector_value (const Point<dim> &p,
+				   Vector<double>   &value) const;
+    };
+
+
+
+    template <int dim>
+    double
+    TemperatureInitialValues<dim>::value (const Point<dim>  &p,
+					  const unsigned int) const
+    {
+      const double r = p.norm();
+      const double h = R1-R0;
+
+      const double s = (r-R0)/h;
+      const double q = (dim==3)?std::max(0.0,std::cos(numbers::PI*std::fabs(p(2)/R1))):1.0;
+      const double phi   = std::atan2(p(0),p(1));
+      const double tau = s
+			 +
+			 0.2 * s * (1-s) * std::sin(6*phi) * q;
+
+      return T0*(1.0-tau) + T1*tau;
+    }
+
+
+    template <int dim>
+    void
+    TemperatureInitialValues<dim>::vector_value (const Point<dim> &p,
+						 Vector<double>   &values) const
+    {
+      for (unsigned int c=0; c<this->n_components; ++c)
+	values(c) = TemperatureInitialValues<dim>::value (p, c);
+    }
+
+
+				     // As mentioned in the
+				     // introduction we need to
+				     // rescale the pressure to avoid
+				     // the relative ill-conditioning
+				     // of the momentum and mass
+				     // conservation equations. The
+				     // scaling factor is
+				     // $\frac{\eta}{L}$ where $L$ was
+				     // a typical length scale. By
+				     // experimenting it turns out
+				     // that a good length scale is
+				     // the diameter of plumes, which
+				     // is around 10 km:
+    const double pressure_scaling = eta / 10000;
+
+				     // The final number in this
+				     // namespace is a constant that
+				     // denotes the number of seconds
+				     // per (average, tropical)
+				     // year. We use this only when
+				     // generating screen output:
+				     // internally, all computations
+				     // of this program happen in SI
+				     // units (kilogram, meter,
+				     // seconds) but writing
+				     // geological times in seconds
+				     // yields numbers that one can't
+				     // relate to reality, and so we
+				     // convert to years using the
+				     // factor defined here:
+    const double year_in_seconds  = 60*60*24*365.2425;
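+
+				     // (For example, the status output
+				     // in <code>run()</code> further
+				     // down prints something like
+				     // <code>time/year_in_seconds</code>
+				     // to report the current time in
+				     // years rather than in seconds.)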
+
+  }
+
+
+
+				   // @sect3{Preconditioning the Stokes system}
+
+				   // This namespace implements the
+				   // preconditioner. As discussed in the
+				   // introduction, this preconditioner
+				   // differs in a number of key portions from
+				   // the one used in step-31. Specifically,
+				   // it is a right preconditioner,
+				   // implementing the matrix
+				   // @f{align*}\left(\begin{array}{cc}A^{-1}
+				   // & B^T \\ 0 & S^{-1}\end{array}\right)@f}
+				   // where the two inverse matrix operations
+				   // are approximated by linear solvers or,
+				   // if the right flag is given to the
+				   // constructor of this class, by a single
+				   // AMG V-cycle for the velocity block. The
+				   // three code blocks of the
+				   // <code>vmult</code> function implement
+				   // the multiplications with the three
+				   // blocks of this preconditioner matrix and
+				   // should be self explanatory if you have
+				   // read through step-31 or the discussion
+				   // of composing solvers in step-20.
+  namespace LinearSolvers
+  {
+    template <class PreconditionerA, class PreconditionerMp>
+    class BlockSchurPreconditioner : public Subscriptor
+    {
+      public:
+	BlockSchurPreconditioner (const PETScWrappers::MPI::BlockSparseMatrix  &S,
+				  const PETScWrappers::MPI::BlockSparseMatrix  &Spre,
+				  const PreconditionerMp                     &Mppreconditioner,
+				  const PreconditionerA                      &Apreconditioner,
+				  const bool                                  do_solve_A)
+			:
+			stokes_matrix     (&S),
+			stokes_preconditioner_matrix     (&Spre),
+			mp_preconditioner (Mppreconditioner),
+			a_preconditioner  (Apreconditioner),
+			do_solve_A        (do_solve_A)
+	  {}
+
+	void vmult (PETScWrappers::MPI::BlockVector       &dst,
+		    const PETScWrappers::MPI::BlockVector &src) const
+	  {
+	    PETScWrappers::MPI::Vector utmp(src.block(0));
+
+	    {
+	      SolverControl solver_control(5000, 1e-6 * src.block(1).l2_norm());
+
+	      SolverCG<PETScWrappers::MPI::Vector> solver(solver_control);
+
+	      solver.solve(stokes_preconditioner_matrix->block(1,1),
+			   dst.block(1), src.block(1),
+			   mp_preconditioner);
+
+	      dst.block(1) *= -1.0;
+	    }
+
+	    {
+	      stokes_matrix->block(0,1).vmult(utmp, dst.block(1));
+	      utmp*=-1.0;
+	      utmp.add(src.block(0));
+	    }
+
+	    if (do_solve_A == true)
+	      {
+		SolverControl solver_control(5000, utmp.l2_norm()*1e-2);
+		PETScWrappers::SolverCG solver(solver_control);
+		solver.solve(stokes_matrix->block(0,0), dst.block(0), utmp,
+			     a_preconditioner);
+	      }
+	    else
+	      a_preconditioner.vmult (dst.block(0), utmp);
+	  }
+
+      private:
+	const SmartPointer<const PETScWrappers::MPI::BlockSparseMatrix> stokes_matrix;
+	const SmartPointer<const PETScWrappers::MPI::BlockSparseMatrix> stokes_preconditioner_matrix;
+	const PreconditionerMp &mp_preconditioner;
+	const PreconditionerA  &a_preconditioner;
+	const bool do_solve_A;
+    };
+  }
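+
+				   // To illustrate how this class is
+				   // meant to be used, here is a rough
+				   // sketch of a Stokes solve with it
+				   // (the concrete preconditioner types
+				   // are only an example; the actual
+				   // call appears in
+				   // BoussinesqFlowProblem::solve()
+				   // further down):
+				   //
+				   // @code
+				   // const LinearSolvers::BlockSchurPreconditioner<
+				   //   PETScWrappers::PreconditionBoomerAMG,
+				   //   PETScWrappers::PreconditionJacobi>
+				   //   preconditioner (stokes_matrix, stokes_preconditioner_matrix,
+				   //                   *Mp_preconditioner, *Amg_preconditioner,
+				   //                   false);
+				   //
+				   // SolverControl solver_control (stokes_matrix.m(),
+				   //                               1e-6 * stokes_rhs.l2_norm());
+				   // SolverFGMRES<PETScWrappers::MPI::BlockVector>
+				   //   solver (solver_control);
+				   // solver.solve (stokes_matrix, stokes_solution,
+				   //               stokes_rhs, preconditioner);
+				   // @endcode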
+
+
+
+				   // @sect3{Definition of assembly data structures}
+				   //
+				   // As described in the
+				   // introduction, we will use the
+				   // WorkStream mechanism discussed
+				   // in the @ref threads module to
+				   // parallelize operations among the
+				   // processors of a single
+				   // machine. The WorkStream class
+				   // requires that data is passed
+				   // around in two kinds of data
+				   // structures, one for scratch data
+				   // and one to pass data from the
+				   // assembly function to the
+				   // function that copies local
+				   // contributions into global
+				   // objects.
+				   //
+				   // The following namespace (and the
+				   // two sub-namespaces) contains a
+				   // collection of data structures
+				   // that serve this purpose, one
+				   // pair for each of the four
+				   // operations discussed in the
+				   // introduction that we will want
+				   // to parallelize. Each assembly
+				   // routine gets two sets of data: a
+				   // Scratch array that collects all
+				   // the classes and arrays that are
+				   // used for the calculation of the
+				   // cell contribution, and a
+				   // CopyData array that keeps local
+				   // matrices and vectors which will
+				   // be written into the global
+				   // matrix. Whereas CopyData is a
+				   // container for the final data
+				   // that is written into the global
+				   // matrices and vector (and, thus,
+				   // absolutely necessary), the
+				   // Scratch arrays are merely there
+				   // for performance reasons &mdash;
+				   // it would be much more expensive
+				   // to set up a FEValues object on
+				   // each cell, than creating it only
+				   // once and updating some
+				   // derivative data.
+				   //
+				   // Step-31 had four assembly
+				   // routines: One for the
+				   // preconditioner matrix of the
+				   // Stokes system, one for the
+				   // Stokes matrix and right hand
+				   // side, one for the temperature
+				   // matrices and one for the right
+				   // hand side of the temperature
+				   // equation. We here organize the
+				   // scratch arrays and CopyData
+				   // objects for each of those four
+				   // assembly components using a
+				   // <code>struct</code> environment
+				   // (since we consider these as
+				   // temporary objects we pass
+				   // around, rather than classes that
+				   // implement functionality of their
+				   // own, though this is a more
+				   // subjective point of view to
+				   // distinguish between
+				   // <code>struct</code>s and
+				   // <code>class</code>es).
+				   //
+				   // Regarding the Scratch objects,
+				   // each struct is equipped with a
+				   // constructor that creates an
+				   // FEValues object for a @ref
+				   // FiniteElement "finite element",
+				   // a @ref Quadrature "quadrature formula",
+				   // the @ref Mapping "mapping" that
+				   // describes the
+				   // interpolation of curved
+				   // boundaries, and some @ref
+				   // UpdateFlags "update flags".
+				   // Moreover, we manually implement
+				   // a copy constructor (since the
+				   // FEValues class is not copyable
+				   // by itself), and provide some
+				   // additional vector fields that
+				   // are used to hold intermediate
+				   // data during the computation of
+				   // local contributions.
+				   //
+				   // Let us start with the scratch
+				   // arrays and, specifically, the
+				   // one used for assembly of the
+				   // Stokes preconditioner:
+  namespace Assembly
+  {
+    namespace Scratch
+    {
+      template <int dim>
+      struct StokesPreconditioner
+      {
+	  StokesPreconditioner (const FiniteElement<dim> &stokes_fe,
+				const Quadrature<dim>    &stokes_quadrature,
+				const Mapping<dim>       &mapping,
+				const UpdateFlags         update_flags);
+
+	  StokesPreconditioner (const StokesPreconditioner &data);
+
+
+         FEValues<dim>               stokes_fe_values;
+
+         std::vector<Tensor<2,dim> > grad_phi_u;
+         std::vector<double>         phi_p;
+      };
+
+      template <int dim>
+      StokesPreconditioner<dim>::
+      StokesPreconditioner (const FiniteElement<dim> &stokes_fe,
+			    const Quadrature<dim>    &stokes_quadrature,
+			    const Mapping<dim>       &mapping,
+			    const UpdateFlags         update_flags)
+		      :
+		      stokes_fe_values (mapping, stokes_fe, stokes_quadrature,
+					update_flags),
+                     grad_phi_u (stokes_fe.dofs_per_cell),
+		      phi_p (stokes_fe.dofs_per_cell)
+      {}
+
+
+
+      template <int dim>
+      StokesPreconditioner<dim>::
+      StokesPreconditioner (const StokesPreconditioner &scratch)
+		      :
+		      stokes_fe_values (scratch.stokes_fe_values.get_mapping(),
+					scratch.stokes_fe_values.get_fe(),
+					scratch.stokes_fe_values.get_quadrature(),
+					scratch.stokes_fe_values.get_update_flags()),
+                     grad_phi_u (scratch.grad_phi_u),
+		      phi_p (scratch.phi_p)
+      {}
+
+
+
+				       // The next one is the scratch object
+				       // used for the assembly of the full
+				       // Stokes system. Observe that we
+				       // derive the StokesSystem scratch
+				       // class from the StokesPreconditioner
+				       // class above. We do this because all the
+				       // objects that are necessary for the
+				       // assembly of the preconditioner are
+				       // also needed for the actual matrix
+				       // system and right hand side, plus
+				       // some extra data. This makes the
+				       // program more compact. Note also that
+				       // the assembly of the Stokes system
+				       // and the temperature right hand side
+				       // further down requires data from
+				       // temperature and velocity,
+				       // respectively, so we actually need
+				       // two FEValues objects for those two
+				       // cases.
+      template <int dim>
+      struct StokesSystem : public StokesPreconditioner<dim>
+      {
+	  StokesSystem (const FiniteElement<dim> &stokes_fe,
+			const Mapping<dim>       &mapping,
+			const Quadrature<dim>    &stokes_quadrature,
+			const UpdateFlags         stokes_update_flags,
+			const FiniteElement<dim> &temperature_fe,
+			const UpdateFlags         temperature_update_flags);
+
+	  StokesSystem (const StokesSystem<dim> &data);
+
+
+	  FEValues<dim>                        temperature_fe_values;
+
+	  std::vector<Tensor<1,dim> >          phi_u;
+	  std::vector<SymmetricTensor<2,dim> > grads_phi_u;
+	  std::vector<double>                  div_phi_u;
+
+	  std::vector<double>                  old_temperature_values;
+      };
+
+
+      template <int dim>
+      StokesSystem<dim>::
+      StokesSystem (const FiniteElement<dim> &stokes_fe,
+		    const Mapping<dim>       &mapping,
+		    const Quadrature<dim>    &stokes_quadrature,
+		    const UpdateFlags         stokes_update_flags,
+		    const FiniteElement<dim> &temperature_fe,
+		    const UpdateFlags         temperature_update_flags)
+		      :
+		      StokesPreconditioner<dim> (stokes_fe, stokes_quadrature,
+						 mapping,
+						 stokes_update_flags),
+		      temperature_fe_values (mapping, temperature_fe, stokes_quadrature,
+					     temperature_update_flags),
+		      phi_u (stokes_fe.dofs_per_cell),
+		      grads_phi_u (stokes_fe.dofs_per_cell),
+		      div_phi_u (stokes_fe.dofs_per_cell),
+		      old_temperature_values (stokes_quadrature.size())
+      {}
+
+
+      template <int dim>
+      StokesSystem<dim>::
+      StokesSystem (const StokesSystem<dim> &scratch)
+		      :
+		      StokesPreconditioner<dim> (scratch),
+		      temperature_fe_values (scratch.temperature_fe_values.get_mapping(),
+					     scratch.temperature_fe_values.get_fe(),
+					     scratch.temperature_fe_values.get_quadrature(),
+					     scratch.temperature_fe_values.get_update_flags()),
+		      phi_u (scratch.phi_u),
+		      grads_phi_u (scratch.grads_phi_u),
+		      div_phi_u (scratch.div_phi_u),
+		      old_temperature_values (scratch.old_temperature_values)
+      {}
+
+
+				       // After defining the objects used in
+				       // the assembly of the Stokes system,
+				       // we do the same for the assembly of
+				       // the matrices necessary for the
+				       // temperature system. The general
+				       // structure is very similar:
+      template <int dim>
+      struct TemperatureMatrix
+      {
+	  TemperatureMatrix (const FiniteElement<dim> &temperature_fe,
+			     const Mapping<dim>       &mapping,
+			     const Quadrature<dim>    &temperature_quadrature);
+
+	  TemperatureMatrix (const TemperatureMatrix &data);
+
+
+	  FEValues<dim>               temperature_fe_values;
+
+	  std::vector<double>         phi_T;
+	  std::vector<Tensor<1,dim> > grad_phi_T;
+      };
+
+
+      template <int dim>
+      TemperatureMatrix<dim>::
+      TemperatureMatrix (const FiniteElement<dim> &temperature_fe,
+			 const Mapping<dim>       &mapping,
+			 const Quadrature<dim>    &temperature_quadrature)
+		      :
+		      temperature_fe_values (mapping,
+					     temperature_fe, temperature_quadrature,
+					     update_values    | update_gradients |
+					     update_JxW_values),
+		      phi_T (temperature_fe.dofs_per_cell),
+		      grad_phi_T (temperature_fe.dofs_per_cell)
+      {}
+
+
+      template <int dim>
+      TemperatureMatrix<dim>::
+      TemperatureMatrix (const TemperatureMatrix &scratch)
+		      :
+		      temperature_fe_values (scratch.temperature_fe_values.get_mapping(),
+					     scratch.temperature_fe_values.get_fe(),
+					     scratch.temperature_fe_values.get_quadrature(),
+					     scratch.temperature_fe_values.get_update_flags()),
+		      phi_T (scratch.phi_T),
+		      grad_phi_T (scratch.grad_phi_T)
+      {}
+
+
+				       // The final scratch object is used in
+				       // the assembly of the right hand side
+				       // of the temperature system. This
+				       // object is significantly larger than
+				       // the ones above because a lot more
+				       // quantities enter the computation of
+				       // the right hand side of the
+				       // temperature equation. In particular,
+				       // the temperature values and gradients
+				       // of the previous two time steps need
+				       // to be evaluated at the quadrature
+				       // points, as well as the velocities
+				       // and the strain rates (i.e. the
+				       // symmetric gradients of the velocity)
+				       // that enter the right hand side as
+				       // friction heating terms. Despite the
+				       // number of terms, the following
+				       // should be rather self explanatory:
+      template <int dim>
+      struct TemperatureRHS
+      {
+	  TemperatureRHS (const FiniteElement<dim> &temperature_fe,
+			  const FiniteElement<dim> &stokes_fe,
+			  const Mapping<dim>       &mapping,
+			  const Quadrature<dim>    &quadrature);
+
+	  TemperatureRHS (const TemperatureRHS &data);
+
+
+	  FEValues<dim>                        temperature_fe_values;
+	  FEValues<dim>                        stokes_fe_values;
+
+	  std::vector<double>                  phi_T;
+	  std::vector<Tensor<1,dim> >          grad_phi_T;
+
+	  std::vector<Tensor<1,dim> >          old_velocity_values;
+	  std::vector<Tensor<1,dim> >          old_old_velocity_values;
+
+	  std::vector<SymmetricTensor<2,dim> > old_strain_rates;
+	  std::vector<SymmetricTensor<2,dim> > old_old_strain_rates;
+
+	  std::vector<double>                  old_temperature_values;
+	  std::vector<double>                  old_old_temperature_values;
+	  std::vector<Tensor<1,dim> >          old_temperature_grads;
+	  std::vector<Tensor<1,dim> >          old_old_temperature_grads;
+	  std::vector<double>                  old_temperature_laplacians;
+	  std::vector<double>                  old_old_temperature_laplacians;
+      };
+
+
+      template <int dim>
+      TemperatureRHS<dim>::
+      TemperatureRHS (const FiniteElement<dim> &temperature_fe,
+		      const FiniteElement<dim> &stokes_fe,
+		      const Mapping<dim>       &mapping,
+		      const Quadrature<dim>    &quadrature)
+		      :
+		      temperature_fe_values (mapping,
+					     temperature_fe, quadrature,
+					     update_values    |
+					     update_gradients |
+					     update_hessians  |
+					     update_quadrature_points |
+					     update_JxW_values),
+		      stokes_fe_values (mapping,
+					stokes_fe, quadrature,
+					update_values | update_gradients),
+		      phi_T (temperature_fe.dofs_per_cell),
+		      grad_phi_T (temperature_fe.dofs_per_cell),
+
+		      old_velocity_values (quadrature.size()),
+		      old_old_velocity_values (quadrature.size()),
+		      old_strain_rates (quadrature.size()),
+		      old_old_strain_rates (quadrature.size()),
+
+		      old_temperature_values (quadrature.size()),
+		      old_old_temperature_values(quadrature.size()),
+		      old_temperature_grads(quadrature.size()),
+		      old_old_temperature_grads(quadrature.size()),
+		      old_temperature_laplacians(quadrature.size()),
+		      old_old_temperature_laplacians(quadrature.size())
+      {}
+
+
+      template <int dim>
+      TemperatureRHS<dim>::
+      TemperatureRHS (const TemperatureRHS &scratch)
+		      :
+		      temperature_fe_values (scratch.temperature_fe_values.get_mapping(),
+					     scratch.temperature_fe_values.get_fe(),
+					     scratch.temperature_fe_values.get_quadrature(),
+					     scratch.temperature_fe_values.get_update_flags()),
+		      stokes_fe_values (scratch.stokes_fe_values.get_mapping(),
+					scratch.stokes_fe_values.get_fe(),
+					scratch.stokes_fe_values.get_quadrature(),
+					scratch.stokes_fe_values.get_update_flags()),
+		      phi_T (scratch.phi_T),
+		      grad_phi_T (scratch.grad_phi_T),
+
+		      old_velocity_values (scratch.old_velocity_values),
+		      old_old_velocity_values (scratch.old_old_velocity_values),
+		      old_strain_rates (scratch.old_strain_rates),
+		      old_old_strain_rates (scratch.old_old_strain_rates),
+
+		      old_temperature_values (scratch.old_temperature_values),
+		      old_old_temperature_values (scratch.old_old_temperature_values),
+		      old_temperature_grads (scratch.old_temperature_grads),
+		      old_old_temperature_grads (scratch.old_old_temperature_grads),
+		      old_temperature_laplacians (scratch.old_temperature_laplacians),
+		      old_old_temperature_laplacians (scratch.old_old_temperature_laplacians)
+      {}
+    }
+
+
+				     // The CopyData objects are even
+				     // simpler than the Scratch
+				     // objects as all they have to do
+				     // is to store the results of
+				     // local computations until they
+				     // can be copied into the global
+				     // matrix or vector
+				     // objects. These structures
+				     // therefore only need to provide
+				     // a constructor, a copy
+				     // operation, and some arrays for
+				     // local matrix, local vectors
+				     // and the relation between local
+				     // and global degrees of freedom
+				     // (a.k.a.
+				     // <code>local_dof_indices</code>). Again,
+				     // we have one such structure for
+				     // each of the four operations we
+				     // will parallelize using the
+				     // WorkStream class:
+    namespace CopyData
+    {
+      template <int dim>
+      struct StokesPreconditioner
+      {
+	  StokesPreconditioner (const FiniteElement<dim> &stokes_fe);
+	  StokesPreconditioner (const StokesPreconditioner &data);
+
+	  FullMatrix<double>          local_matrix;
+	  std::vector<unsigned int>   local_dof_indices;
+      };
+
+      template <int dim>
+      StokesPreconditioner<dim>::
+      StokesPreconditioner (const FiniteElement<dim> &stokes_fe)
+		      :
+		      local_matrix (stokes_fe.dofs_per_cell,
+				    stokes_fe.dofs_per_cell),
+		      local_dof_indices (stokes_fe.dofs_per_cell)
+      {}
+
+      template <int dim>
+      StokesPreconditioner<dim>::
+      StokesPreconditioner (const StokesPreconditioner &data)
+		      :
+		      local_matrix (data.local_matrix),
+		      local_dof_indices (data.local_dof_indices)
+      {}
+
+
+
+      template <int dim>
+      struct StokesSystem : public StokesPreconditioner<dim>
+      {
+	  StokesSystem (const FiniteElement<dim> &stokes_fe);
+	  StokesSystem (const StokesSystem<dim> &data);
+
+	  Vector<double> local_rhs;
+      };
+
+      template <int dim>
+      StokesSystem<dim>::
+      StokesSystem (const FiniteElement<dim> &stokes_fe)
+		      :
+		      StokesPreconditioner<dim> (stokes_fe),
+		      local_rhs (stokes_fe.dofs_per_cell)
+      {}
+
+      template <int dim>
+      StokesSystem<dim>::
+      StokesSystem (const StokesSystem<dim> &data)
+		      :
+		      StokesPreconditioner<dim> (data),
+		      local_rhs (data.local_rhs)
+      {}
+
+
+
+      template <int dim>
+      struct TemperatureMatrix
+      {
+	  TemperatureMatrix (const FiniteElement<dim> &temperature_fe);
+	  TemperatureMatrix (const TemperatureMatrix &data);
+
+	  FullMatrix<double>          local_mass_matrix;
+	  FullMatrix<double>          local_stiffness_matrix;
+	  std::vector<unsigned int>   local_dof_indices;
+      };
+
+      template <int dim>
+      TemperatureMatrix<dim>::
+      TemperatureMatrix (const FiniteElement<dim> &temperature_fe)
+		      :
+		      local_mass_matrix (temperature_fe.dofs_per_cell,
+					 temperature_fe.dofs_per_cell),
+		      local_stiffness_matrix (temperature_fe.dofs_per_cell,
+					      temperature_fe.dofs_per_cell),
+		      local_dof_indices (temperature_fe.dofs_per_cell)
+      {}
+
+      template <int dim>
+      TemperatureMatrix<dim>::
+      TemperatureMatrix (const TemperatureMatrix &data)
+		      :
+		      local_mass_matrix (data.local_mass_matrix),
+		      local_stiffness_matrix (data.local_stiffness_matrix),
+		      local_dof_indices (data.local_dof_indices)
+      {}
+
+
+
+      template <int dim>
+      struct TemperatureRHS
+      {
+	  TemperatureRHS (const FiniteElement<dim> &temperature_fe);
+	  TemperatureRHS (const TemperatureRHS &data);
+
+	  Vector<double>              local_rhs;
+	  std::vector<unsigned int>   local_dof_indices;
+	  FullMatrix<double>          matrix_for_bc;
+      };
+
+      template <int dim>
+      TemperatureRHS<dim>::
+      TemperatureRHS (const FiniteElement<dim> &temperature_fe)
+		      :
+		      local_rhs (temperature_fe.dofs_per_cell),
+		      local_dof_indices (temperature_fe.dofs_per_cell),
+		      matrix_for_bc (temperature_fe.dofs_per_cell,
+				     temperature_fe.dofs_per_cell)
+      {}
+
+      template <int dim>
+      TemperatureRHS<dim>::
+      TemperatureRHS (const TemperatureRHS &data)
+		      :
+		      local_rhs (data.local_rhs),
+		      local_dof_indices (data.local_dof_indices),
+		      matrix_for_bc (data.matrix_for_bc)
+      {}
+    }
+  }
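+
+				   // To give a flavor of how the
+				   // Scratch and CopyData structures
+				   // above will be used, here is a
+				   // sketch of a typical WorkStream
+				   // invocation as it appears (modulo
+				   // details such as iterating only
+				   // over locally owned cells) in the
+				   // assembly functions of the
+				   // BoussinesqFlowProblem class below:
+				   //
+				   // @code
+				   // WorkStream::run (stokes_dof_handler.begin_active(),
+				   //                  stokes_dof_handler.end(),
+				   //                  std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+				   //                                   local_assemble_stokes_preconditioner,
+				   //                                   this,
+				   //                                   std_cxx1x::_1, std_cxx1x::_2, std_cxx1x::_3),
+				   //                  std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+				   //                                   copy_local_to_global_stokes_preconditioner,
+				   //                                   this,
+				   //                                   std_cxx1x::_1),
+				   //                  Assembly::Scratch::StokesPreconditioner<dim> (stokes_fe,
+				   //                                                                quadrature_formula,
+				   //                                                                mapping,
+				   //                                                                update_JxW_values |
+				   //                                                                update_values |
+				   //                                                                update_gradients),
+				   //                  Assembly::CopyData::StokesPreconditioner<dim> (stokes_fe));
+				   // @endcode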
+
+
+
+				   // @sect3{The <code>BoussinesqFlowProblem</code> class template}
+				   //
+				   // This is the declaration of the
+				   // main class. It is very similar
+				   // to step-31 but there are a
+				   // number of differences we will
+				   // comment on below.
+				   //
+				   // The top of the class is
+				   // essentially the same as in
+				   // step-31, listing the public
+				   // methods and a set of private
+				   // functions that do the heavy
+				   // lifting. Compared to step-31
+				   // there are only two additions to
+				   // this section: the function
+				   // <code>get_cfl_number()</code>
+				   // that computes the maximum CFL
+				   // number over all cells, from which
+				   // we then compute the global time
+				   // step, and the function
+				   // <code>get_entropy_variation()</code>
+				   // that is used in the computation
+				   // of the entropy stabilization. It
+				   // is akin to the
+				   // <code>get_extrapolated_temperature_range()</code>
+				   // we have used in step-31 for this
+				   // purpose, but works on the
+				   // entropy instead of the
+				   // temperature.
+  template <int dim>
+  class BoussinesqFlowProblem
+  {
+    public:
+      struct Parameters;
+      BoussinesqFlowProblem (Parameters &parameters);
+      void run ();
+
+      int                                       m_myrank;       // for debugging
+
+    private:
+      void setup_dofs ();
+      void assemble_stokes_preconditioner ();
+      void build_stokes_preconditioner ();
+      void assemble_stokes_system ();
+      void assemble_temperature_matrix ();
+      void assemble_temperature_system (const double maximal_velocity);
+      void project_temperature_field ();
+      double get_maximal_velocity () const;
+      double get_cfl_number () const;
+      double get_entropy_variation (const double average_temperature) const;
+      std::pair<double,double> get_extrapolated_temperature_range () const;
+      void solve ();
+      void output_results ();
+      void refine_mesh (const unsigned int max_grid_level);
+
+      double
+      compute_viscosity(const std::vector<double>          &old_temperature,
+			const std::vector<double>          &old_old_temperature,
+			const std::vector<Tensor<1,dim> >  &old_temperature_grads,
+			const std::vector<Tensor<1,dim> >  &old_old_temperature_grads,
+			const std::vector<double>          &old_temperature_laplacians,
+			const std::vector<double>          &old_old_temperature_laplacians,
+			const std::vector<Tensor<1,dim> >  &old_velocity_values,
+			const std::vector<Tensor<1,dim> >  &old_old_velocity_values,
+			const std::vector<SymmetricTensor<2,dim> >  &old_strain_rates,
+			const std::vector<SymmetricTensor<2,dim> >  &old_old_strain_rates,
+			const double                        global_u_infty,
+			const double                        global_T_variation,
+			const double                        average_temperature,
+			const double                        global_entropy_variation,
+			const double                        cell_diameter) const;
+
+    public:
+
+				       // The first significant new
+				       // component is the definition
+				       // of a struct for the
+				       // parameters according to the
+				       // discussion in the
+				       // introduction. This structure
+				       // is initialized by reading
+				       // from a parameter file during
+				       // construction of this object.
+      struct Parameters
+      {
+	  Parameters (const std::string &parameter_filename);
+
+	  static void declare_parameters (ParameterHandler &prm);
+	  void parse_parameters (ParameterHandler &prm);
+
+	  double       end_time;
+
+	  unsigned int initial_global_refinement;
+	  unsigned int initial_adaptive_refinement;
+
+	  bool         generate_graphical_output;
+	  unsigned int graphical_output_interval;
+
+	  unsigned int adaptive_refinement_interval;
+
+	  double       stabilization_alpha;
+	  double       stabilization_c_R;
+	  double       stabilization_beta;
+
+	  unsigned int stokes_velocity_degree;
+	  bool         use_locally_conservative_discretization;
+
+	  unsigned int temperature_degree;
+      };
+
+    private:
+      Parameters                               &parameters;
+
+				       // The <code>pcout</code> (for
+				       // <i>%parallel
+				       // <code>std::cout</code></i>)
+				       // object is used to simplify
+				       // writing output: each MPI
+				       // process can use this to
+				       // generate output as usual,
+				       // but since each of these
+				       // processes will (hopefully)
+				       // produce the same output it
+				       // will just be replicated many
+				       // times over; with the
+				       // ConditionalOStream class,
+				       // only the output generated by
+				       // one MPI process will
+				       // actually be printed to
+				       // screen, whereas the output
+				       // by all the other processes
+				       // will simply be forgotten.
+      ConditionalOStream                        pcout;
+
+				       // The following member
+				       // variables will then again be
+				       // similar to those in step-31
+				       // (and to other tutorial
+				       // programs). As mentioned in
+				       // the introduction, we fully
+				       // distribute computations, so
+				       // we will have to use the
+				       // parallel::distributed::Triangulation
+				       // class (see step-40) but the
+				       // remainder of these variables
+				       // is rather standard with two
+				       // exceptions:
+				       //
+				       // - The <code>mapping</code>
+				       // variable is used to denote a
+				       // higher-order polynomial
+				       // mapping. As mentioned in the
+				       // introduction, we use this
+				       // mapping when forming
+				       // integrals through quadrature
+				       // for all cells that are
+				       // adjacent to either the inner
+				       // or outer boundaries of our
+				       // domain where the boundary is
+				       // curved.
+				       //
+				       // - In a bit of naming
+				       // confusion, you will notice
+				       // below that some of the
+				       // variables from namespace
+				       // PETScWrappers are taken
+				       // from namespace
+				       // PETScWrappers::MPI (such
+				       // as the right hand side
+				       // vectors) whereas others are
+				       // not (such as the various
+				       // matrices). For the matrices,
+				       // we happen to use the same
+				       // class names for %parallel
+				       // and sequential data
+				       // structures, i.e., all
+				       // matrices will actually be
+				       // considered %parallel
+				       // below. On the other hand,
+				       // for vectors, only those from
+				       // namespace
+				       // PETScWrappers::MPI are
+				       // actually distributed. In
+				       // particular, we will
+				       // frequently have to query
+				       // velocities and temperatures
+				       // at arbitrary quadrature
+				       // points; consequently, rather
+				       // than importing ghost
+				       // information of a vector
+				       // whenever we need access to
+				       // degrees of freedom that are
+				       // relevant locally but owned
+				       // by another processor, we
+				       // solve linear systems in
+				       // %parallel but then
+				       // immediately initialize a
+				       // vector including ghost
+				       // entries of the solution for
+				       // further processing. The
+				       // various
+				       // <code>*_solution</code>
+				       // vectors are therefore filled
+				       // immediately after solving
+				       // their respective linear
+				       // system in %parallel and will
+				       // always contain values for
+				       // all @ref
+				       // GlossLocallyRelevantDof
+				       // "locally relevant degrees of freedom";
+				       // the fully
+				       // distributed vectors that we
+				       // obtain from the solution
+				       // process and that only ever
+				       // contain the @ref
+				       // GlossLocallyOwnedDof
+				       // "locally owned degrees of freedom"
+				       // are destroyed
+				       // immediately after the
+				       // solution process and after
+				       // we have copied the relevant
+				       // values into the member
+				       // variable vectors.
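+				       //
+				       // In code, that pattern will look
+				       // roughly like this (the names
+				       // are only illustrative):
+				       //
+				       // @code
+				       // // solve into a vector holding only locally owned entries ...
+				       // PETScWrappers::MPI::Vector
+				       //   distributed_solution (MPI_COMM_WORLD, n_T, local_size);
+				       // solver.solve (temperature_matrix, distributed_solution,
+				       //               temperature_rhs, *T_preconditioner);
+				       //
+				       // // ... and then copy it into the ghosted member vector that
+				       // // also provides the locally relevant (ghost) entries:
+				       // temperature_solution = distributed_solution;
+				       // @endcode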
+      parallel::distributed::Triangulation<dim> triangulation;
+      double                                    global_Omega_diameter;
+
+      const MappingQ<dim>                       mapping;
+
+      const FESystem<dim>                       stokes_fe;
+      DoFHandler<dim>                           stokes_dof_handler;
+      ConstraintMatrix                          stokes_constraints;
+
+      PETScWrappers::MPI::BlockSparseMatrix       stokes_matrix;
+      PETScWrappers::MPI::BlockSparseMatrix       stokes_preconditioner_matrix;
+
+      PETScWrappers::MPI::BlockVector        stokes_solution;
+      PETScWrappers::MPI::BlockVector        old_stokes_solution;
+      PETScWrappers::MPI::BlockVector        stokes_rhs;
+
+
+      FE_Q<dim>                                 temperature_fe;
+      DoFHandler<dim>                           temperature_dof_handler;
+      ConstraintMatrix                          temperature_constraints;
+
+      PETScWrappers::MPI::SparseMatrix       temperature_mass_matrix;
+      PETScWrappers::MPI::SparseMatrix       temperature_stiffness_matrix;
+      PETScWrappers::MPI::SparseMatrix       temperature_matrix;
+
+      PETScWrappers::MPI::Vector             temperature_solution;
+      PETScWrappers::MPI::Vector             old_temperature_solution;
+      PETScWrappers::MPI::Vector             old_old_temperature_solution;
+      PETScWrappers::MPI::Vector             temperature_rhs;
+
+
+      double                                    time_step;
+      double                                    old_time_step;
+      unsigned int                              timestep_number;
+
+      std_cxx1x::shared_ptr<PETScWrappers::PreconditionBoomerAMG>    Amg_preconditioner;
+      std_cxx1x::shared_ptr<PETScWrappers::PreconditionJacobi> Mp_preconditioner;
+      std_cxx1x::shared_ptr<PETScWrappers::PreconditionJacobi> T_preconditioner;
+
+      bool                                      rebuild_stokes_matrix;
+      bool                                      rebuild_stokes_preconditioner;
+      bool                                      rebuild_temperature_matrices;
+      bool                                      rebuild_temperature_preconditioner;
+
+				       // The next member variable,
+				       // <code>computing_timer</code>
+				       // is used to conveniently
+				       // account for compute time
+				       // spent in certain "sections"
+				       // of the code that are
+				       // repeatedly entered. For
+				       // example, we will enter (and
+				       // leave) sections for Stokes
+				       // matrix assembly and would
+				       // like to accumulate the run
+				       // time spent in this section
+				       // over all time steps. Every
+				       // so many time steps as well
+				       // as at the end of the program
+				       // (through the destructor of
+				       // the TimerOutput class) we
+				       // will then produce a nice
+				       // summary of the times spent
+				       // in the different sections
+				       // into which we categorize the
+				       // run-time of this program.
+      TimerOutput                               computing_timer;
+
+				       // After these member variables
+				       // we have a number of
+				       // auxiliary functions that
+				       // have been broken out of the
+				       // ones listed
+				       // above. Specifically, there
+				       // are first three functions
+				       // that we call from
+				       // <code>setup_dofs</code> and
+				       // then the ones that do the
+				       // assembling of linear
+				       // systems:
+      void setup_stokes_matrix (const std::vector<IndexSet> &stokes_partitioning);
+      void setup_stokes_preconditioner (const std::vector<IndexSet> &stokes_partitioning);
+      void setup_temperature_matrices (const IndexSet &temperature_partitioning);
+
+
+				       // Following the @ref
+				       // MTWorkStream
+				       // "task-based parallelization"
+				       // paradigm,
+				       // we split all the assembly
+				       // routines into two parts: a
+				       // first part that can do all
+				       // the calculations on a
+				       // certain cell without taking
+				       // care of other threads, and a
+				       // second part (which is
+				       // writing the local data into
+				       // the global matrices and
+				       // vectors) which can be
+				       // entered by only one thread
+				       // at a time. In order to
+				       // implement that, we provide
+				       // functions for each of those
+				       // two steps for all the four
+				       // assembly routines that we
+				       // use in this program. The
+				       // following eight functions do
+				       // exactly this:
+      void
+      local_assemble_stokes_preconditioner (const typename DoFHandler<dim>::active_cell_iterator &cell,
+					    Assembly::Scratch::StokesPreconditioner<dim> &scratch,
+					    Assembly::CopyData::StokesPreconditioner<dim> &data);
+
+      void
+      copy_local_to_global_stokes_preconditioner (const Assembly::CopyData::StokesPreconditioner<dim> &data);
+
+
+      void
+      local_assemble_stokes_system (const typename DoFHandler<dim>::active_cell_iterator &cell,
+				    Assembly::Scratch::StokesSystem<dim>  &scratch,
+				    Assembly::CopyData::StokesSystem<dim> &data);
+
+      void
+      copy_local_to_global_stokes_system (const Assembly::CopyData::StokesSystem<dim> &data);
+
+
+      void
+      local_assemble_temperature_matrix (const typename DoFHandler<dim>::active_cell_iterator &cell,
+					 Assembly::Scratch::TemperatureMatrix<dim>  &scratch,
+					 Assembly::CopyData::TemperatureMatrix<dim> &data);
+
+      void
+      copy_local_to_global_temperature_matrix (const Assembly::CopyData::TemperatureMatrix<dim> &data);
+
+
+
+      void
+      local_assemble_temperature_rhs (const std::pair<double,double> global_T_range,
+				      const double                   global_max_velocity,
+				      const double                   global_entropy_variation,
+				      const typename DoFHandler<dim>::active_cell_iterator &cell,
+				      Assembly::Scratch::TemperatureRHS<dim> &scratch,
+				      Assembly::CopyData::TemperatureRHS<dim> &data);
+
+      void
+      copy_local_to_global_temperature_rhs (const Assembly::CopyData::TemperatureRHS<dim> &data);
+
+				       // Finally, we forward declare
+				       // a member class that we will
+				       // define later on and that
+				       // will be used to compute a
+				       // number of quantities from
+				       // our solution vectors that
+				       // we'd like to put into the
+				       // output files for
+				       // visualization.
+      class Postprocessor;
+  };
+
+
+				   // @sect3{BoussinesqFlowProblem class implementation}
+
+				   // @sect4{BoussinesqFlowProblem::Parameters}
+				   //
+				   // Here comes the definition of the
+				   // parameters for the Stokes
+				   // problem. We allow the user to set the end
+				   // time for the simulation, the
+				   // level of refinements (both
+				   // global and adaptive, which in
+				   // the sum specify what maximum
+				   // level the cells are allowed to
+				   // have), and the interval between
+				   // refinements in the time
+				   // stepping.
+				   //
+				   // Then, we let the user specify
+				   // constants for the stabilization
+				   // parameters (as discussed in the
+				   // introduction), the polynomial
+				   // degree for the Stokes velocity
+				   // space, whether to use the
+				   // locally conservative
+				   // discretization based on FE_DGP
+				   // elements for the pressure or not
+				   // (FE_Q elements for pressure),
+				   // and the polynomial degree for
+				   // the temperature interpolation.
+				   //
+				   // The constructor checks for a
+				   // valid input file (if not, a file
+				   // with default parameters for the
+				   // quantities is written), and
+				   // eventually parses the
+				   // parameters.
+  template <int dim>
+  BoussinesqFlowProblem<dim>::Parameters::Parameters (const std::string &parameter_filename)
+		  :
+		  end_time (1e8),
+		  initial_global_refinement (2),
+		  initial_adaptive_refinement (2),
+		  adaptive_refinement_interval (10),
+		  stabilization_alpha (2),
+		  stabilization_c_R (0.11),
+		  stabilization_beta (0.078),
+		  stokes_velocity_degree (2),
+		  use_locally_conservative_discretization (true),
+		  temperature_degree (2)
+  {
+    ParameterHandler prm;
+    BoussinesqFlowProblem<dim>::Parameters::declare_parameters (prm);
+
+    std::ifstream parameter_file (parameter_filename.c_str());
+
+    if (!parameter_file)
+      {
+	parameter_file.close ();
+
+	std::ostringstream message;
+	message << "Input parameter file <"
+		<< parameter_filename << "> not found. Creating a"
+		<< std::endl
+		<< "template file of the same name."
+		<< std::endl;
+
+	std::ofstream parameter_out (parameter_filename.c_str());
+	prm.print_parameters (parameter_out,
+			      ParameterHandler::Text);
+
+	AssertThrow (false, ExcMessage (message.str().c_str()));
+      }
+
+    const bool success = prm.read_input (parameter_file);
+    AssertThrow (success, ExcMessage ("Invalid input parameter file."));
+
+    parse_parameters (prm);
+  }
+
+
+
+				   // Next we have a function that
+				   // declares the parameters that we
+				   // expect in the input file,
+				   // together with their data types,
+				   // default values and a
+				   // description:
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::Parameters::
+  declare_parameters (ParameterHandler &prm)
+  {
+    prm.declare_entry ("End time", "1e8",
+		       Patterns::Double (0),
+		       "The end time of the simulation in years.");
+    prm.declare_entry ("Initial global refinement", "2",
+		       Patterns::Integer (0),
+		       "The number of global refinement steps performed on "
+		       "the initial coarse mesh, before the problem is first "
+		       "solved there.");
+    prm.declare_entry ("Initial adaptive refinement", "2",
+		       Patterns::Integer (0),
+		       "The number of adaptive refinement steps performed after "
+		       "initial global refinement.");
+    prm.declare_entry ("Time steps between mesh refinement", "10",
+		       Patterns::Integer (1),
+		       "The number of time steps after which the mesh is to be "
+		       "adapted based on computed error indicators.");
+    prm.declare_entry ("Generate graphical output", "false",
+		       Patterns::Bool (),
+		       "Whether graphical output is to be generated or not. "
+		       "You may not want to get graphical output if the number "
+		       "of processors is large.");
+    prm.declare_entry ("Time steps between graphical output", "50",
+		       Patterns::Integer (1),
+		       "The number of time steps between each generation of "
+		       "graphical output files.");
+
+    prm.enter_subsection ("Stabilization parameters");
+    {
+      prm.declare_entry ("alpha", "2",
+			 Patterns::Double (1, 2),
+			 "The exponent in the entropy viscosity stabilization.");
+      prm.declare_entry ("c_R", "0.11",
+			 Patterns::Double (0),
+			 "The c_R factor in the entropy viscosity "
+			 "stabilization.");
+      prm.declare_entry ("beta", "0.078",
+			 Patterns::Double (0),
+			 "The beta factor in the artificial viscosity "
+			 "stabilization. An appropriate value for 2d is 0.052 "
+			 "and 0.078 for 3d.");
+    }
+    prm.leave_subsection ();
+
+    prm.enter_subsection ("Discretization");
+    {
+      prm.declare_entry ("Stokes velocity polynomial degree", "2",
+			 Patterns::Integer (1),
+			 "The polynomial degree to use for the velocity variables "
+			 "in the Stokes system.");
+      prm.declare_entry ("Temperature polynomial degree", "2",
+			 Patterns::Integer (1),
+			 "The polynomial degree to use for the temperature variable.");
+      prm.declare_entry ("Use locally conservative discretization", "true",
+			 Patterns::Bool (),
+			 "Whether to use a Stokes discretization that is locally "
+			 "conservative at the expense of a larger number of degrees "
+			 "of freedom, or to go with a cheaper discretization "
+			 "that does not locally conserve mass (although it is "
+			 "globally conservative).");
+    }
+    prm.leave_subsection ();
+  }
+
+
+
+				   // And then we need a function that
+				   // reads the contents of the
+				   // ParameterHandler object we get
+				   // by reading the input file and
+				   // puts the results into variables
+				   // that store the values of the
+				   // parameters we have previously
+				   // declared:
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::Parameters::
+  parse_parameters (ParameterHandler &prm)
+  {
+    end_time                    = prm.get_double ("End time");
+    initial_global_refinement   = prm.get_integer ("Initial global refinement");
+    initial_adaptive_refinement = prm.get_integer ("Initial adaptive refinement");
+
+    adaptive_refinement_interval= prm.get_integer ("Time steps between mesh refinement");
+
+    generate_graphical_output   = prm.get_bool ("Generate graphical output");
+    graphical_output_interval   = prm.get_integer ("Time steps between graphical output");
+
+    prm.enter_subsection ("Stabilization parameters");
+    {
+      stabilization_alpha = prm.get_double ("alpha");
+      stabilization_c_R   = prm.get_double ("c_R");
+      stabilization_beta  = prm.get_double ("beta");
+    }
+    prm.leave_subsection ();
+
+    prm.enter_subsection ("Discretization");
+    {
+      stokes_velocity_degree = prm.get_integer ("Stokes velocity polynomial degree");
+      temperature_degree     = prm.get_integer ("Temperature polynomial degree");
+      use_locally_conservative_discretization
+	= prm.get_bool ("Use locally conservative discretization");
+    }
+    prm.leave_subsection ();
+  }
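+
+				   // For reference, an input file
+				   // matching the parameters declared
+				   // above looks roughly as follows
+				   // (this is also, up to ordering and
+				   // the documentation comments, what
+				   // the constructor writes out as a
+				   // template when no input file is
+				   // found):
+				   //
+				   // @code
+				   // set End time                            = 1e8
+				   // set Initial global refinement           = 2
+				   // set Initial adaptive refinement         = 2
+				   // set Time steps between mesh refinement  = 10
+				   // set Generate graphical output           = false
+				   // set Time steps between graphical output = 50
+				   //
+				   // subsection Discretization
+				   //   set Stokes velocity polynomial degree       = 2
+				   //   set Temperature polynomial degree           = 2
+				   //   set Use locally conservative discretization = true
+				   // end
+				   //
+				   // subsection Stabilization parameters
+				   //   set alpha = 2
+				   //   set beta  = 0.078
+				   //   set c_R   = 0.11
+				   // end
+				   // @endcode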
+
+
+
+
+				   // @sect4{BoussinesqFlowProblem::BoussinesqFlowProblem}
+				   //
+				   // The constructor of the problem
+				   // is very similar to the
+				   // constructor in step-31. What is
+				   // different is the %parallel
+				   // communication: PETSc uses a
+				   // message passing interface (MPI)
+				   // for data distribution. When
+				   // entering the
+				   // BoussinesqFlowProblem class, we
+				   // have to decide how the
+				   // parallelization is to be done. We
+				   // choose a rather simple strategy
+				   // and let all processors that are
+				   // running the program work
+				   // together, specified by the
+				   // communicator
+				   // <code>MPI_COMM_WORLD</code>. Next,
+				   // we create the output stream (as
+				   // we already did in step-18) that
+				   // only generates output on the
+				   // first MPI process and is
+				   // completely forgetful on all
+				   // others. The implementation of
+				   // this idea is to check the
+				   // process number: on process
+				   // zero, <code>pcout</code> gets a
+				   // <code>true</code> argument and
+				   // writes to the
+				   // <code>std::cout</code> stream,
+				   // whereas on every other process
+				   // (processor five, say) it gets a
+				   // <code>false</code> argument, so
+				   // the output of that process is
+				   // simply discarded. With the
+				   // exception of
+				   // the mapping object (for which we
+				   // use polynomials of degree 4) all
+				   // but the final member variable
+				   // are exactly the same as in
+				   // step-31.
+				   //
+				   // This final object, the
+				   // TimerOutput object, is then told
+				   // to restrict output to the
+				   // <code>pcout</code> stream
+				   // (processor 0), and then we
+				   // specify that we want to get a
+				   // summary table at the end of the
+				   // program which shows us wallclock
+				   // times (as opposed to CPU
+				   // times). We will manually also
+				   // request intermediate summaries
+				   // every so many time steps in the
+				   // <code>run()</code> function
+				   // below.
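+				   //
+				   // As a small usage sketch (not
+				   // part of the code below): every
+				   // process may write
+				   // @code
+				   //   pcout << "Time step " << timestep_number << std::endl;
+				   // @endcode
+				   // but only the process with rank
+				   // zero actually forwards the text
+				   // to <code>std::cout</code>; on
+				   // all other processes the
+				   // statement has no effect.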
+  template <int dim>
+  BoussinesqFlowProblem<dim>::BoussinesqFlowProblem (Parameters &parameters_)
+		  :
+		  parameters (parameters_),
+		  pcout (std::cout,
+			 (Utilities::MPI::this_mpi_process(MPI_COMM_WORLD)
+			  == 0)),
+
+		  triangulation (MPI_COMM_WORLD,
+				 typename Triangulation<dim>::MeshSmoothing
+				 (Triangulation<dim>::smoothing_on_refinement |
+				  Triangulation<dim>::smoothing_on_coarsening)),
+
+		  mapping (4),
+
+		  stokes_fe (FE_Q<dim>(parameters.stokes_velocity_degree),
+			     dim,
+			     (parameters.use_locally_conservative_discretization
+			      ?
+			      static_cast<const FiniteElement<dim> &>
+			      (FE_DGP<dim>(parameters.stokes_velocity_degree-1))
+			      :
+			      static_cast<const FiniteElement<dim> &>
+			      (FE_Q<dim>(parameters.stokes_velocity_degree-1))),
+			     1),
+
+		  stokes_dof_handler (triangulation),
+
+		  temperature_fe (parameters.temperature_degree),
+		  temperature_dof_handler (triangulation),
+
+		  time_step (0),
+		  old_time_step (0),
+		  timestep_number (0),
+		  rebuild_stokes_matrix (true),
+		  rebuild_stokes_preconditioner (true),
+		  rebuild_temperature_matrices (true),
+		  rebuild_temperature_preconditioner (true),
+
+		  computing_timer (pcout,
+				   TimerOutput::summary,
+				   TimerOutput::wall_times)
+  {}
+
+
+
+				   // @sect4{The BoussinesqFlowProblem helper functions}
+				   // @sect5{BoussinesqFlowProblem::get_maximal_velocity}
+				   // Except for two small details,
+				   // the function to compute the
+				   // global maximum of the velocity
+				   // is the same as in step-31. The
+				   // first detail is actually common
+				   // to all functions that implement
+				   // loops over all cells in the
+				   // triangulation: When operating in
+				   // %parallel, each processor can
+				   // only work on a chunk of cells
+				   // since each processor only has a
+				   // certain part of the entire
+				   // triangulation. This chunk of
+				   // cells that we want to work on is
+				   // identified via a so-called
+				   // <code>subdomain_id</code>, as we
+				   // also did in step-18. All we need
+				   // to change is hence to perform
+				   // the cell-related operations only
+				   // on cells that are owned by the
+				   // current process (as opposed to
+				   // ghost or artificial cells),
+				   // i.e., for which the subdomain
+				   // id equals the id of the
+				   // current process. Since this is a commonly
+				   // used operation, there is a
+				   // shortcut for this operation: we
+				   // can ask whether the cell is
+				   // owned by the current processor
+				   // using
+				   // <code>cell-@>is_locally_owned()</code>.
+				   //
+				   // The second difference is the way
+				   // we calculate the maximum
+				   // value. Before, we could simply
+				   // have a <code>double</code>
+				   // variable that we checked against
+				   // on each quadrature point for
+				   // each cell. Now, we have to be a
+				   // bit more careful since each
+				   // processor only operates on a
+				   // subset of cells. What we do is
+				   // to first let each processor
+				   // calculate the maximum among its
+				   // cells, and then do a global
+				   // communication operation
+				   // <code>Utilities::MPI::max</code>
+				   // that computes the maximum value
+				   // among all the maximum values of
+				   // the individual processors. MPI
+				   // provides such a call, but it's
+				   // even simpler to use the
+				   // respective function in namespace
+				   // Utilities::MPI using the MPI
+				   // communicator object since that
+				   // will do the right thing even if
+				   // we work without MPI and on a
+				   // single machine only. The call to
+				   // <code>Utilities::MPI::max</code>
+				   // needs two arguments, namely the
+				   // local maximum (input) and the
+				   // MPI communicator, which is
+				   // MPI_COMM_WORLD in this example.
+  template <int dim>
+  double BoussinesqFlowProblem<dim>::get_maximal_velocity () const
+  {
+    const QIterated<dim> quadrature_formula (QTrapez<1>(),
+					     parameters.stokes_velocity_degree);
+    const unsigned int n_q_points = quadrature_formula.size();
+
+    FEValues<dim> fe_values (mapping, stokes_fe, quadrature_formula, update_values);
+    std::vector<Tensor<1,dim> > velocity_values(n_q_points);
+
+    const FEValuesExtractors::Vector velocities (0);
+
+    double max_local_velocity = 0;
+
+    typename DoFHandler<dim>::active_cell_iterator
+      cell = stokes_dof_handler.begin_active(),
+      endc = stokes_dof_handler.end();
+    for (; cell!=endc; ++cell)
+      if (cell->is_locally_owned())
+	{
+	  fe_values.reinit (cell);
+	  fe_values[velocities].get_function_values (stokes_solution,
+						     velocity_values);
+
+	  for (unsigned int q=0; q<n_q_points; ++q)
+	    max_local_velocity = std::max (max_local_velocity,
+					   velocity_values[q].norm());
+	}
+
+    return Utilities::MPI::max (max_local_velocity, MPI_COMM_WORLD);
+  }
+
+
+				   // @sect5{BoussinesqFlowProblem::get_cfl_number}
+				   // The next function does something
+				   // similar, but we now compute the
+				   // CFL number, i.e., maximal
+				   // velocity on a cell divided by
+				   // the cell diameter. This number
+				   // is necessary to determine the
+				   // time step size, as we use a
+				   // semi-explicit time stepping
+				   // scheme for the temperature
+				   // equation (see step-31 for a
+				   // discussion). We compute it in
+				   // the same way as above: Compute
+				   // the local maximum over all
+				   // locally owned cells, then
+				   // exchange it via MPI to find the
+				   // global maximum.
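+				   //
+				   // Restated as a formula, the loop
+				   // below computes
+				   // @f[
+				   //   \mathrm{CFL} = \max_K
+				   //   \frac{\|\mathbf u\|_{\infty,K}}{h_K},
+				   // @f]
+				   // where $h_K$ is the diameter of
+				   // cell $K$, the velocity maximum
+				   // is taken over the quadrature
+				   // points of $K$, and the outer
+				   // maximum runs over the locally
+				   // owned cells before the final
+				   // MPI reduction.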
+  template <int dim>
+  double BoussinesqFlowProblem<dim>::get_cfl_number () const
+  {
+    const QIterated<dim> quadrature_formula (QTrapez<1>(),
+					     parameters.stokes_velocity_degree);
+    const unsigned int n_q_points = quadrature_formula.size();
+
+    FEValues<dim> fe_values (mapping, stokes_fe, quadrature_formula, update_values);
+    std::vector<Tensor<1,dim> > velocity_values(n_q_points);
+
+    const FEValuesExtractors::Vector velocities (0);
+
+    double max_local_cfl = 0;
+
+    typename DoFHandler<dim>::active_cell_iterator
+      cell = stokes_dof_handler.begin_active(),
+      endc = stokes_dof_handler.end();
+    for (; cell!=endc; ++cell)
+      if (cell->is_locally_owned())
+	{
+	  fe_values.reinit (cell);
+	  fe_values[velocities].get_function_values (stokes_solution,
+						     velocity_values);
+
+	  double max_local_velocity = 1e-10;
+	  for (unsigned int q=0; q<n_q_points; ++q)
+	    max_local_velocity = std::max (max_local_velocity,
+					   velocity_values[q].norm());
+	  max_local_cfl = std::max(max_local_cfl,
+				   max_local_velocity / cell->diameter());
+	}
+
+    return Utilities::MPI::max (max_local_cfl, MPI_COMM_WORLD);
+  }
+
+
+				   // @sect5{BoussinesqFlowProblem::get_entropy_variation}
+				   // Next comes the computation of
+				   // the global entropy variation
+				   // $\|E(T)-\bar{E}(T)\|_\infty$
+				   // where the entropy $E$ is defined
+				   // as discussed in the
+				   // introduction.  This is needed for
+				   // the evaluation of the
+				   // stabilization in the temperature
+				   // equation as explained in the
+				   // introduction. The entropy
+				   // variation is actually only
+				   // needed if we use $\alpha=2$ as a
+				   // power in the residual
+				   // computation. The infinity norm
+				   // is computed by the maxima over
+				   // quadrature points, as usual in
+				   // discrete computations.
+				   //
+				   // In order to compute this quantity, we
+				   // first have to find the space-average
+				   // $\bar{E}(T)$ and then evaluate the
+				   // maximum. However, that means that we
+				   // would need to perform two loops. We can
+				   // avoid the overhead by noting that
+				   // $\|E(T)-\bar{E}(T)\|_\infty =
+				   // \max\big(E_{\textrm{max}}(T)-\bar{E}(T),
+				   // \bar{E}(T)-E_{\textrm{min}}(T)\big)$, i.e., the
+				   // maximum out of the deviation from the
+				   // average entropy in positive and negative
+				   // directions. The four quantities we need
+				   // for the latter formula (maximum entropy,
+				   // minimum entropy, average entropy, area)
+				   // can all be evaluated in the same loop
+				   // over all cells, so we choose this
+				   // simpler variant.
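+				   //
+				   // Concretely, in the loop below
+				   // the entropy at each quadrature
+				   // point is evaluated as
+				   // @f[
+				   //   E(T) = (T-\bar T)^2,
+				   // @f]
+				   // where $\bar T$ is the average
+				   // temperature passed into this
+				   // function and $T$ is the mean of
+				   // the temperature values of the
+				   // two previous time steps.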
+  template <int dim>
+  double
+  BoussinesqFlowProblem<dim>::get_entropy_variation (const double average_temperature) const
+  {
+    if (parameters.stabilization_alpha != 2)
+      return 1.;
+
+    const QGauss<dim> quadrature_formula (parameters.temperature_degree+1);
+    const unsigned int n_q_points = quadrature_formula.size();
+
+    FEValues<dim> fe_values (temperature_fe, quadrature_formula,
+			     update_values | update_JxW_values);
+    std::vector<double> old_temperature_values(n_q_points);
+    std::vector<double> old_old_temperature_values(n_q_points);
+
+				     // In the two functions above we
+				     // computed the maximum of
+				     // numbers that were all
+				     // non-negative, so we knew that
+				     // zero was certainly a lower
+				     // bound. On the other hand, here
+				     // we need to find the maximum
+				     // deviation from the average
+				     // value, i.e., we will need to
+				     // know the maximal and minimal
+				     // values of the entropy for
+				     // which we don't a priori know
+				     // the sign.
+				     //
+				     // To compute it, we can
+				     // therefore start with the
+				     // largest and smallest possible
+				     // values we can store in a
+				     // double precision number: The
+				     // minimum is initialized with a
+				     // bigger and the maximum with a
+				     // smaller number than any one
+				     // that is going to appear. We
+				     // are then guaranteed that these
+				     // numbers will be overwritten in
+				     // the loop on the first cell or,
+				     // if this processor does not own
+				     // any cells, in the
+				     // communication step at the
+				     // latest. The following loop
+				     // then computes the minimum and
+				     // maximum local entropy as well
+				     // as keeps track of the
+				     // area/volume of the part of the
+				     // domain we locally own and the
+				     // integral over the entropy on
+				     // it:
+    double min_entropy = std::numeric_limits<double>::max(),
+	   max_entropy = -std::numeric_limits<double>::max(),
+		  area = 0,
+    entropy_integrated = 0;
+
+    typename DoFHandler<dim>::active_cell_iterator
+      cell = temperature_dof_handler.begin_active(),
+      endc = temperature_dof_handler.end();
+    for (; cell!=endc; ++cell)
+      if (cell->is_locally_owned())
+	{
+	  fe_values.reinit (cell);
+	  fe_values.get_function_values (old_temperature_solution,
+					 old_temperature_values);
+	  fe_values.get_function_values (old_old_temperature_solution,
+					 old_old_temperature_values);
+	  for (unsigned int q=0; q<n_q_points; ++q)
+	    {
+	      const double T = (old_temperature_values[q] +
+				old_old_temperature_values[q]) / 2;
+	      const double entropy = ((T-average_temperature) *
+				      (T-average_temperature));
+
+	      min_entropy = std::min (min_entropy, entropy);
+	      max_entropy = std::max (max_entropy, entropy);
+	      area += fe_values.JxW(q);
+	      entropy_integrated += fe_values.JxW(q) * entropy;
+	    }
+	}
+
+				     // Now we only need to exchange
+				     // data between processors: we
+				     // need to sum the two integrals
+				     // (<code>area</code>,
+				     // <code>entropy_integrated</code>),
+				     // and get the extrema for
+				     // maximum and minimum. We could
+				     // do this through four different
+				     // data exchanges, but we can do it
+				     // with two: Utilities::MPI::sum
+				     // also exists in a variant that
+				     // takes an array of values that
+				     // are all to be summed up. And
+				     // we can also utilize the
+				     // Utilities::MPI::max function
+				     // by realizing that forming the
+				     // minimum over the minimal
+				     // entropies equals forming the
+				     // negative of the maximum over
+				     // the negative of the minimal
+				     // entropies; this maximum can
+				     // then be combined with forming
+				     // the maximum over the maximal
+				     // entropies.
+    const double local_sums[2]   = { entropy_integrated, area },
+		 local_maxima[2] = { -min_entropy, max_entropy };
+    double global_sums[2], global_maxima[2];
+
+    Utilities::MPI::sum (local_sums,   MPI_COMM_WORLD, global_sums);
+    Utilities::MPI::max (local_maxima, MPI_COMM_WORLD, global_maxima);
+
+				     // Having computed everything
+				     // this way, we can then compute
+				     // the average entropy and find
+				     // the $L^\infty$ norm by taking
+				     // the larger of the deviation of
+				     // the maximum or minimum from
+				     // the average:
+    const double average_entropy = global_sums[0] / global_sums[1];
+    const double entropy_diff = std::max(global_maxima[1] - average_entropy,
+					 average_entropy - (-global_maxima[0]));
+    return entropy_diff;
+  }
+
+
+
+				   // @sect5{BoussinesqFlowProblem::get_extrapolated_temperature_range}
+				   // The next function computes the
+				   // minimal and maximal value of the
+				   // extrapolated temperature over
+				   // the entire domain. Again, this
+				   // is only a slightly modified
+				   // version of the respective
+				   // function in step-31. As in the
+				   // function above, we collect local
+				   // minima and maxima and then
+				   // compute the global extrema using
+				   // the same trick as above.
+				   //
+				   // As already discussed in step-31, the
+				   // function needs to distinguish between
+				   // the first and all following time steps
+				   // because it uses a higher order
+				   // temperature extrapolation scheme when at
+				   // least two previous time steps are
+				   // available.
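+				   //
+				   // Written out, with $k_n$ and
+				   // $k_{n-1}$ denoting the current
+				   // and previous time step sizes,
+				   // the extrapolated temperature
+				   // used in the code below is
+				   // @f[
+				   //   T^{\ast} =
+				   //   \left(1+\frac{k_n}{k_{n-1}}\right) T^{n-1}
+				   //   - \frac{k_n}{k_{n-1}}\, T^{n-2},
+				   // @f]
+				   // whereas in the very first time
+				   // step we simply take
+				   // $T^{\ast} = T^{n-1}$.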
+  template <int dim>
+  std::pair<double,double>
+  BoussinesqFlowProblem<dim>::get_extrapolated_temperature_range () const
+  {
+    const QIterated<dim> quadrature_formula (QTrapez<1>(),
+					     parameters.temperature_degree);
+    const unsigned int n_q_points = quadrature_formula.size();
+
+    FEValues<dim> fe_values (mapping, temperature_fe, quadrature_formula,
+			     update_values);
+    std::vector<double> old_temperature_values(n_q_points);
+    std::vector<double> old_old_temperature_values(n_q_points);
+
+    double min_local_temperature = std::numeric_limits<double>::max(),
+	   max_local_temperature = -std::numeric_limits<double>::max();
+
+    if (timestep_number != 0)
+      {
+	typename DoFHandler<dim>::active_cell_iterator
+	  cell = temperature_dof_handler.begin_active(),
+	  endc = temperature_dof_handler.end();
+	for (; cell!=endc; ++cell)
+	  if (cell->is_locally_owned())
+	    {
+	      fe_values.reinit (cell);
+	      fe_values.get_function_values (old_temperature_solution,
+					     old_temperature_values);
+	      fe_values.get_function_values (old_old_temperature_solution,
+					     old_old_temperature_values);
+
+	      for (unsigned int q=0; q<n_q_points; ++q)
+		{
+		  const double temperature =
+		    (1. + time_step/old_time_step) * old_temperature_values[q]-
+		    time_step/old_time_step * old_old_temperature_values[q];
+
+		  min_local_temperature = std::min (min_local_temperature,
+						    temperature);
+		  max_local_temperature = std::max (max_local_temperature,
+						    temperature);
+		}
+	    }
+      }
+    else
+      {
+	typename DoFHandler<dim>::active_cell_iterator
+	  cell = temperature_dof_handler.begin_active(),
+	  endc = temperature_dof_handler.end();
+	for (; cell!=endc; ++cell)
+	  if (cell->is_locally_owned())
+	    {
+	      fe_values.reinit (cell);
+	      fe_values.get_function_values (old_temperature_solution,
+					     old_temperature_values);
+
+	      for (unsigned int q=0; q<n_q_points; ++q)
+		{
+		  const double temperature = old_temperature_values[q];
+
+		  min_local_temperature = std::min (min_local_temperature,
+						    temperature);
+		  max_local_temperature = std::max (max_local_temperature,
+						    temperature);
+		}
+	    }
+      }
+
+    double local_extrema[2] = { -min_local_temperature,
+				max_local_temperature };
+    double global_extrema[2];
+    Utilities::MPI::max (local_extrema, MPI_COMM_WORLD, global_extrema);
+
+    return std::make_pair(-global_extrema[0], global_extrema[1]);
+  }
+
+
+				   // @sect5{BoussinesqFlowProblem::compute_viscosity}
+				   // The function that calculates the
+				   // viscosity is purely local and so needs
+				   // no communication at all. It is mostly
+				   // the same as in step-31 but with an
+				   // updated formulation of the viscosity if
+				   // $\alpha=2$ is chosen:
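+				   //
+				   // Schematically, restating the
+				   // code below: on a cell $K$ with
+				   // diameter $h_K$ the function
+				   // returns, for $\alpha=2$,
+				   // @f[
+				   //   \nu|_K = \min\left(
+				   //   \beta\, \|\mathbf u\|_{\infty,K}\, h_K,\;
+				   //   c_R\, h_K^2\,
+				   //   \frac{\|R_2(T)\|_{\infty,K}}{\|E(T)-\bar E(T)\|_\infty}
+				   //   \right),
+				   // @f]
+				   // where the residual $R_2(T)$ is
+				   // the usual temperature equation
+				   // residual additionally weighted
+				   // by $|T-\bar T|$. For $\alpha=1$
+				   // the second argument of the
+				   // minimum is instead the step-31
+				   // expression based on the global
+				   // velocity and temperature
+				   // variation, and in the very
+				   // first time step only the first
+				   // term is used.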
+  template <int dim>
+  double
+  BoussinesqFlowProblem<dim>::
+  compute_viscosity (const std::vector<double>          &old_temperature,
+		     const std::vector<double>          &old_old_temperature,
+		     const std::vector<Tensor<1,dim> >  &old_temperature_grads,
+		     const std::vector<Tensor<1,dim> >  &old_old_temperature_grads,
+		     const std::vector<double>          &old_temperature_laplacians,
+		     const std::vector<double>          &old_old_temperature_laplacians,
+		     const std::vector<Tensor<1,dim> >  &old_velocity_values,
+		     const std::vector<Tensor<1,dim> >  &old_old_velocity_values,
+		     const std::vector<SymmetricTensor<2,dim> >  &old_strain_rates,
+		     const std::vector<SymmetricTensor<2,dim> >  &old_old_strain_rates,
+		     const double                        global_u_infty,
+		     const double                        global_T_variation,
+		     const double                        average_temperature,
+		     const double                        global_entropy_variation,
+		     const double                        cell_diameter) const
+  {
+    if (global_u_infty == 0)
+      return 5e-3 * cell_diameter;
+
+    const unsigned int n_q_points = old_temperature.size();
+
+    double max_residual = 0;
+    double max_velocity = 0;
+
+    for (unsigned int q=0; q < n_q_points; ++q)
+      {
+	const Tensor<1,dim> u = (old_velocity_values[q] +
+				 old_old_velocity_values[q]) / 2;
+
+	const SymmetricTensor<2,dim> strain_rate = (old_strain_rates[q] +
+						    old_old_strain_rates[q]) / 2;
+
+	const double T = (old_temperature[q] + old_old_temperature[q]) / 2;
+	const double dT_dt = (old_temperature[q] - old_old_temperature[q])
+			     / old_time_step;
+	const double u_grad_T = u * (old_temperature_grads[q] +
+				     old_old_temperature_grads[q]) / 2;
+
+	const double kappa_Delta_T = EquationData::kappa
+				     * (old_temperature_laplacians[q] +
+					old_old_temperature_laplacians[q]) / 2;
+	const double gamma
+	  = ((EquationData::radiogenic_heating * EquationData::density(T)
+	      +
+	      2 * EquationData::eta * strain_rate * strain_rate) /
+	     (EquationData::density(T) * EquationData::specific_heat));
+
+	double residual
+	  = std::abs(dT_dt + u_grad_T - kappa_Delta_T - gamma);
+	if (parameters.stabilization_alpha == 2)
+	  residual *= std::abs(T - average_temperature);
+
+	max_residual = std::max (residual,        max_residual);
+	max_velocity = std::max (std::sqrt (u*u), max_velocity);
+      }
+
+    const double max_viscosity = (parameters.stabilization_beta *
+				  max_velocity * cell_diameter);
+    if (timestep_number == 0)
+      return max_viscosity;
+    else
+      {
+	Assert (old_time_step > 0, ExcInternalError());
+
+	double entropy_viscosity;
+	if (parameters.stabilization_alpha == 2)
+	  entropy_viscosity = (parameters.stabilization_c_R *
+			       cell_diameter * cell_diameter *
+			       max_residual /
+			       global_entropy_variation);
+	else
+	  entropy_viscosity = (parameters.stabilization_c_R *
+			       cell_diameter * global_Omega_diameter *
+			       max_velocity * max_residual /
+			       (global_u_infty * global_T_variation));
+
+	return std::min (max_viscosity, entropy_viscosity);
+      }
+  }
+
+
+
+  				   // @sect5{BoussinesqFlowProblem::project_temperature_field}
+
+				   // This function is new compared to
+				   // step-31. What it does is to re-implement
+				   // the library function
+				   // <code>VectorTools::project()</code> for
+				   // an MPI-based parallelization, a function
+				   // we used for generating an initial vector
+				   // for temperature based on some initial
+				   // function. The library function only
+				   // works with shared memory but doesn't
+				   // know how to utilize multiple machines
+				   // coupled through MPI to compute the
+				   // projected field. The details of a
+				   // <code>project()</code> function are not
+				   // very difficult. All we do is to use a
+				   // mass matrix and put the evaluation of
+				   // the initial value function on the right
+				   // hand side. The mass matrix for
+				   // temperature we can simply generate using
+				   // the respective assembly function, so all
+				   // we need to do here is to create the
+				   // right hand side and do a CG solve. The
+				   // assembly function does a loop over all
+				   // cells and evaluates the function in the
+				   // <code>EquationData</code> namespace, and
+				   // does this only on cells owned by the
+				   // respective processor. The implementation
+				   // of this assembly differs from the
+				   // assembly we do for the principal
+				   // assembly functions further down (which
+				   // include thread-based parallelization
+				   // with the WorkStream concept). Here we
+				   // chose to keep things simple (keeping in
+				   // mind that this function is also only
+				   // called once at the beginning of the
+				   // program, not in every time step), and
+				   // generating the right hand side is cheap
+				   // anyway so we won't even notice that this
+				   // part is not parallelized by threads.
+				   //
+				   // Regarding the implementation of
+				   // inhomogeneous Dirichlet boundary
+				   // conditions: Since we use the temperature
+				   // ConstraintMatrix, we could apply the
+				   // boundary conditions directly when
+				   // building the respective matrix and right
+				   // hand side. In this case, the boundary
+				   // conditions are inhomogeneous, which
+				   // makes this procedure somewhat tricky
+				   // since we get the matrix from some other
+				   // function that uses its own integration
+				   // and assembly loop. However, the correct
+				   // imposition of boundary conditions needs
+				   // the matrix data we work on plus the
+				   // right hand side simultaneously, since
+				   // the right hand side is created by
+				   // Gaussian elimination on the matrix
+				   // rows. In order to not introduce the
+				   // matrix assembly at this place, while
+				   // still having the matrix data available, we
+				   // choose to create a dummy matrix
+				   // <code>matrix_for_bc</code> that we only
+				   // fill with data when we need it for
+				   // imposing boundary conditions. These
+				   // positions are exactly those where we
+				   // have an inhomogeneous entry in the
+				   // ConstraintMatrix. There are only a few
+				   // such positions (on the boundary DoFs),
+				   // so it is still much cheaper to use this
+				   // function than to create the full matrix
+				   // here. To implement this, we ask the
+				   // constraint matrix whether the DoF under
+				   // consideration is inhomogeneously
+				   // constrained. In that case, we generate
+				   // the respective matrix column that we
+				   // need for creating the correct right hand
+				   // side. Note that this (manually
+				   // generated) matrix entry needs to be
+				   // exactly the entry that we would fill the
+				   // matrix with &mdash; otherwise, this will
+				   // not work.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::project_temperature_field ()
+  {
+    assemble_temperature_matrix ();
+
+    QGauss<dim> quadrature(parameters.temperature_degree+2);
+    UpdateFlags update_flags = UpdateFlags(update_values   |
+					   update_quadrature_points |
+					   update_JxW_values);
+    FEValues<dim> fe_values (mapping, temperature_fe, quadrature, update_flags);
+
+    const unsigned int dofs_per_cell = fe_values.dofs_per_cell,
+		       n_q_points    = fe_values.n_quadrature_points;
+
+    std::vector<unsigned int> local_dof_indices (dofs_per_cell);
+    Vector<double> cell_vector (dofs_per_cell);
+    FullMatrix<double> matrix_for_bc (dofs_per_cell, dofs_per_cell);
+
+    std::vector<double> rhs_values(n_q_points);
+
+    PETScWrappers::MPI::Vector
+      rhs (MPI_COMM_WORLD,temperature_mass_matrix.m(),temperature_mass_matrix.local_size()), //rhs (temperature_mass_matrix.row_partitioner()),
+      solution (MPI_COMM_WORLD,temperature_mass_matrix.m(),temperature_mass_matrix.local_size()); //solution (temperature_mass_matrix.row_partitioner());
+
+    const EquationData::TemperatureInitialValues<dim> initial_temperature;
+
+    typename DoFHandler<dim>::active_cell_iterator
+      cell = temperature_dof_handler.begin_active(),
+      endc = temperature_dof_handler.end();
+
+    for (; cell!=endc; ++cell)
+      if (cell->is_locally_owned())
+	{
+	  cell->get_dof_indices (local_dof_indices);
+	  fe_values.reinit (cell);
+
+	  initial_temperature.value_list (fe_values.get_quadrature_points(),
+					  rhs_values);
+
+	  cell_vector = 0;
+	  matrix_for_bc = 0;
+	  for (unsigned int point=0; point<n_q_points; ++point)
+	    for (unsigned int i=0; i<dofs_per_cell; ++i)
+	      {
+		cell_vector(i) += rhs_values[point] *
+				  fe_values.shape_value(i,point) *
+				  fe_values.JxW(point);
+		if (temperature_constraints.is_inhomogeneously_constrained(local_dof_indices[i]))
+		  {
+		    for (unsigned int j=0; j<dofs_per_cell; ++j)
+		      matrix_for_bc(j,i) += fe_values.shape_value(i,point) *
+					    fe_values.shape_value(j,point) *
+					    fe_values.JxW(point);
+		  }
+	      }
+
+	  temperature_constraints.distribute_local_to_global (cell_vector,
+							      local_dof_indices,
+							      rhs,
+							      matrix_for_bc);
+	}
+
+    rhs.compress (); //rhs.compress (Add);
+    solution.compress();
+//    return;
+
+				     // Now that we have the right linear
+				     // system, we solve it using the CG
+				     // method with a simple Jacobi
+				     // preconditioner:
+    SolverControl solver_control(5*rhs.size(), 1e-12*rhs.l2_norm());
+    SolverCG<PETScWrappers::MPI::Vector> cg(solver_control);
+
+    PETScWrappers::PreconditionJacobi preconditioner_mass;
+    preconditioner_mass.initialize(temperature_mass_matrix);
+
+    cg.solve (temperature_mass_matrix, solution, rhs, preconditioner_mass);
+
+    temperature_constraints.distribute (solution);
+
+				     // Having so computed the current
+				     // temperature field, let us set the
+				     // member variable that holds the
+				     // temperature nodes. Strictly speaking,
+				     // we really only need to set
+				     // <code>old_temperature_solution</code>
+				     // since the first thing we will do is to
+				     // compute the Stokes solution that only
+				     // requires the previous time step's
+				     // temperature field. That said, nothing
+				     // good can come from not initializing
+				     // the other vectors as well (especially
+				     // since it's a relatively cheap
+				     // operation and we only have to do it
+				     // once at the beginning of the program)
+				     // if we ever want to extend our
+				     // numerical method or physical model,
+				     // and so we initialize
+				     // <code>temperature_solution</code> and
+				     // <code>old_old_temperature_solution</code>
+				     // as well. As a sidenote, while the
+				     // <code>solution</code> vector is
+				     // strictly distributed (i.e. each
+				     // processor only stores a mutually
+				     // exclusive subset of elements), the
+				     // assignment makes sure that the vectors
+				     // on the left hand side (which were
+				     // initialized to contain ghost elements
+				     // as well) also get the correct ghost
+				     // elements. In other words, the
+				     // assignment here requires communication
+				     // between processors:
+    CIG::reduce_accuracy(solution);
+
+    temperature_solution = solution;
+    old_temperature_solution = solution;
+    old_old_temperature_solution = solution;
+    temperature_solution.update_ghost_values();
+    old_temperature_solution.update_ghost_values();
+    old_old_temperature_solution.update_ghost_values();
+  }
+
+
+
+
+				   // @sect4{The BoussinesqFlowProblem setup functions}
+
+				   // The following three functions set up the
+				   // Stokes matrix, the matrix used for the
+				   // Stokes preconditioner, and the
+				   // temperature matrix. The code is mostly
+				   // the same as in step-31, but it has been
+				   // broken out into three functions of their
+				   // own for simplicity.
+				   //
+				   // The main functional difference between
+				   // the code here and that in step-31 is
+				   // that the matrices we want to set up are
+				   // distributed across multiple
+				   // processors. Since we still want to build
+				   // up the sparsity pattern first for
+				   // efficiency reasons, we could continue to
+				   // build the <i>entire</i> sparsity pattern
+				   // as a
+				   // BlockCompressedSimpleSparsityPattern, as
+				   // we did in step-31. However, that would
+				   // be inefficient: every processor would
+				   // build the same sparsity pattern, but
+				   // only initialize a small part of the
+				   // matrix using it. It also violates the
+				   // principle that every processor should
+				   // only work on those cells it owns (and,
+				   // if necessary the layer of ghost cells
+				   // around it).
+				   //
+				   // Rather, we use an object of type
+				   // PETScWrappers::BlockSparsityPattern,
+				   // which is (obviously) a wrapper around a
+				   // sparsity pattern object provided by
+				   // PETSc. The advantage is that this
+				   // sparsity pattern class can
+				   // communicate across multiple processors:
+				   // if this processor fills in all the
+				   // nonzero entries that result from the
+				   // cells it owns, and every other processor
+				   // does so as well, then at the end after
+				   // some MPI communication initiated by the
+				   // <code>compress()</code> call, we will
+				   // have the globally assembled sparsity
+				   // pattern available with which the global
+				   // matrix can be initialized.
+				   //
+				   // The only other change we need to make is
+				   // to tell the
+				   // DoFTools::make_sparsity_pattern() function
+				   // that it is only supposed to work on a
+				   // subset of cells, namely the ones whose
+				   // <code>subdomain_id</code> equals the
+				   // number of the current processor, and to
+				   // ignore all other cells.
+				   //
+				   // This strategy is replicated across all
+				   // three of the following functions.
+				   //
+				   // Note that the distributed matrices store the
+				   // information contained in the sparsity
+				   // patterns, so we can safely release the
+				   // <code>sp</code> variable once the matrix
+				   // has been given the sparsity structure.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::
+  setup_stokes_matrix (const std::vector<IndexSet> &stokes_partitioning)
+  {
+	  assert(false);
+
+    stokes_matrix.clear ();
+
+    TrilinosWrappers::BlockSparsityPattern sp (stokes_partitioning,
+					       MPI_COMM_WORLD);
+
+    Table<2,DoFTools::Coupling> coupling (dim+1, dim+1);
+
+    for (unsigned int c=0; c<dim+1; ++c)
+      for (unsigned int d=0; d<dim+1; ++d)
+	if (! ((c==dim) && (d==dim)))
+	  coupling[c][d] = DoFTools::always;
+	else
+	  coupling[c][d] = DoFTools::none;
+
+    DoFTools::make_sparsity_pattern (stokes_dof_handler,
+				     coupling, sp,
+				     stokes_constraints, false,
+				     Utilities::MPI::
+				     this_mpi_process(MPI_COMM_WORLD));
+    sp.compress();
+
+//    stokes_matrix.reinit (sp);
+//    stokes_matrix.reinit(sp.n_block_rows(), sp.n_block_cols());
+//    for (unsigned int r=0; r<sp.n_block_rows(); ++r)
+//    	for (unsigned int c=0; c<sp.n_block_cols(); ++c)
+//    	{
+//    		stokes_matrix.block(r,s).reinit((block_sparsity_pattern.block(r,c));
+//    	}
+  }
+
+
+
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::
+  setup_stokes_preconditioner (const std::vector<IndexSet> &stokes_partitioning)
+  {
+	  assert(false);
+
+    Amg_preconditioner.reset ();
+    Mp_preconditioner.reset ();
+
+    stokes_preconditioner_matrix.clear ();
+
+    TrilinosWrappers::BlockSparsityPattern sp (stokes_partitioning,
+					       MPI_COMM_WORLD);
+
+    Table<2,DoFTools::Coupling> coupling (dim+1, dim+1);
+    for (unsigned int c=0; c<dim+1; ++c)
+      for (unsigned int d=0; d<dim+1; ++d)
+	if (c == d)
+	  coupling[c][d] = DoFTools::always;
+	else
+	  coupling[c][d] = DoFTools::none;
+
+    DoFTools::make_sparsity_pattern (stokes_dof_handler,
+				     coupling, sp,
+				     stokes_constraints, false,
+				     Utilities::MPI::
+				     this_mpi_process(MPI_COMM_WORLD));
+    sp.compress();
+
+//    stokes_preconditioner_matrix.reinit (sp);
+  }
+
+
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::
+  setup_temperature_matrices (const IndexSet &temperature_partitioner)
+  {
+    T_preconditioner.reset ();
+    temperature_mass_matrix.clear ();
+    temperature_stiffness_matrix.clear ();
+    temperature_matrix.clear ();
+
+//    PETScWrappers::SparsityPattern sp (temperature_partitioner,
+//					  MPI_COMM_WORLD);
+//    DoFTools::make_sparsity_pattern (temperature_dof_handler, sp,
+//				     temperature_constraints, false,
+//				     Utilities::MPI::
+//				     this_mpi_process(MPI_COMM_WORLD));
+//    sp.compress();
+    int my_rank = Utilities::MPI::this_mpi_process(MPI_COMM_WORLD);
+    const unsigned int n_local_dofs = temperature_dof_handler.n_locally_owned_dofs();
+    const unsigned int n_dofs = temperature_dof_handler.n_dofs();
+//       = DoFTools::count_dofs_with_subdomain_association (temperature_dof_handler,
+//       		Utilities::MPI::this_mpi_process(MPI_COMM_WORLD));
+
+//    temperature_matrix.reinit (sp);
+//    temperature_mass_matrix.reinit (sp);
+//    temperature_stiffness_matrix.reinit (sp);
+    temperature_matrix.reinit (MPI_COMM_WORLD,
+    		temperature_dof_handler.n_dofs(),
+    		temperature_dof_handler.n_dofs(),
+    		n_local_dofs,
+    		n_local_dofs,
+    		temperature_dof_handler.max_couplings_between_dofs());
+    temperature_mass_matrix.reinit (MPI_COMM_WORLD,
+    		temperature_dof_handler.n_dofs(),
+    		temperature_dof_handler.n_dofs(),
+    		n_local_dofs,
+    		n_local_dofs,
+    		temperature_dof_handler.max_couplings_between_dofs());
+    temperature_stiffness_matrix.reinit (MPI_COMM_WORLD,
+    		temperature_dof_handler.n_dofs(),
+    		temperature_dof_handler.n_dofs(),
+    		n_local_dofs,
+    		n_local_dofs,
+    		temperature_dof_handler.max_couplings_between_dofs());
+  }
+
+
+
+				   // The remainder of the setup function
+				   // (after splitting out the three functions
+				   // above) mostly has to deal with the
+				   // things we need to do for parallelization
+				   // across processors. Because setting all
+				   // of this up is a significant compute time
+				   // expense of the program, we put
+				   // everything we do here into a timer group
+				   // so that we can get summary information
+				   // about the fraction of time spent in this
+				   // part of the program at its end.
+				   //
+				   // At the top as usual we enumerate degrees
+				   // of freedom and sort them by
+				   // component/block, followed by writing
+				   // their numbers to the screen from
+				   // processor zero. The
+				   // DoFHandler::distribute_dofs() function,
+				   // when applied to a
+				   // parallel::distributed::Triangulation
+				   // object, sorts degrees of freedom in such a
+				   // way that all degrees of freedom
+				   // associated with subdomain zero come
+				   // before all those associated with
+				   // subdomain one, etc. For the Stokes
+				   // part, this entails, however, that
+				   // velocities and pressures become
+				   // intermixed, but this is trivially
+				   // solved by sorting again by blocks; it
+				   // is worth noting that this latter
+				   // operation leaves the relative ordering
+				   // of all velocities and pressures alone,
+				   // i.e. within the velocity block we will
+				   // still have all those associated with
+				   // subdomain zero before all velocities
+				   // associated with subdomain one,
+				   // etc. This is important since we store
+				   // each of the blocks of this matrix
+				   // distributed across all processors and
+				   // want this to be done in such a way
+				   // that each processor stores that part
+				   // of the matrix that is roughly equal to
+				   // the degrees of freedom located on
+				   // those cells that it will actually work
+				   // on.
+				   //
+				   // When printing the numbers of degrees of
+				   // freedom, note that these numbers are
+				   // going to be large if we use many
+				   // processors. Consequently, we let the
+				   // stream put a comma separator in between
+				   // every three digits. The state of the
+				   // stream, using the locale, is saved from
+				   // before to after this operation. While
+				   // slightly opaque, the code works because
+				   // the default locale (which we get using
+				   // the constructor call
+				   // <code>std::locale("")</code>) implies
+				   // printing numbers with a comma separator
+				   // for every third digit (i.e., thousands,
+				   // millions, billions).
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::setup_dofs ()
+  {
+    computing_timer.enter_section("Setup dof systems");
+
+    std::vector<unsigned int> stokes_sub_blocks (dim+1,0);
+    stokes_sub_blocks[dim] = 1;
+    stokes_dof_handler.distribute_dofs (stokes_fe);
+    DoFRenumbering::component_wise (stokes_dof_handler, stokes_sub_blocks);
+
+    temperature_dof_handler.distribute_dofs (temperature_fe);
+
+    std::vector<unsigned int> stokes_dofs_per_block (2);
+    DoFTools::count_dofs_per_block (stokes_dof_handler, stokes_dofs_per_block,
+				    stokes_sub_blocks);
+
+    const unsigned int n_u = stokes_dofs_per_block[0],
+		       n_p = stokes_dofs_per_block[1],
+		       n_T = temperature_dof_handler.n_dofs();
+
+    std::locale s = pcout.get_stream().getloc();
+    pcout.get_stream().imbue(std::locale(""));
+    pcout << "Number of active cells: "
+	  << triangulation.n_global_active_cells()
+	  << " (on "
+	  << triangulation.n_levels()
+	  << " levels)"
+	  << std::endl
+	  << "Number of degrees of freedom: "
+	  << n_u + n_p + n_T
+	  << " (" << n_u << '+' << n_p << '+'<< n_T <<')'
+	  << std::endl
+	  << std::endl;
+    pcout.get_stream().imbue(s);
+
+
+				     // After this, we have to set up the
+				     // various partitioners (of type
+				     // <code>IndexSet</code>, see the
+				     // introduction) that describe which
+				     // parts of each matrix or vector will be
+				     // stored where, then call the functions
+				     // that actually set up the matrices, and
+				     // at the end also resize the various
+				     // vectors we keep around in this
+				     // program.
+    std::vector<IndexSet> stokes_partitioning, stokes_relevant_partitioning;
+    IndexSet temperature_partitioning (n_T), temperature_relevant_partitioning (n_T);
+    IndexSet stokes_relevant_set;
+    {
+      IndexSet stokes_index_set = stokes_dof_handler.locally_owned_dofs();
+      stokes_partitioning.push_back(stokes_index_set.get_view(0,n_u));
+      stokes_partitioning.push_back(stokes_index_set.get_view(n_u,n_u+n_p));
+
+      DoFTools::extract_locally_relevant_dofs (stokes_dof_handler,
+					       stokes_relevant_set);
+      stokes_relevant_partitioning.push_back(stokes_relevant_set.get_view(0,n_u));
+      stokes_relevant_partitioning.push_back(stokes_relevant_set.get_view(n_u,n_u+n_p));
+
+      temperature_partitioning = temperature_dof_handler.locally_owned_dofs();
+      DoFTools::extract_locally_relevant_dofs (temperature_dof_handler,
+					       temperature_relevant_partitioning);
+    }
+
+				     // Following this, we can compute
+				     // constraints for the solution vectors,
+				     // including hanging node constraints and
+				     // homogeneous and inhomogeneous boundary
+				     // values for the Stokes and temperature
+				     // fields. Note that as for everything
+				     // else, the constraint objects can not
+				     // hold <i>all</i> constraints on every
+				     // processor. Rather, each processor
+				     // needs to store only those that are
+				     // actually necessary for correctness
+				     // given that it only assembles linear
+				     // systems on cells it owns. As discussed
+				     // in the
+				     // in
+				     // @ref distributed_paper "this paper",
+				     // about is exactly the set of
+				     // constraints on all locally relevant
+				     // degrees of freedom, so this is what we
+				     // use to initialize the constraint
+				     // objects.
+    {
+      stokes_constraints.clear ();
+      stokes_constraints.reinit (stokes_relevant_set);
+
+      DoFTools::make_hanging_node_constraints (stokes_dof_handler,
+					       stokes_constraints);
+
+      std::vector<bool> velocity_mask (dim+1, true);
+      velocity_mask[dim] = false;
+      VectorTools::interpolate_boundary_values (stokes_dof_handler,
+						0,
+						ZeroFunction<dim>(dim+1),
+						stokes_constraints,
+						velocity_mask);
+
+      std::set<types::boundary_id> no_normal_flux_boundaries;
+      no_normal_flux_boundaries.insert (1);
+      VectorTools::compute_no_normal_flux_constraints (stokes_dof_handler, 0,
+                                                       no_normal_flux_boundaries,
+                                                       stokes_constraints,
+                                                       mapping);
+      stokes_constraints.close ();
+    }
+    {
+      temperature_constraints.clear ();
+      temperature_constraints.reinit (temperature_relevant_partitioning);
+
+      DoFTools::make_hanging_node_constraints (temperature_dof_handler,
+                                               temperature_constraints);
+      VectorTools::interpolate_boundary_values (temperature_dof_handler,
+                                                0,
+                                                EquationData::TemperatureInitialValues<dim>(),
+                                                temperature_constraints);
+      VectorTools::interpolate_boundary_values (temperature_dof_handler,
+                                                1,
+                                                EquationData::TemperatureInitialValues<dim>(),
+                                                temperature_constraints);
+      temperature_constraints.close ();
+    }
+
+                                     // All this done, we can then initialize
+                                     // the various matrix and vector objects
+                                     // to their proper sizes. At the end, we
+                                     // also record that all matrices and
+                                     // preconditioners have to be re-computed
+                                     // at the beginning of the next time
+                                     // step.
+    std::vector<unsigned int> block_sizes, local_sizes;
+    CIG::convert_block_partitioning(stokes_partitioning,n_u,n_p,block_sizes,local_sizes);
+
+//    setup_stokes_matrix (stokes_partitioning);
+    CIG::setup_petsc_matrix(block_sizes,local_sizes,stokes_dof_handler.max_couplings_between_dofs(),stokes_matrix);
+//    setup_stokes_preconditioner (stokes_partitioning);
+    CIG::setup_petsc_matrix(block_sizes,local_sizes,stokes_dof_handler.max_couplings_between_dofs(),stokes_preconditioner_matrix);
+    setup_temperature_matrices (temperature_partitioning);
+
+    stokes_rhs.reinit(block_sizes,MPI_COMM_WORLD,local_sizes);	//    stokes_rhs.reinit (stokes_partitioning, MPI_COMM_WORLD);
+    CIG::setup_petsc_vector(block_sizes, stokes_partitioning,stokes_relevant_partitioning,stokes_solution);
+//    old_stokes_solution.reinit (stokes_solution);
+    CIG::setup_petsc_vector(block_sizes, stokes_partitioning,stokes_relevant_partitioning,old_stokes_solution);
+
+    temperature_rhs.reinit (MPI_COMM_WORLD, temperature_partitioning);
+    temperature_solution.reinit (MPI_COMM_WORLD, temperature_partitioning, temperature_relevant_partitioning);
+    old_temperature_solution.reinit (MPI_COMM_WORLD, temperature_partitioning, temperature_relevant_partitioning);
+    old_old_temperature_solution.reinit(MPI_COMM_WORLD, temperature_partitioning, temperature_relevant_partitioning);
+
+    rebuild_stokes_matrix              = true;
+    rebuild_stokes_preconditioner      = true;
+    rebuild_temperature_matrices       = true;
+    rebuild_temperature_preconditioner = true;
+
+    computing_timer.exit_section();
+  }
+
+
+
+                                   // @sect4{The BoussinesqFlowProblem assembly functions}
+                                   //
+                                   // Following the discussion in the
+                                   // introduction and in the @ref threads
+                                   // module, we split the assembly functions
+                                   // into different parts:
+                                   //
+                                   // <ul> <li> The local calculations of
+                                   // matrices and right hand sides, given a
+                                   // certain cell as input (these functions
+                                   // are named <code>local_assemble_*</code>
+                                   // below). The resulting function is, in
+                                   // other words, essentially the body of the
+                                   // loop over all cells in step-31. Note,
+                                   // however, that these functions store the
+                                   // result from the local calculations in
+                                   // variables of classes from the CopyData
+                                   // namespace.
+                                   //
+                                   // <li>These objects are then given to the
+                                   // second step which writes the local data
+                                   // into the global data structures (these
+                                   // functions are named
+                                   // <code>copy_local_to_global_*</code>
+                                   // below). These functions are pretty
+                                   // trivial.
+                                   //
+                                   // <li>These two subfunctions are then used
+                                   // in the respective assembly routine
+                                   // (called <code>assemble_*</code> below),
+                                   // where a WorkStream object is set up and
+                                   // runs over all the cells that belong to
+                                   // the processor's subdomain.  </ul>
+
+                                   // @sect5{Stokes preconditioner assembly}
+                                   //
+                                   // Let us start with the functions that
+                                   // build the Stokes preconditioner. The
+                                   // first two of these are pretty trivial,
+                                   // given the discussion above. Note in
+                                   // particular that the main point in using
+                                   // the scratch data object is that we want
+                                   // to avoid allocating any objects on the
+                                   // free space each time we visit a new
+                                   // cell. As a consequence, the assembly
+                                   // function below only has automatic local
+                                   // variables, and everything else is
+                                   // accessed through the scratch data
+                                   // object, which is allocated only once
+                                   // before we start the loop over all cells:
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  local_assemble_stokes_preconditioner (const typename DoFHandler<dim>::active_cell_iterator &cell,
+                                        Assembly::Scratch::StokesPreconditioner<dim> &scratch,
+                                        Assembly::CopyData::StokesPreconditioner<dim> &data)
+  {
+    const unsigned int   dofs_per_cell   = stokes_fe.dofs_per_cell;
+    const unsigned int   n_q_points      = scratch.stokes_fe_values.n_quadrature_points;
+
+    const FEValuesExtractors::Vector velocities (0);
+    const FEValuesExtractors::Scalar pressure (dim);
+
+    scratch.stokes_fe_values.reinit (cell);
+    cell->get_dof_indices (data.local_dof_indices);
+
+    data.local_matrix = 0;
+
+    for (unsigned int q=0; q<n_q_points; ++q)
+      {
+        for (unsigned int k=0; k<dofs_per_cell; ++k)
+          {
+            scratch.grad_phi_u[k] = scratch.stokes_fe_values[velocities].gradient(k,q);
+            scratch.phi_p[k]      = scratch.stokes_fe_values[pressure].value (k, q);
+          }
+
+        for (unsigned int i=0; i<dofs_per_cell; ++i)
+          for (unsigned int j=0; j<dofs_per_cell; ++j)
+            data.local_matrix(i,j) += (EquationData::eta *
+                                       scalar_product (scratch.grad_phi_u[i],
+                                                       scratch.grad_phi_u[j])
+                                       +
+                                       (1./EquationData::eta) *
+                                       EquationData::pressure_scaling *
+                                       EquationData::pressure_scaling *
+                                       (scratch.phi_p[i] * scratch.phi_p[j]))
+                                      * scratch.stokes_fe_values.JxW(q);
+      }
+  }
+
+
+
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  copy_local_to_global_stokes_preconditioner (const Assembly::CopyData::StokesPreconditioner<dim> &data)
+  {
+    stokes_constraints.distribute_local_to_global (data.local_matrix,
+                                                   data.local_dof_indices,
+                                                   stokes_preconditioner_matrix);
+  }
+
+
+                                   // Now for the function that actually puts
+                                   // things together, using the WorkStream
+                                   // functions.  WorkStream::run needs a
+                                   // start and end iterator to enumerate the
+                                   // cells it is supposed to work
+                                   // on. Typically, one would use
+                                   // DoFHandler::begin_active() and
+                                   // DoFHandler::end() for that but here we
+                                   // actually only want the subset of cells
+                                   // that in fact are owned by the current
+                                   // processor. This is where the
+                                   // FilteredIterator class comes into play:
+                                   // you give it a range of cells and it
+                                   // provides an iterator that only iterates
+                                   // over that subset of cells that satisfy a
+                                   // certain predicate (a predicate is a
+                                   // function of one argument that either
+                                   // returns true or false). The predicate we
+                                   // use here is
+                                   // IteratorFilters::LocallyOwnedCell, i.e.,
+                                   // it returns true exactly if the cell is
+                                   // owned by the current processor. The
+                                   // resulting iterator range is then exactly
+                                   // what we need.
+                                   //
+                                   // With this obstacle out of the way, we
+                                   // call the WorkStream::run function with
+                                   // this set of cells, scratch and copy
+                                   // objects, and with pointers to two
+                                   // functions: the local assembly and
+                                   // copy-local-to-global function. These
+                                   // functions need to have very specific
+                                   // signatures: three arguments in the first
+                                   // and one argument in the latter case (see
+                                   // the documentation of the WorkStream::run
+                                   // function for the meaning of these
+                                   // arguments).  Note how we use the
+                                   // construct <code>std_cxx1x::bind</code>
+                                   // to create a function object that
+                                   // satisfies this requirement. It uses
+                                   // placeholders <code>std_cxx1x::_1,
+                                   // std_cxx1x::_2, std_cxx1x::_3</code>
+                                   // for the local assembly
+                                   // function that specify cell, scratch
+                                   // data, and copy data, as well as the
+                                   // placeholder
+                                   // <code>std_cxx1x::_1</code> for the copy
+                                   // function that expects the data to be
+                                   // written into the global matrix. On the
+                                   // other hand, the implicit zeroth argument
+                                   // of member functions (namely the
+                                   // <code>this</code> pointer of the object
+                                   // on which that member function is to
+                                   // operate) is <i>bound</i> to the
+                                   // <code>this</code> pointer of the current
+                                   // function. The WorkStream::run function,
+                                   // as a consequence, does not need to know
+                                   // anything about the object these
+                                   // functions work on.
+                                   //
+                                   // When the WorkStream is executed, it will
+                                   // create several local assembly routines
+                                   // of the first kind for several cells and
+                                   // let some available processors work on
+                                   // them. The function that needs to be
+                                   // synchronized, i.e., the write operation
+                                   // into the global matrix, however, is
+                                   // executed by only one thread at a time in
+                                   // the prescribed order. Of course, this
+                                   // only holds for the parallelization on a
+                                   // single MPI process. Different MPI
+                                   // processes will have their own WorkStream
+                                   // objects and do that work completely
+                                   // independently (and in different memory
+                                   // spaces). In a distributed calculation,
+                                   // some data will accumulate at degrees of
+                                   // freedom that are not owned by the
+                                   // respective processor. It would be
+                                   // inefficient to send data around every
+                                   // time we encounter such a dof. What
+                                   // happens instead is that the PETSc
+                                   // sparse matrix will keep that data and
+                                   // send it to the owner process at the end
+                                   // of assembly, by calling the
+                                   // <code>compress()</code> command.
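+                                   //
+                                   // To make the role of these placeholders
+                                   // a little more concrete: roughly
+                                   // speaking, the two function objects
+                                   // created by <code>std_cxx1x::bind</code>
+                                   // below behave like
+                                   //   worker(cell, scratch, data):
+                                   //     this->local_assemble_stokes_preconditioner (cell, scratch, data);
+                                   //   copier(data):
+                                   //     this->copy_local_to_global_stokes_preconditioner (data);
+                                   // i.e., they are simply the member
+                                   // function calls with the
+                                   // <code>this</code> pointer already
+                                   // filled in (<i>worker</i> and
+                                   // <i>copier</i> are just illustrative
+                                   // names, not identifiers used in the
+                                   // code).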
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::assemble_stokes_preconditioner ()
+  {
+    stokes_preconditioner_matrix = 0;
+
+    const QGauss<dim> quadrature_formula(parameters.stokes_velocity_degree+1);
+
+    typedef
+      FilteredIterator<typename DoFHandler<dim>::active_cell_iterator>
+      CellFilter;
+
+    WorkStream::
+      run (CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       stokes_dof_handler.begin_active()),
+           CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       stokes_dof_handler.end()),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            local_assemble_stokes_preconditioner,
+                            this,
+                            std_cxx1x::_1,
+                            std_cxx1x::_2,
+                            std_cxx1x::_3),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            copy_local_to_global_stokes_preconditioner,
+                            this,
+                            std_cxx1x::_1),
+           Assembly::Scratch::
+           StokesPreconditioner<dim> (stokes_fe, quadrature_formula,
+                                      mapping,
+                                      update_JxW_values |
+                                      update_values |
+                                      update_gradients),
+           Assembly::CopyData::
+           StokesPreconditioner<dim> (stokes_fe));
+
+    stokes_preconditioner_matrix.compress();
+  }
+
+
+
+                                   // The final function in this block
+                                   // initiates assembly of the Stokes
+                                   // preconditioner matrix and then in fact
+                                   // builds the Stokes preconditioner. It is
+                                   // mostly the same as in the serial
+                                   // case. The only difference to step-31 is
+                                   // that we use a Jacobi preconditioner for
+                                   // the pressure mass matrix instead of IC,
+                                   // as discussed in the introduction.
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::build_stokes_preconditioner ()
+  {
+    if (rebuild_stokes_preconditioner == false)
+      return;
+
+    computing_timer.enter_section ("   Build Stokes preconditioner");
+    pcout << "   Rebuilding Stokes preconditioner..." << std::flush;
+
+    assemble_stokes_preconditioner ();
+
+    std::vector<std::vector<bool> > constant_modes;
+    std::vector<bool>  velocity_components (dim+1,true);
+    velocity_components[dim] = false;
+    DoFTools::extract_constant_modes (stokes_dof_handler, velocity_components,
+                                      constant_modes);
+
+    Mp_preconditioner.reset  (new PETScWrappers::PreconditionJacobi());
+    Amg_preconditioner.reset (new PETScWrappers::PreconditionBoomerAMG());
+
+    PETScWrappers::PreconditionBoomerAMG::AdditionalData Amg_data;
+//    Amg_data.constant_modes = constant_modes;
+//    Amg_data.elliptic = true;
+//    Amg_data.higher_order_elements = true;
+//    Amg_data.smoother_sweeps = 2;
+//    Amg_data.aggregation_threshold = 0.02;
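+                                     // (The options commented out above come
+                                     // from the Trilinos ML preconditioner
+                                     // used in the original step-32; the
+                                     // PETSc BoomerAMG wrapper exposes a
+                                     // different set of parameters, so for
+                                     // the moment we simply rely on its
+                                     // defaults. As a consequence, the
+                                     // constant modes extracted above are
+                                     // currently unused.)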
+
+    Mp_preconditioner->initialize (stokes_preconditioner_matrix.block(1,1));
+    Amg_preconditioner->initialize (stokes_preconditioner_matrix.block(0,0),
+                                    Amg_data);
+
+    rebuild_stokes_preconditioner = false;
+
+    pcout << std::endl;
+    computing_timer.exit_section();
+  }
+
+
+                                   // @sect5{Stokes system assembly}
+
+                                   // The next three functions implement the
+                                   // assembly of the Stokes system, again
+                                   // split up into a part performing local
+                                   // calculations, one for writing the local
+                                   // data into the global matrix and vector,
+                                   // and one for actually running the loop
+                                   // over all cells with the help of the
+                                   // WorkStream class. Note that the assembly
+                                   // of the Stokes matrix only needs to be
+                                   // done when we have changed the
+                                   // mesh. Otherwise, just the
+                                   // (temperature-dependent) right hand side
+                                   // needs to be calculated here. Since we
+                                   // are working with distributed matrices
+                                   // and vectors, we have to call the
+                                   // respective <code>compress()</code>
+                                   // functions at the end of the assembly in
+                                   // order to send non-local data to the
+                                   // owner process.
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  local_assemble_stokes_system (const typename DoFHandler<dim>::active_cell_iterator &cell,
+                                Assembly::Scratch::StokesSystem<dim> &scratch,
+                                Assembly::CopyData::StokesSystem<dim> &data)
+  {
+//	  static int debug_index = 0;
+//	  std::cout << "debug_index = " << debug_index << std::endl;
+
+    const unsigned int dofs_per_cell = scratch.stokes_fe_values.get_fe().dofs_per_cell;
+    const unsigned int n_q_points    = scratch.stokes_fe_values.n_quadrature_points;
+
+    const FEValuesExtractors::Vector velocities (0);
+    const FEValuesExtractors::Scalar pressure (dim);
+
+    scratch.stokes_fe_values.reinit (cell);
+
+    typename DoFHandler<dim>::active_cell_iterator
+      temperature_cell (&triangulation,
+                        cell->level(),
+                        cell->index(),
+                        &temperature_dof_handler);
+    scratch.temperature_fe_values.reinit (temperature_cell);
+
+    if (rebuild_stokes_matrix)
+      data.local_matrix = 0;
+    data.local_rhs = 0;
+
+    scratch.temperature_fe_values.get_function_values (old_temperature_solution,
+                                                       scratch.old_temperature_values);
+
+    for (unsigned int q=0; q<n_q_points; ++q)
+      {
+        const double old_temperature = scratch.old_temperature_values[q];
+
+        for (unsigned int k=0; k<dofs_per_cell; ++k)
+          {
+            scratch.phi_u[k] = scratch.stokes_fe_values[velocities].value (k,q);
+            if (rebuild_stokes_matrix)
+              {
+                scratch.grads_phi_u[k] = scratch.stokes_fe_values[velocities].symmetric_gradient(k,q);
+                scratch.div_phi_u[k]   = scratch.stokes_fe_values[velocities].divergence (k, q);
+                scratch.phi_p[k]       = scratch.stokes_fe_values[pressure].value (k, q);
+              }
+          }
+
+        if (rebuild_stokes_matrix == true)
+          for (unsigned int i=0; i<dofs_per_cell; ++i)
+            for (unsigned int j=0; j<dofs_per_cell; ++j)
+              data.local_matrix(i,j) += (EquationData::eta * 2 *
+                                         (scratch.grads_phi_u[i] * scratch.grads_phi_u[j])
+                                         - (EquationData::pressure_scaling *
+                                            scratch.div_phi_u[i] * scratch.phi_p[j])
+                                         - (EquationData::pressure_scaling *
+                                            scratch.phi_p[i] * scratch.div_phi_u[j]))
+                                        * scratch.stokes_fe_values.JxW(q);
+
+        const Tensor<1,dim>
+          gravity = EquationData::gravity_vector (scratch.stokes_fe_values
+                                                  .quadrature_point(q));
+
+        for (unsigned int i=0; i<dofs_per_cell; ++i)
+        {
+//        	std::cout.precision(20);
+//        	std::cout << std::fixed;
+//        	std::cout << "i = " << i << ", " << data.local_rhs(q) << ", " <<
+//        			old_temperature << ", " <<
+//        			EquationData::density(old_temperature) << ", " <<
+//        			gravity << ", " <<
+//        			scratch.phi_u[i] << ", " <<
+//        			scratch.stokes_fe_values.JxW(q) << std::endl;
+
+          data.local_rhs(i) += (EquationData::density(old_temperature) *
+                                gravity  *
+                                scratch.phi_u[i]) *
+                               scratch.stokes_fe_values.JxW(q);
+        }
+      }
+
+    cell->get_dof_indices (data.local_dof_indices);
+
+//    debug_index++;
+//    data.local_rhs.print(std::cout,7,false,false);
+  }
+
+
+
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  copy_local_to_global_stokes_system (const Assembly::CopyData::StokesSystem<dim> &data)
+  {
+//	  return;
+//	  static int debug_index = 0;
+//	  std::cout << "debug_index = " << debug_index << std::endl;
+//	  debug_index++;
+//	  data.local_rhs.print(std::cout,7,false,false);
+
+    if (rebuild_stokes_matrix == true)
+      stokes_constraints.distribute_local_to_global (data.local_matrix,
+                                                     data.local_rhs,
+                                                     data.local_dof_indices,
+                                                     stokes_matrix,
+                                                     stokes_rhs);
+    else
+      stokes_constraints.distribute_local_to_global (data.local_rhs,
+                                                     data.local_dof_indices,
+                                                     stokes_rhs);
+  }
+
+
+
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::assemble_stokes_system ()
+  {
+    computing_timer.enter_section ("   Assemble Stokes system");
+
+    if (rebuild_stokes_matrix == true)
+      stokes_matrix=0;
+
+    stokes_rhs=0;
+
+    const QGauss<dim> quadrature_formula(parameters.stokes_velocity_degree+1);
+
+    typedef
+      FilteredIterator<typename DoFHandler<dim>::active_cell_iterator>
+      CellFilter;
+
+    WorkStream::
+      run (CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       stokes_dof_handler.begin_active()),
+           CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       stokes_dof_handler.end()),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            local_assemble_stokes_system,
+                            this,
+                            std_cxx1x::_1,
+                            std_cxx1x::_2,
+                            std_cxx1x::_3),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            copy_local_to_global_stokes_system,
+                            this,
+                            std_cxx1x::_1),
+           Assembly::Scratch::
+           StokesSystem<dim> (stokes_fe, mapping, quadrature_formula,
+                              (update_values    |
+                               update_quadrature_points  |
+                               update_JxW_values |
+                               (rebuild_stokes_matrix == true
+                                ?
+                                update_gradients
+                                :
+                                UpdateFlags(0))),
+                              temperature_fe,
+                              update_values),
+           Assembly::CopyData::
+           StokesSystem<dim> (stokes_fe));
+
+    stokes_matrix.compress(dealii::VectorOperation::add);
+    stokes_rhs.compress(dealii::VectorOperation::add); //stokes_rhs.compress(Add);
+
+//    std::ofstream filename_matrix, filename_vector;
+//    filename_matrix.open("stokes_matrix00.txt");
+//    stokes_matrix.block(1,0).write_ascii();
+//    stokes_matrix.block(1,1).write_ascii();
+//    filename_vector.open("stokes_rhs.txt");
+//    stokes_rhs.print(filename_vector,7,false,false);
+
+    rebuild_stokes_matrix = false;
+
+    pcout << std::endl;
+    computing_timer.exit_section();
+  }
+
+
+                                   // @sect5{Temperature matrix assembly}
+
+                                   // The task to be performed by the next
+                                   // three functions is to calculate a mass
+                                   // matrix and a Laplace matrix on the
+                                   // temperature system. These will be
+                                   // combined in order to yield the
+                                   // semi-implicit time stepping matrix that
+                                   // consists of the mass matrix plus a time
+                                   // step-dependent weight factor times the
+                                   // Laplace matrix. This function is again
+                                   // essentially the body of the loop over
+                                   // all cells from step-31.
+                                   //
+                                   // The following two functions perform
+                                   // services similar to the ones above.
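+                                   //
+                                   // In formulas, what is assembled
+                                   // cell-by-cell below are the entries
+                                   // $M_{ij} = \int_\Omega \varphi_i \varphi_j \, dx$ and
+                                   // $A_{ij} = \int_\Omega \kappa\, \nabla \varphi_i \cdot \nabla \varphi_j \, dx$,
+                                   // i.e., the mass matrix and the
+                                   // $\kappa$-weighted Laplace (stiffness)
+                                   // matrix of the temperature field.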
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::
+  local_assemble_temperature_matrix (const typename DoFHandler<dim>::active_cell_iterator &cell,
+                                     Assembly::Scratch::TemperatureMatrix<dim> &scratch,
+                                     Assembly::CopyData::TemperatureMatrix<dim> &data)
+  {
+    const unsigned int dofs_per_cell = scratch.temperature_fe_values.get_fe().dofs_per_cell;
+    const unsigned int n_q_points    = scratch.temperature_fe_values.n_quadrature_points;
+
+    scratch.temperature_fe_values.reinit (cell);
+    cell->get_dof_indices (data.local_dof_indices);
+
+    data.local_mass_matrix = 0;
+    data.local_stiffness_matrix = 0;
+
+    for (unsigned int q=0; q<n_q_points; ++q)
+      {
+        for (unsigned int k=0; k<dofs_per_cell; ++k)
+          {
+            scratch.grad_phi_T[k] = scratch.temperature_fe_values.shape_grad (k,q);
+            scratch.phi_T[k]      = scratch.temperature_fe_values.shape_value (k, q);
+          }
+
+        for (unsigned int i=0; i<dofs_per_cell; ++i)
+          for (unsigned int j=0; j<dofs_per_cell; ++j)
+            {
+              data.local_mass_matrix(i,j)
+                += (scratch.phi_T[i] * scratch.phi_T[j]
+                    *
+                    scratch.temperature_fe_values.JxW(q));
+              data.local_stiffness_matrix(i,j)
+                += (EquationData::kappa * scratch.grad_phi_T[i] * scratch.grad_phi_T[j]
+                    *
+                    scratch.temperature_fe_values.JxW(q));
+            }
+      }
+  }
+
+
+
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  copy_local_to_global_temperature_matrix (const Assembly::CopyData::TemperatureMatrix<dim> &data)
+  {
+    temperature_constraints.distribute_local_to_global (data.local_mass_matrix,
+                                                        data.local_dof_indices,
+                                                        temperature_mass_matrix);
+    temperature_constraints.distribute_local_to_global (data.local_stiffness_matrix,
+                                                        data.local_dof_indices,
+                                                        temperature_stiffness_matrix);
+  }
+
+
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::assemble_temperature_matrix ()
+  {
+    if (rebuild_temperature_matrices == false)
+      return;
+
+    computing_timer.enter_section ("   Assemble temperature matrices");
+    temperature_mass_matrix = 0;
+    temperature_stiffness_matrix = 0;
+
+    const QGauss<dim> quadrature_formula(parameters.temperature_degree+2);
+
+    typedef
+      FilteredIterator<typename DoFHandler<dim>::active_cell_iterator>
+      CellFilter;
+
+    WorkStream::
+      run (CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       temperature_dof_handler.begin_active()),
+           CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       temperature_dof_handler.end()),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            local_assemble_temperature_matrix,
+                            this,
+                            std_cxx1x::_1,
+                            std_cxx1x::_2,
+                            std_cxx1x::_3),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            copy_local_to_global_temperature_matrix,
+                            this,
+                            std_cxx1x::_1),
+           Assembly::Scratch::
+           TemperatureMatrix<dim> (temperature_fe, mapping, quadrature_formula),
+           Assembly::CopyData::
+           TemperatureMatrix<dim> (temperature_fe));
+
+    temperature_mass_matrix.compress();
+    temperature_stiffness_matrix.compress();
+
+    rebuild_temperature_matrices = false;
+    rebuild_temperature_preconditioner = true;
+
+    computing_timer.exit_section();
+  }
+
+
+                                   // @sect5{Temperature right hand side assembly}
+
+                                   // This is the last assembly function. It
+                                   // calculates the right hand side of the
+                                   // temperature system, which includes the
+                                   // convection and the stabilization
+                                   // terms. It includes a lot of evaluations
+                                   // of old solutions at the quadrature
+                                   // points (which are necessary for
+                                   // calculating the artificial viscosity of
+                                   // stabilization), but is otherwise similar
+                                   // to the other assembly functions. Notice,
+                                   // once again, how we resolve the dilemma
+                                   // of having inhomogeneous boundary
+                                   // conditions, by just making a right hand
+                                   // side at this point (compare the comments
+                                   // for the <code>project()</code> function
+                                   // above): We create some matrix columns
+                                   // with exactly the values that would be
+                                   // entered for the temperature stiffness
+                                   // matrix, in case we have inhomogeneously
+                                   // constrained dofs. That will account for
+                                   // the correct balance of the right hand
+                                   // side vector with the matrix system of
+                                   // temperature.
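+                                   //
+                                   // In other words, if dof $j$ is
+                                   // inhomogeneously constrained to a value
+                                   // $g_j$, then handing the local matrix
+                                   // <code>data.matrix_for_bc</code> to
+                                   // ConstraintMatrix::distribute_local_to_global
+                                   // (in the copy function below) lets it
+                                   // subtract the terms $A_{ij} g_j$ from
+                                   // the right hand side entries, which is
+                                   // exactly the contribution the eliminated
+                                   // constrained columns would otherwise
+                                   // have produced.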
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::
+  local_assemble_temperature_rhs (const std::pair<double,double> global_T_range,
+                                  const double                   global_max_velocity,
+                                  const double                   global_entropy_variation,
+                                  const typename DoFHandler<dim>::active_cell_iterator &cell,
+                                  Assembly::Scratch::TemperatureRHS<dim> &scratch,
+                                  Assembly::CopyData::TemperatureRHS<dim> &data)
+  {
+    const bool use_bdf2_scheme = (timestep_number != 0);
+
+    const unsigned int dofs_per_cell = scratch.temperature_fe_values.get_fe().dofs_per_cell;
+    const unsigned int n_q_points    = scratch.temperature_fe_values.n_quadrature_points;
+
+    const FEValuesExtractors::Vector velocities (0);
+
+    data.local_rhs = 0;
+    data.matrix_for_bc = 0;
+    cell->get_dof_indices (data.local_dof_indices);
+
+    scratch.temperature_fe_values.reinit (cell);
+
+    typename DoFHandler<dim>::active_cell_iterator
+      stokes_cell (&triangulation,
+                   cell->level(),
+                   cell->index(),
+                   &stokes_dof_handler);
+    scratch.stokes_fe_values.reinit (stokes_cell);
+
+    scratch.temperature_fe_values.get_function_values (old_temperature_solution,
+                                                       scratch.old_temperature_values);
+    scratch.temperature_fe_values.get_function_values (old_old_temperature_solution,
+                                                       scratch.old_old_temperature_values);
+
+    scratch.temperature_fe_values.get_function_gradients (old_temperature_solution,
+                                                          scratch.old_temperature_grads);
+    scratch.temperature_fe_values.get_function_gradients (old_old_temperature_solution,
+                                                          scratch.old_old_temperature_grads);
+
+    scratch.temperature_fe_values.get_function_laplacians (old_temperature_solution,
+                                                           scratch.old_temperature_laplacians);
+    scratch.temperature_fe_values.get_function_laplacians (old_old_temperature_solution,
+                                                           scratch.old_old_temperature_laplacians);
+
+    scratch.stokes_fe_values[velocities].get_function_values (stokes_solution,
+                                                              scratch.old_velocity_values);
+    scratch.stokes_fe_values[velocities].get_function_values (old_stokes_solution,
+                                                              scratch.old_old_velocity_values);
+    scratch.stokes_fe_values[velocities].get_function_symmetric_gradients (stokes_solution,
+                                                                           scratch.old_strain_rates);
+    scratch.stokes_fe_values[velocities].get_function_symmetric_gradients (old_stokes_solution,
+                                                                           scratch.old_old_strain_rates);
+
+    const double nu
+      = compute_viscosity (scratch.old_temperature_values,
+                           scratch.old_old_temperature_values,
+                           scratch.old_temperature_grads,
+                           scratch.old_old_temperature_grads,
+                           scratch.old_temperature_laplacians,
+                           scratch.old_old_temperature_laplacians,
+                           scratch.old_velocity_values,
+                           scratch.old_old_velocity_values,
+                           scratch.old_strain_rates,
+                           scratch.old_old_strain_rates,
+                           global_max_velocity,
+                           global_T_range.second - global_T_range.first,
+                           0.5 * (global_T_range.second + global_T_range.first),
+                           global_entropy_variation,
+                           cell->diameter());
+
+    for (unsigned int q=0; q<n_q_points; ++q)
+      {
+        for (unsigned int k=0; k<dofs_per_cell; ++k)
+          {
+            scratch.phi_T[k]      = scratch.temperature_fe_values.shape_value (k, q);
+            scratch.grad_phi_T[k] = scratch.temperature_fe_values.shape_grad (k,q);
+          }
+
+
+        const double T_term_for_rhs
+          = (use_bdf2_scheme ?
+             (scratch.old_temperature_values[q] *
+              (1 + time_step/old_time_step)
+              -
+              scratch.old_old_temperature_values[q] *
+              (time_step * time_step) /
+              (old_time_step * (time_step + old_time_step)))
+             :
+             scratch.old_temperature_values[q]);
+
+        const double ext_T
+          = (use_bdf2_scheme ?
+             (scratch.old_temperature_values[q] *
+              (1 + time_step/old_time_step)
+              -
+              scratch.old_old_temperature_values[q] *
+              time_step/old_time_step)
+             :
+             scratch.old_temperature_values[q]);
+
+        const Tensor<1,dim> ext_grad_T
+          = (use_bdf2_scheme ?
+             (scratch.old_temperature_grads[q] *
+              (1 + time_step/old_time_step)
+              -
+              scratch.old_old_temperature_grads[q] *
+              time_step/old_time_step)
+             :
+             scratch.old_temperature_grads[q]);
+
+        const Tensor<1,dim> extrapolated_u
+          = (use_bdf2_scheme ?
+             (scratch.old_velocity_values[q] *
+              (1 + time_step/old_time_step)
+              -
+              scratch.old_old_velocity_values[q] *
+              time_step/old_time_step)
+             :
+             scratch.old_velocity_values[q]);
+
+        const SymmetricTensor<2,dim> extrapolated_strain_rate
+          = (use_bdf2_scheme ?
+             (scratch.old_strain_rates[q] *
+              (1 + time_step/old_time_step)
+              -
+              scratch.old_old_strain_rates[q] *
+              time_step/old_time_step)
+             :
+             scratch.old_strain_rates[q]);
+
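+                                   // The source term gamma collects the
+                                   // radiogenic heating and the shear
+                                   // (friction) heating
+                                   // $2\eta\,\varepsilon(\mathbf u):\varepsilon(\mathbf u)$,
+                                   // divided by $\rho c_p$ so that it enters
+                                   // the temperature equation as a rate of
+                                   // temperature change.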
+        const double gamma
+          = ((EquationData::radiogenic_heating * EquationData::density(ext_T)
+              +
+              2 * EquationData::eta * extrapolated_strain_rate * extrapolated_strain_rate) /
+             (EquationData::density(ext_T) * EquationData::specific_heat));
+
+        for (unsigned int i=0; i<dofs_per_cell; ++i)
+          {
+            data.local_rhs(i) += (T_term_for_rhs * scratch.phi_T[i]
+                                  -
+                                  time_step *
+                                  extrapolated_u * ext_grad_T * scratch.phi_T[i]
+                                  -
+                                  time_step *
+                                  nu * ext_grad_T * scratch.grad_phi_T[i]
+                                  +
+                                  time_step *
+                                  gamma * scratch.phi_T[i])
+                                 *
+                                 scratch.temperature_fe_values.JxW(q);
+
+            if (temperature_constraints.is_inhomogeneously_constrained(data.local_dof_indices[i]))
+              {
+                for (unsigned int j=0; j<dofs_per_cell; ++j)
+                  data.matrix_for_bc(j,i) += (scratch.phi_T[i] * scratch.phi_T[j] *
+                                              (use_bdf2_scheme ?
+                                               ((2*time_step + old_time_step) /
+                                                (time_step + old_time_step)) : 1.)
+                                              +
+                                              scratch.grad_phi_T[i] *
+                                              scratch.grad_phi_T[j] *
+                                              EquationData::kappa *
+                                              time_step)
+                                             *
+                                             scratch.temperature_fe_values.JxW(q);
+              }
+          }
+      }
+  }
+
+
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  copy_local_to_global_temperature_rhs (const Assembly::CopyData::TemperatureRHS<dim> &data)
+  {
+    temperature_constraints.distribute_local_to_global (data.local_rhs,
+                                                        data.local_dof_indices,
+                                                        temperature_rhs,
+                                                        data.matrix_for_bc);
+  }
+
+
+
+                                   // In the function that runs the WorkStream
+                                   // for actually calculating the right hand
+                                   // side, we also generate the final
+                                   // matrix. As mentioned above, it is a sum
+                                   // of the mass matrix and the Laplace
+                                   // matrix, times some time step-dependent
+                                   // weight. This weight is specified by the
+                                   // BDF-2 time integration scheme, see the
+                                   // introduction in step-31. What is new in
+                                   // this tutorial program (in addition to
+                                   // the use of MPI parallelization and the
+                                   // WorkStream class), is that we now
+                                   // precompute the temperature
+                                   // preconditioner as well. The reason is
+                                   // that the setup of the Jacobi
+                                   // preconditioner takes a noticeable time
+                                   // compared to the solver because we
+                                   // usually only need between 10 and 20
+                                   // iterations for solving the temperature
+                                   // system (this might sound strange, as
+                                   // Jacobi really only consists of a
+                                   // diagonal, but in the Trilinos-based
+                                   // step-32 it is derived from a more
+                                   // general framework for point relaxation
+                                   // preconditioners, which is a bit
+                                   // inefficient). Hence, it is more
+                                   // efficient to precompute the
+                                   // preconditioner, even though the matrix
+                                   // entries may slightly change because the
+                                   // time step might change. This is not too
+                                   // big a problem because we remesh every
+                                   // few time steps (and regenerate the
+                                   // preconditioner then).
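+                                   //
+                                   // Concretely, with the current and
+                                   // previous time step sizes $k_n$ and
+                                   // $k_{n-1}$, the matrix assembled below
+                                   // is $\frac{2k_n+k_{n-1}}{k_n+k_{n-1}} M
+                                   // + k_n A$ for the BDF-2 steps, and
+                                   // simply $M + k_n A$ (backward Euler) in
+                                   // the very first time step.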
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::assemble_temperature_system (const double maximal_velocity)
+  {
+    const bool use_bdf2_scheme = (timestep_number != 0);
+
+    if (use_bdf2_scheme == true)
+      {
+        temperature_matrix.copy_from (temperature_mass_matrix);
+        temperature_matrix *= (2*time_step + old_time_step) /
+                              (time_step + old_time_step);
+        temperature_matrix.add (time_step, temperature_stiffness_matrix);
+      }
+    else
+      {
+        temperature_matrix.copy_from (temperature_mass_matrix);
+        temperature_matrix.add (time_step, temperature_stiffness_matrix);
+      }
+    temperature_matrix.compress();
+
+    if (rebuild_temperature_preconditioner == true)
+      {
+        T_preconditioner.reset (new PETScWrappers::PreconditionJacobi());
+        T_preconditioner->initialize (temperature_matrix);
+        rebuild_temperature_preconditioner = false;
+      }
+
+                                     // The next part is computing the right
+                                     // hand side vectors.  To do so, we first
+                                     // compute the average temperature $T_m$
+                                     // that we use for evaluating the
+                                     // artificial viscosity stabilization
+                                     // through the residual $E(T) =
+                                     // (T-T_m)^2$. We do this by defining the
+                                     // midpoint between maximum and minimum
+                                     // temperature as average temperature in
+                                     // the definition of the entropy
+                                     // viscosity. An alternative would be to
+                                     // use the integral average, but the
+                                     // results are not very sensitive to this
+                                     // choice. The rest then only requires
+                                     // calling WorkStream::run again, binding
+                                     // the arguments to the
+                                     // <code>local_assemble_temperature_rhs</code>
+                                     // function that are the same in every
+                                     // call to the correct values:
+    temperature_rhs = 0;
+
+    const QGauss<dim> quadrature_formula(parameters.temperature_degree+2);
+    const std::pair<double,double>
+      global_T_range = get_extrapolated_temperature_range();
+
+    const double average_temperature = 0.5 * (global_T_range.first +
+                                              global_T_range.second);
+    const double global_entropy_variation =
+      get_entropy_variation (average_temperature);
+
+    typedef
+      FilteredIterator<typename DoFHandler<dim>::active_cell_iterator>
+      CellFilter;
+
+    WorkStream::
+      run (CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       temperature_dof_handler.begin_active()),
+           CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       temperature_dof_handler.end()),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            local_assemble_temperature_rhs,
+                            this,
+                            global_T_range,
+                            maximal_velocity,
+                            global_entropy_variation,
+                            std_cxx1x::_1,
+                            std_cxx1x::_2,
+                            std_cxx1x::_3),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            copy_local_to_global_temperature_rhs,
+                            this,
+                            std_cxx1x::_1),
+           Assembly::Scratch::
+           TemperatureRHS<dim> (temperature_fe, stokes_fe, mapping,
+                                quadrature_formula),
+           Assembly::CopyData::
+           TemperatureRHS<dim> (temperature_fe));
+
+    temperature_rhs.compress(); //temperature_rhs.compress(Add);
+  }
+
+
+
+
+                                   // @sect4{BoussinesqFlowProblem::solve}
+
+                                   // This function solves the linear systems
+                                   // in each time step of the Boussinesq
+                                   // problem. First, we
+                                   // work on the Stokes system and then on
+                                   // the temperature system. In essence, it
+                                   // does the same things as the respective
+                                   // function in step-31. However, there are a few
+                                   // changes here.
+                                   //
+                                   // The first change is related to the way
+                                   // we store our solution: we keep the
+                                   // vectors with locally owned degrees of
+                                   // freedom plus ghost nodes on each MPI
+                                   // node. When we enter a solver which is
+                                   // supposed to perform matrix-vector
+                                   // products with a distributed matrix, this
+                                   // is not the appropriate form,
+                                   // though. There, we will want to have the
+                                   // solution vector to be distributed in the
+                                   // same way as the matrix, i.e. without any
+                                   // ghosts. So what we do first is to
+                                   // generate a distributed vector called
+                                   // <code>distributed_stokes_solution</code>
+                                   // and put only the locally owned dofs into
+                                   // that, which is neatly done by the
+                                   // <code>operator=</code> of the PETSc
+                                   // vector wrappers.
+                                   //
+                                   // Next, we scale the pressure solution (or
+                                   // rather, the initial guess) for the
+                                   // solver so that it matches with the
+                                   // length scales in the matrices, as
+                                   // discussed in the introduction. We also
+                                   // immediately scale the pressure solution
+                                   // back to the correct units after the
+                                   // solution is completed.  We also need to
+                                   // set the pressure values at hanging nodes
+                                   // to zero. This we also did in step-31 in
+                                   // order not to disturb the Schur
+                                   // complement by some vector entries that
+                                   // actually are irrelevant during the solve
+                                   // stage. As a difference to step-31, here
+                                   // we do it only for the locally owned
+                                   // pressure dofs. After solving for the
+                                   // Stokes solution, each processor copies
+                                   // the distributed solution back into the
+                                   // solution vector that also includes ghost
+                                   // elements.
+                                   //
+                                   // The third and most obvious change is
+                                   // that we have two variants for the Stokes
+                                   // solver: A fast solver that sometimes
+                                   // breaks down, and a robust solver that is
+                                   // slower. This is what we already
+                                   // discussed in the introduction. Here is
+                                   // how we realize it: First, we allow at
+                                   // most 300 iterations of the fast solver,
+                                   // which uses the cheap preconditioner
+                                   // built on a single AMG V-cycle instead
+                                   // of an approximate inner solve (this is
+                                   // indicated by the
+                                   // <code>false</code> argument to the
+                                   // <code>LinearSolvers::BlockSchurPreconditioner</code>
+                                   // object). If we converge, everything is
+                                   // fine. If we do not converge, the solver
+                                   // control object will throw an exception
+                                   // SolverControl::NoConvergence. Usually,
+                                   // this would abort the program because we
+                                   // don't catch it in our usual
+                                   // <code>solve()</code> functions. This is
+                                   // certainly not what we want to happen
+                                   // here. Rather, we want to switch to the
+                                   // strong solver and continue the solution
+                                   // process with whatever vector we got so
+                                   // far. Hence, we catch the exception with
+                                   // the C++ try/catch mechanism. We then
+                                   // simply go through the same solver
+                                   // sequence again in the <code>catch</code>
+                                   // clause, this time passing the @p true
+                                   // flag to the preconditioner for the
+                                   // strong solver, signaling an approximate
+                                   // CG solve.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::solve ()
+  {
+    computing_timer.enter_section ("   Solve Stokes system");
+
+    {
+      pcout << "   Solving Stokes system... " << std::flush;
+
+      PETScWrappers::MPI::BlockVector
+        distributed_stokes_solution (stokes_rhs);
+      distributed_stokes_solution = stokes_solution;
+
+      distributed_stokes_solution.block(1) /= EquationData::pressure_scaling;
+
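+                                     // Within the block vector, the pressure
+                                     // unknowns are numbered after all
+                                     // velocity unknowns, so the locally
+                                     // owned range of block(1) has to be
+                                     // shifted by the size of block(0) to
+                                     // obtain global indices that we can
+                                     // hand to
+                                     // <code>stokes_constraints.is_constrained()</code>: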
+      const unsigned int
+        start = (distributed_stokes_solution.block(0).size() +
+                 distributed_stokes_solution.block(1).local_range().first),
+        end   = (distributed_stokes_solution.block(0).size() +
+                 distributed_stokes_solution.block(1).local_range().second);
+      for (unsigned int i=start; i<end; ++i)
+        if (stokes_constraints.is_constrained (i))
+          distributed_stokes_solution(i) = 0;
+
+
+      PrimitiveVectorMemory<PETScWrappers::MPI::BlockVector> mem;
+
+      unsigned int n_iterations = 0;
+      const double solver_tolerance = 1e-8 * stokes_rhs.l2_norm();
+//      const double solver_tolerance = 1;
+      std::cout << "stokes_rhs.l2_norm() = " << stokes_rhs.l2_norm() << std::endl;
+      SolverControl solver_control (300, solver_tolerance);
+
+      try
+        {
+          const LinearSolvers::BlockSchurPreconditioner<PETScWrappers::PreconditionBoomerAMG,
+                                                        PETScWrappers::PreconditionJacobi>
+            preconditioner (stokes_matrix, stokes_preconditioner_matrix,
+                            *Mp_preconditioner, *Amg_preconditioner,
+                            false);
+
+          SolverGMRES<PETScWrappers::MPI::BlockVector>
+            solver(solver_control, mem,
+                   SolverGMRES<PETScWrappers::MPI::BlockVector>::
+                   AdditionalData(300, true));
+          solver.solve(stokes_matrix, distributed_stokes_solution, stokes_rhs,
+                       preconditioner);
+
+          n_iterations = solver_control.last_step();
+        }
+
+      catch (SolverControl::NoConvergence)
+        {
+          const LinearSolvers::BlockSchurPreconditioner<PETScWrappers::PreconditionBoomerAMG,
+                                                        PETScWrappers::PreconditionJacobi>
+            preconditioner (stokes_matrix, stokes_preconditioner_matrix,
+                            *Mp_preconditioner, *Amg_preconditioner,
+                            true);
+
+          SolverControl solver_control_refined (stokes_matrix.m(), solver_tolerance);
+          SolverGMRES<PETScWrappers::MPI::BlockVector>
+            solver(solver_control_refined, mem,
+                   SolverGMRES<PETScWrappers::MPI::BlockVector>::
+                   AdditionalData(500, true));
+          solver.solve(stokes_matrix, distributed_stokes_solution, stokes_rhs,
+                       preconditioner);
+
+          n_iterations = (solver_control.last_step() +
+                          solver_control_refined.last_step());
+        }
+
+
+      stokes_constraints.distribute (distributed_stokes_solution);
+
+      distributed_stokes_solution.block(1) *= EquationData::pressure_scaling;
+
+      stokes_solution = distributed_stokes_solution;
+      pcout << n_iterations  << " iterations."
+            << std::endl;
+
+      pcout << "distributed_stokes_solution linfty_norm: " <<
+            distributed_stokes_solution.block(0).linfty_norm() << ", " <<
+            distributed_stokes_solution.block(1).linfty_norm() << std::endl;
+
+    }
+    computing_timer.exit_section();
+
+
+                                     // Now let's turn to the temperature
+                                     // part: First, we compute the time step
+                                     // size. We found that we need smaller
+                                     // time steps for 3D than for 2D for the
+                                     // shell geometry. This is because the
+                                     // cells are more distorted in that case
+                                     // (it is the smallest edge length that
+                                     // determines the CFL number). Instead of
+                                     // computing the time step from maximum
+                                     // velocity and minimal mesh size as in
+                                     // step-31, we compute local CFL numbers,
+                                     // i.e., on each cell we compute the
+                                     // maximum velocity times the mesh size,
+                                     // and compute the maximum of
+                                     // them. Hence, we need to choose the
+                                     // factor in front of the time step
+                                     // slightly smaller.
+                                     //
+                                     // After temperature right hand side
+                                     // assembly, we solve the linear system
+                                     // for temperature (with fully
+                                     // distributed vectors without any
+                                     // ghosts), apply constraints and copy
+                                     // the vector back to one with ghosts.
+                                     //
+                                     // In the end, we extract the temperature
+                                     // range similarly to step-31 to produce
+                                     // some output (for example in order to
+                                     // help us choose the stabilization
+                                     // constants, as discussed in the
+                                     // introduction). The only difference is
+                                     // that we need to exchange maxima over
+                                     // all processors.
+    computing_timer.enter_section ("   Assemble temperature rhs");
+    {
+      old_time_step = time_step;
+
+      const double scaling = (dim==3 ? 0.25 : 1.0);
+      time_step = (scaling/(2.1*dim*std::sqrt(1.*dim)) /
+                   (parameters.temperature_degree *
+                    get_cfl_number()));
+
+      const double maximal_velocity = get_maximal_velocity();
+      pcout << "   Maximal velocity: "
+            << maximal_velocity *EquationData::year_in_seconds * 100
+            << " cm/year"
+            << std::endl;
+      pcout << "   " << "Time step: "
+            << time_step/EquationData::year_in_seconds
+            << " years"
+            << std::endl;
+
+      temperature_solution = old_temperature_solution;
+      assemble_temperature_system (maximal_velocity);
+    }
+    computing_timer.exit_section ();
+
+    computing_timer.enter_section ("   Solve temperature system");
+    {
+      SolverControl solver_control (temperature_matrix.m(),
+                                    1e-12*temperature_rhs.l2_norm());
+//      SolverCG<PETScWrappers::MPI::Vector>   cg (solver_control);
+      PETScWrappers::SolverGMRES cg(solver_control,MPI_COMM_WORLD);
+
+      PETScWrappers::MPI::Vector
+        distributed_temperature_solution (temperature_rhs);
+      distributed_temperature_solution = temperature_solution;
+
+      cg.solve (temperature_matrix, distributed_temperature_solution,
+                temperature_rhs, *T_preconditioner);
+//      cg.solve (temperature_matrix, distributed_temperature_solution,
+//                      temperature_rhs, PETScWrappers::PreconditionILU(temperature_matrix));
+                      
+      temperature_constraints.distribute (distributed_temperature_solution);
+      temperature_solution = distributed_temperature_solution;
+
+      pcout << "   "
+            << solver_control.last_step()
+            << " GMRES iterations for temperature" << std::endl;
+      computing_timer.exit_section();
+
+      double temperature[2] = { std::numeric_limits<double>::max(),
+                                -std::numeric_limits<double>::max() };
+      double global_temperature[2];
+
+//      for (unsigned int i=0; i<distributed_temperature_solution.local_size(); ++i)
+//        {
+//          temperature[0] = std::min<double> (temperature[0],
+//                                             distributed_temperature_solution.trilinos_vector()[0][i]);
+//          temperature[1] = std::max<double> (temperature[1],
+//                                             distributed_temperature_solution.trilinos_vector()[0][i]);
+//        }
+      temperature[0] = distributed_temperature_solution.min();
+      temperature[1] = distributed_temperature_solution.max();
+
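+                                     // To obtain the global minimum and
+                                     // maximum with a single reduction, we
+                                     // negate the local minimum, take the
+                                     // maximum over all processors, and
+                                     // negate the result again: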
+      temperature[0] *= -1.0;
+      Utilities::MPI::max (temperature, MPI_COMM_WORLD, global_temperature);
+      global_temperature[0] *= -1.0;
+
+      pcout << "   Temperature range: "
+            << global_temperature[0] << ' ' << global_temperature[1]
+            << std::endl;
+    }
+  }
+
+
+                                   // @sect4{BoussinesqFlowProblem::output_results}
+
+                                   // Next comes the function that generates
+                                   // the output. The quantities to output
+                                   // could be introduced manually like we did
+                                   // in step-31. An alternative is to hand
+                                   // this task over to a class PostProcessor
+                                   // that inherits from the class
+                                   // DataPostprocessor, which can be attached
+                                   // to DataOut. This allows us to output
+                                   // derived quantities from the solution,
+                                   // like the friction heating included in
+                                   // this example. It overloads the virtual
+                                   // function
+                                   // DataPostprocessor::compute_derived_quantities_vector,
+                                   // which is then internally called from
+                                   // DataOut::build_patches. We have to give
+                                   // it values of the numerical solution, its
+                                   // derivatives, normals to the cell, the
+                                   // actual evaluation points and any
+                                   // additional quantities. This follows the
+                                   // same procedure as discussed in step-29
+                                   // and other programs.
+  template <int dim>
+  class BoussinesqFlowProblem<dim>::Postprocessor : public DataPostprocessor<dim>
+  {
+    public:
+      Postprocessor (const unsigned int partition,
+                     const double       minimal_pressure);
+
+      virtual
+      void
+      compute_derived_quantities_vector (const std::vector<Vector<double> >              &uh,
+                                         const std::vector<std::vector<Tensor<1,dim> > > &duh,
+                                         const std::vector<std::vector<Tensor<2,dim> > > &dduh,
+                                         const std::vector<Point<dim> >                  &normals,
+                                         const std::vector<Point<dim> >                  &evaluation_points,
+                                         std::vector<Vector<double> >                    &computed_quantities) const;
+
+      virtual std::vector<std::string> get_names () const;
+
+      virtual
+      std::vector<DataComponentInterpretation::DataComponentInterpretation>
+      get_data_component_interpretation () const;
+
+      virtual UpdateFlags get_needed_update_flags () const;
+
+    private:
+      const unsigned int partition;
+      const double       minimal_pressure;
+  };
+
+
+  template <int dim>
+  BoussinesqFlowProblem<dim>::Postprocessor::
+  Postprocessor (const unsigned int partition,
+                 const double       minimal_pressure)
+                  :
+                  partition (partition),
+                  minimal_pressure (minimal_pressure)
+  {}
+
+
+                                   // Here we define the names for the
+                                   // variables we want to output. These are
+                                   // the actual solution values for velocity,
+                                   // pressure, and temperature, as well as
+                                   // the friction heating and, for each cell,
+                                   // the number of the processor that owns
+                                   // it. This allows us to visualize the
+                                   // partitioning of the domain among the
+                                   // processors. Except for the velocity,
+                                   // which is vector-valued, all other
+                                   // quantities are scalar.
+  template <int dim>
+  std::vector<std::string>
+  BoussinesqFlowProblem<dim>::Postprocessor::get_names() const
+  {
+    std::vector<std::string> solution_names (dim, "velocity");
+    solution_names.push_back ("p");
+    solution_names.push_back ("T");
+    solution_names.push_back ("friction_heating");
+    solution_names.push_back ("partition");
+
+    return solution_names;
+  }
+
+
+  template <int dim>
+  std::vector<DataComponentInterpretation::DataComponentInterpretation>
+  BoussinesqFlowProblem<dim>::Postprocessor::
+  get_data_component_interpretation () const
+  {
+    std::vector<DataComponentInterpretation::DataComponentInterpretation>
+      interpretation (dim,
+                      DataComponentInterpretation::component_is_part_of_vector);
+
+    interpretation.push_back (DataComponentInterpretation::component_is_scalar);
+    interpretation.push_back (DataComponentInterpretation::component_is_scalar);
+    interpretation.push_back (DataComponentInterpretation::component_is_scalar);
+    interpretation.push_back (DataComponentInterpretation::component_is_scalar);
+
+    return interpretation;
+  }
+
+
+  template <int dim>
+  UpdateFlags
+  BoussinesqFlowProblem<dim>::Postprocessor::get_needed_update_flags() const
+  {
+    return update_values | update_gradients | update_q_points;
+  }
+
+
+                                   // Now we implement the function that
+                                   // computes the derived quantities. As we
+                                   // also did for the output, we rescale the
+                                   // velocity from its SI units to something
+                                   // more readable, namely cm/year. Next, the
+                                   // pressure is shifted by its minimal value
+                                   // so that it ranges from zero to its
+                                   // maximum -- in essence making all
+                                   // pressure values nonnegative and easier
+                                   // to compare. Temperature is taken as is,
+                                   // and the friction heating is computed as
+                                   // $2 \eta \varepsilon(\mathbf{u}) :
+                                   // \varepsilon(\mathbf{u})$.
+                                   //
+                                   // The quantities we output here are more
+                                   // for illustration, rather than for actual
+                                   // scientific value. We come back to this
+                                   // briefly in the results section of this
+                                   // program and explain what one may in fact
+                                   // be interested in.
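+                                   //
+                                   // For reference, multiplying a velocity
+                                   // given in m/s by
+                                   // EquationData::year_in_seconds * 100
+                                   // converts it to cm/year, and the friction
+                                   // heating $2\eta\,\varepsilon(\mathbf{u}):
+                                   // \varepsilon(\mathbf{u})$ carries SI
+                                   // units of W/m^3.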
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::Postprocessor::
+  compute_derived_quantities_vector (const std::vector<Vector<double> >              &uh,
+                                     const std::vector<std::vector<Tensor<1,dim> > > &duh,
+                                     const std::vector<std::vector<Tensor<2,dim> > > &/*dduh*/,
+                                     const std::vector<Point<dim> >                  &/*normals*/,
+                                     const std::vector<Point<dim> >                  &/*evaluation_points*/,
+                                     std::vector<Vector<double> >                    &computed_quantities) const
+  {
+    const unsigned int n_quadrature_points = uh.size();
+    Assert (duh.size() == n_quadrature_points,                  ExcInternalError());
+    Assert (computed_quantities.size() == n_quadrature_points,  ExcInternalError());
+    Assert (uh[0].size() == dim+2,                              ExcInternalError());
+
+    for (unsigned int q=0; q<n_quadrature_points; ++q)
+      {
+        for (unsigned int d=0; d<dim; ++d)
+          computed_quantities[q](d)
+            = (uh[q](d) *  EquationData::year_in_seconds * 100);
+
+        const double pressure = (uh[q](dim)-minimal_pressure);
+        computed_quantities[q](dim) = pressure;
+
+        const double temperature = uh[q](dim+1);
+        computed_quantities[q](dim+1) = temperature;
+
+        Tensor<2,dim> grad_u;
+        for (unsigned int d=0; d<dim; ++d)
+          grad_u[d] = duh[q][d];
+        const SymmetricTensor<2,dim> strain_rate = symmetrize (grad_u);
+        computed_quantities[q](dim+2) = 2 * EquationData::eta *
+                                        strain_rate * strain_rate;
+
+        computed_quantities[q](dim+3) = partition;
+      }
+  }
+
+
+                                   // The <code>output_results()</code>
+                                   // function does mostly what the
+                                   // corresponding one did in step-31, in
+                                   // particular merging the data from the two
+                                   // DoFHandler objects (for the Stokes and
+                                   // the temperature parts of the problem)
+                                   // into one. There is one minor change: we
+                                   // make sure that each processor only works
+                                   // on the subdomain it owns locally (and
+                                   // not on ghost or artificial cells) when
+                                   // building the joint solution vector. The
+                                   // same will then have to be done in
+                                   // DataOut::build_patches(), but that
+                                   // function does so automatically.
+                                   //
+                                   // What we end up with is a set of patches
+                                   // that we can write using the functions in
+                                   // DataOutBase in a variety of output
+                                   // formats. Here, we have to take care
+                                   // that what each processor writes is
+                                   // really only its own part of the
+                                   // domain, i.e., we will want to write
+                                   // each processor's contribution into a
+                                   // separate file. This we do by adding an
+                                   // additional number to the filename when
+                                   // we write the solution. This is not
+                                   // really new; we did it similarly in
+                                   // step-40. Note that we write in the
+                                   // compressed @p .vtu format instead of
+                                   // plain vtk files, which saves a
+                                   // considerable amount of disk space.
+                                   //
+                                   // All the rest of the work is done in the
+                                   // Postprocessor class.
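+                                   //
+                                   // As a concrete example of the naming
+                                   // scheme used below, the output with
+                                   // index 3 written by processor 2 ends up
+                                   // in the file solution-00003.0002.vtu,
+                                   // accompanied by the master records
+                                   // solution-00003.pvtu and
+                                   // solution-00003.visit written by
+                                   // processor 0.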
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::output_results ()
+  {
+    computing_timer.enter_section ("Postprocessing");
+
+    const FESystem<dim> joint_fe (stokes_fe, 1,
+                                  temperature_fe, 1);
+
+    DoFHandler<dim> joint_dof_handler (triangulation);
+    joint_dof_handler.distribute_dofs (joint_fe);
+    Assert (joint_dof_handler.n_dofs() ==
+            stokes_dof_handler.n_dofs() + temperature_dof_handler.n_dofs(),
+            ExcInternalError());
+
+    PETScWrappers::MPI::Vector joint_solution;
+    joint_solution.reinit (MPI_COMM_WORLD, joint_dof_handler.locally_owned_dofs());
+
+    {
+      std::vector<unsigned int> local_joint_dof_indices (joint_fe.dofs_per_cell);
+      std::vector<unsigned int> local_stokes_dof_indices (stokes_fe.dofs_per_cell);
+      std::vector<unsigned int> local_temperature_dof_indices (temperature_fe.dofs_per_cell);
+
+      typename DoFHandler<dim>::active_cell_iterator
+        joint_cell       = joint_dof_handler.begin_active(),
+        joint_endc       = joint_dof_handler.end(),
+        stokes_cell      = stokes_dof_handler.begin_active(),
+        temperature_cell = temperature_dof_handler.begin_active();
+      for (; joint_cell!=joint_endc;
+           ++joint_cell, ++stokes_cell, ++temperature_cell)
+        if (joint_cell->is_locally_owned())
+          {
+            joint_cell->get_dof_indices (local_joint_dof_indices);
+            stokes_cell->get_dof_indices (local_stokes_dof_indices);
+            temperature_cell->get_dof_indices (local_temperature_dof_indices);
+
+            for (unsigned int i=0; i<joint_fe.dofs_per_cell; ++i)
+              if (joint_fe.system_to_base_index(i).first.first == 0)
+                {
+                  Assert (joint_fe.system_to_base_index(i).second
+                          <
+                          local_stokes_dof_indices.size(),
+                          ExcInternalError());
+
+                  joint_solution(local_joint_dof_indices[i])
+                    = stokes_solution(local_stokes_dof_indices
+                                      [joint_fe.system_to_base_index(i).second]);
+                }
+              else
+                {
+                  Assert (joint_fe.system_to_base_index(i).first.first == 1,
+                          ExcInternalError());
+                  Assert (joint_fe.system_to_base_index(i).second
+                          <
+                          local_temperature_dof_indices.size(),
+                          ExcInternalError());
+                  joint_solution(local_joint_dof_indices[i])
+                    = temperature_solution(local_temperature_dof_indices
+                                           [joint_fe.system_to_base_index(i).second]);
+                }
+          }
+    }
+
+                                     // Some of the entries set above belong
+                                     // to degrees of freedom owned by other
+                                     // processors, so exchange this
+                                     // information now to obtain a consistent
+                                     // vector:
+    joint_solution.compress();
+
+    IndexSet locally_relevant_joint_dofs(joint_dof_handler.n_dofs());
+    DoFTools::extract_locally_relevant_dofs (joint_dof_handler, locally_relevant_joint_dofs);
+    PETScWrappers::MPI::Vector locally_relevant_joint_solution;
+    locally_relevant_joint_solution.reinit (MPI_COMM_WORLD,
+                                            joint_dof_handler.locally_owned_dofs(),
+                                            locally_relevant_joint_dofs);
+    locally_relevant_joint_solution = joint_solution;
+
+    Postprocessor postprocessor (Utilities::MPI::this_mpi_process(MPI_COMM_WORLD),
+                                 stokes_solution.block(1).min());
+
+    DataOut<dim> data_out;
+    data_out.attach_dof_handler (joint_dof_handler);
+    data_out.add_data_vector (locally_relevant_joint_solution, postprocessor);
+    data_out.build_patches ();
+
+    static int out_index=0;
+    const std::string filename = ("solution-" +
+                                  Utilities::int_to_string (out_index, 5) +
+                                  "." +
+                                  Utilities::int_to_string
+                                  (triangulation.locally_owned_subdomain(), 4) +
+                                  ".vtu");
+    std::ofstream output (filename.c_str());
+    data_out.write_vtu (output);
+
+
+                                     // At this point, all processors have
+                                     // written their own files to disk. We
+                                     // could visualize them individually in
+                                     // Visit or Paraview, but in reality we
+                                     // of course want to visualize the whole
+                                     // set of files at once. To this end, we
+                                     // create a master file in each of the
+                                     // formats understood by Visit
+                                     // (<code>.visit</code>) and Paraview
+                                     // (<code>.pvtu</code>) on the zeroth
+                                     // processor that describes how the
+                                     // individual files define the global
+                                     // data set.
+    if (Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) == 0)
+      {
+        std::vector<std::string> filenames;
+        for (unsigned int i=0; i<Utilities::MPI::n_mpi_processes(MPI_COMM_WORLD); ++i)
+          filenames.push_back (std::string("solution-") +
+                               Utilities::int_to_string (out_index, 5) +
+                               "." +
+                               Utilities::int_to_string(i, 4) +
+                               ".vtu");
+        const std::string
+          pvtu_master_filename = ("solution-" +
+                                  Utilities::int_to_string (out_index, 5) +
+                                  ".pvtu");
+        std::ofstream pvtu_master (pvtu_master_filename.c_str());
+        data_out.write_pvtu_record (pvtu_master, filenames);
+
+        const std::string
+          visit_master_filename = ("solution-" +
+                                   Utilities::int_to_string (out_index, 5) +
+                                   ".visit");
+        std::ofstream visit_master (visit_master_filename.c_str());
+        data_out.write_visit_record (visit_master, filenames);
+      }
+
+    computing_timer.exit_section ();
+    out_index++;
+  }
+
+
+
+                                   // @sect4{BoussinesqFlowProblem::refine_mesh}
+
+                                   // This function isn't really new
+                                   // either. Since the
+                                   // <code>setup_dofs</code> function that we
+                                   // call in the middle has its own timer
+                                   // section, we split timing this function
+                                   // into two sections. It will also allow us
+                                   // to easily identify which of the two is
+                                   // more expensive.
+                                   //
+                                   // One thing of note, however, is that we
+                                   // only want to compute error indicators on
+                                   // the locally owned subdomain. In order to
+                                   // achieve this, we pass one additional
+                                   // argument to the
+                                   // KellyErrorEstimator::estimate
+                                   // function. Note that the vector for error
+                                   // estimates is resized to the number of
+                                   // active cells present on the current
+                                   // process, which is less than the total
+                                   // number of active cells on all processors
+                                   // (but more than the number of locally
+                                   // owned active cells); each processor only
+                                   // has a few coarse cells around the
+                                   // locally owned ones, as also explained in
+                                   // step-40.
+                                   //
+                                   // The local error estimates are then
+                                   // handed to a %parallel version of
+                                   // GridRefinement (in namespace
+                                   // parallel::distributed::GridRefinement,
+                                   // see also step-40) which looks at the
+                                   // errors and finds the cells that need
+                                   // refinement by comparing the error values
+                                   // across processors. As in step-31, we
+                                   // want to limit the maximum grid level. So
+                                   // in case some cells have been marked that
+                                   // are already at the finest level, we
+                                   // simply clear the refine flags.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::refine_mesh (const unsigned int max_grid_level)
+  {
+    computing_timer.enter_section ("Refine mesh structure, part 1");
+    Vector<float> estimated_error_per_cell (triangulation.n_active_cells());
+
+    KellyErrorEstimator<dim>::estimate (temperature_dof_handler,
+                                        QGauss<dim-1>(parameters.temperature_degree+1),
+                                        typename FunctionMap<dim>::type(),
+                                        temperature_solution,
+                                        estimated_error_per_cell,
+                                        std::vector<bool>(),
+                                        0,
+                                        0,
+                                        triangulation.locally_owned_subdomain());
+
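+                                     // The two fractions passed to the call
+                                     // below mean that cells accounting for
+                                     // roughly 30% of the total estimated
+                                     // error are flagged for refinement, and
+                                     // cells accounting for roughly 10% of it
+                                     // are flagged for coarsening.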
+    parallel::distributed::GridRefinement::
+      refine_and_coarsen_fixed_fraction (triangulation,
+                                         estimated_error_per_cell,
+                                         0.3, 0.1);
+
+    if (triangulation.n_levels() > max_grid_level)
+      for (typename Triangulation<dim>::active_cell_iterator
+             cell = triangulation.begin_active(max_grid_level);
+           cell != triangulation.end(); ++cell)
+        cell->clear_refine_flag ();
+
+                                     // With all flags marked as necessary, we
+                                     // set up the
+                                     // parallel::distributed::SolutionTransfer
+                                     // object to transfer the solutions for
+                                     // the current time level and the next
+                                     // older one. The syntax is similar to
+                                     // the non-%parallel solution transfer
+                                     // (with the exception that here a
+                                     // pointer to the vector entries is
+                                     // enough). The remainder of the function
+                                     // is concerned with setting up the data
+                                     // structures again after mesh refinement
+                                     // and restoring the solution vectors on
+                                     // the new mesh.
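+                                     //
+                                     // Note the required order of the calls
+                                     // that follow: first
+                                     // prepare_coarsening_and_refinement() on
+                                     // the triangulation, then
+                                     // prepare_for_coarsening_and_refinement()
+                                     // on the transfer objects, then the
+                                     // actual
+                                     // execute_coarsening_and_refinement();
+                                     // only after setup_dofs() has rebuilt
+                                     // the DoF and vector layout can
+                                     // interpolate() be called on the new
+                                     // mesh.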
+    std::vector<const PETScWrappers::MPI::Vector *> x_temperature (2);
+    x_temperature[0] = &temperature_solution;
+    x_temperature[1] = &old_temperature_solution;
+    std::vector<const PETScWrappers::MPI::BlockVector *> x_stokes (2);
+    x_stokes[0] = &stokes_solution;
+    x_stokes[1] = &old_stokes_solution;
+
+    parallel::distributed::SolutionTransfer<dim,PETScWrappers::MPI::Vector>
+      temperature_trans(temperature_dof_handler);
+    parallel::distributed::SolutionTransfer<dim,PETScWrappers::MPI::BlockVector>
+      stokes_trans(stokes_dof_handler);
+
+    triangulation.prepare_coarsening_and_refinement();
+    temperature_trans.prepare_for_coarsening_and_refinement(x_temperature);
+    stokes_trans.prepare_for_coarsening_and_refinement(x_stokes);
+
+    triangulation.execute_coarsening_and_refinement ();
+    computing_timer.exit_section();
+
+    setup_dofs ();
+
+    computing_timer.enter_section ("Refine mesh structure, part 2");
+
+    {
+      PETScWrappers::MPI::Vector distributed_temp1 (temperature_rhs);
+      PETScWrappers::MPI::Vector distributed_temp2 (temperature_rhs);
+
+      std::vector<PETScWrappers::MPI::Vector *> tmp (2);
+      tmp[0] = &(distributed_temp1);
+      tmp[1] = &(distributed_temp2);
+      temperature_trans.interpolate(tmp);
+
+      temperature_solution     = distributed_temp1;
+      old_temperature_solution = distributed_temp2;
+    }
+
+    {
+      PETScWrappers::MPI::BlockVector distributed_stokes (stokes_rhs);
+      PETScWrappers::MPI::BlockVector old_distributed_stokes (stokes_rhs);
+
+      std::vector<PETScWrappers::MPI::BlockVector *> stokes_tmp (2);
+      stokes_tmp[0] = &(distributed_stokes);
+      stokes_tmp[1] = &(old_distributed_stokes);
+
+      stokes_trans.interpolate (stokes_tmp);
+      stokes_solution     = distributed_stokes;
+      old_stokes_solution = old_distributed_stokes;
+    }
+
+    computing_timer.exit_section();
+  }
+
+
+
+                                   // @sect4{BoussinesqFlowProblem::run}
+
+                                   // This is the final and controlling
+                                   // function in this class. It, in fact,
+                                   // runs the entire rest of the program and
+                                   // is, once more, very similar to
+                                   // step-31. We use a different mesh now (a
+                                   // GridGenerator::hyper_shell instead of a
+                                   // simple cube geometry), and use the
+                                   // <code>project_temperature_field()</code>
+                                   // function instead of the library function
+                                   // <code>VectorTools::project</code>; the
+                                   // rest is as before.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::run ()
+  {
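+                                     // The last (<code>true</code>) argument
+                                     // to GridGenerator::hyper_shell requests
+                                     // distinct boundary indicators: 0 for
+                                     // the inner and 1 for the outer surface
+                                     // of the shell. This is why both
+                                     // indicators are associated with the
+                                     // HyperShellBoundary object below.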
+    GridGenerator::hyper_shell (triangulation,
+                                Point<dim>(),
+                                EquationData::R0,
+                                EquationData::R1,
+                                (dim==3) ? 96 : 12,
+                                true);
+    static HyperShellBoundary<dim> boundary;
+    triangulation.set_boundary (0, boundary);
+    triangulation.set_boundary (1, boundary);
+
+    global_Omega_diameter = GridTools::diameter (triangulation);
+
+    triangulation.refine_global (parameters.initial_global_refinement);
+
+    setup_dofs();
+
+    unsigned int pre_refinement_step = 0;
+
+    start_time_iteration:
+
+    project_temperature_field ();
+
+    timestep_number           = 0;
+    time_step = old_time_step = 0;
+
+    double time = 0;
+
+    do
+      {
+                                         // Limit this test run to three time
+                                         // steps so that it finishes quickly.
+        if (timestep_number == 3)
+          break;
+
+        pcout << "Timestep " << timestep_number
+              << ":  t=" << time/EquationData::year_in_seconds
+              << " years"
+              << std::endl;
+
+        assemble_stokes_system ();
+
+        build_stokes_preconditioner ();
+        assemble_temperature_matrix ();
+
+        solve ();
+
+        pcout << std::endl;
+
+        if ((timestep_number == 0) &&
+            (pre_refinement_step < parameters.initial_adaptive_refinement))
+          {
+            refine_mesh (parameters.initial_global_refinement +
+                         parameters.initial_adaptive_refinement);
+            ++pre_refinement_step;
+            goto start_time_iteration;
+          }
+        else if ((timestep_number > 0)
+                 &&
+                 (timestep_number % parameters.adaptive_refinement_interval == 0))
+          refine_mesh (parameters.initial_global_refinement +
+                       parameters.initial_adaptive_refinement);
+
+        if ((parameters.generate_graphical_output == true)
+            &&
+            (timestep_number % parameters.graphical_output_interval == 0))
+          output_results ();
+
+                                         // In order to speed up linear
+                                         // solvers, we extrapolate the
+                                         // solutions from the old time levels
+                                         // to the new one. This gives a very
+                                         // good initial guess, cutting the
+                                         // number of iterations needed in
+                                         // solvers by more than one half. We
+                                         // do not need to extrapolate in the
+                                         // last iteration, so if we reached
+                                         // the final time, we stop here.
+                                         //
+                                         // As the last thing during a
+                                         // time step (before actually
+                                         // bumping up the number of
+                                         // the time step), we check
+                                         // whether the current time
+                                         // step number is divisible
+                                         // by 100, and if so we let
+                                         // the computing timer print
+                                         // a summary of CPU times
+                                         // spent so far.
+        if (time > parameters.end_time * EquationData::year_in_seconds)
+          break;
+
+        PETScWrappers::MPI::BlockVector old_old_stokes_solution(old_stokes_solution);
+        old_old_stokes_solution      = old_stokes_solution;
+        old_stokes_solution          = stokes_solution;
+        old_old_temperature_solution = old_temperature_solution;
+        old_temperature_solution     = temperature_solution;
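+                                         // Concretely, with the ratio
+                                         // r = time_step/old_time_step, the
+                                         // extrapolated initial guess
+                                         // computed below is
+                                         // (1+r)*u_current - r*u_previous,
+                                         // i.e. linear extrapolation in time,
+                                         // implemented via sadd() on fully
+                                         // distributed copies of the solution
+                                         // vectors.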
+        if (old_time_step > 0)
+          {
+            // sadd() does not like ghosted vectors, not even as input (this
+            // was already the case for the Trilinos version), so copy the
+            // data into fully distributed vectors for now:
+            {
+              PETScWrappers::MPI::BlockVector distr_solution (stokes_rhs);
+              distr_solution = stokes_solution;
+              PETScWrappers::MPI::BlockVector distr_old_solution (stokes_rhs);
+              distr_old_solution = old_old_stokes_solution;
+              distr_solution.sadd (1.+time_step/old_time_step,
+                                   -time_step/old_time_step,
+                                   distr_old_solution);
+              stokes_solution = distr_solution;
+            }
+            {
+              PETScWrappers::MPI::Vector distr_solution (temperature_rhs);
+              distr_solution = temperature_solution;
+              PETScWrappers::MPI::Vector distr_old_solution (temperature_rhs);
+              distr_old_solution = old_old_temperature_solution;
+              distr_solution.sadd (1.+time_step/old_time_step,
+                                   -time_step/old_time_step,
+                                   distr_old_solution);
+              temperature_solution = distr_solution;
+            }
+          }
+
+        if ((timestep_number > 0) && (timestep_number % 100 == 0))
+          computing_timer.print_summary ();
+
+        time += time_step;
+        ++timestep_number;
+      }
+    while (true);
+
+                                     // If we are generating graphical
+                                     // output, do so also for the last
+                                     // time step unless we had just
+                                     // done so before we left the
+                                     // do-while loop
+    if ((parameters.generate_graphical_output == true)
+        &&
+        !((timestep_number-1) % parameters.graphical_output_interval == 0))
+      output_results ();
+  }
+}
+
+
+
+                                 // @sect3{The <code>main</code> function}
+
+                                 // The main function is short as usual and
+                                 // very similar to the one in step-31. Since
+                                 // we use a parameter file which is specified
+                                 // as an argument in the command line, we
+                                 // have to read it in here and pass it on to
+                                 // the Parameters class for parsing. If no
+                                 // filename is given in the command line, we
+                                 // simply use the <code>\step-32.prm</code>
+                                 // file which is distributed together with
+                                 // the program.
+                                 //
+                                 // Because 3d computations are simply
+                                 // very slow unless you throw a lot
+                                 // of processors at them, the program
+                                 // defaults to 2d. You can get the 3d
+                                 // version by changing the constant
+                                 // dimension below to 3.
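+                                 //
+                                 // In a typical parallel run one would invoke
+                                 // the compiled executable through the MPI
+                                 // launcher, for example (the executable name
+                                 // depends on how the file is built):
+                                 //
+                                 //   mpirun -np 4 ./test-step-32 step-32.prm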
+int main (int argc, char *argv[])
+{
+  using namespace Step32;
+  using namespace dealii;
+
+  Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv);
+  // Initialize PETSc explicitly as well, since this program uses the PETSc
+  // wrappers for all of its parallel linear algebra.
+  PetscInitialize(&argc, &argv, 0, 0);
+
+  std::cout << "dealii::Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) = "
+            << dealii::Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) << std::endl;
+  try
+    {
+      deallog.depth_console (0);
+
+      std::string parameter_filename;
+      if (argc>=2)
+        parameter_filename = argv[1];
+      else
+        parameter_filename = "step-32.prm";
+
+      const int dim = 2;
+      BoussinesqFlowProblem<dim>::Parameters  parameters(parameter_filename);
+      BoussinesqFlowProblem<dim> flow_problem (parameters);
+      flow_problem.m_myrank = dealii::Utilities::MPI::this_mpi_process(MPI_COMM_WORLD);
+      flow_problem.run ();
+    }
+  catch (std::exception &exc)
+    {
+      std::cerr << std::endl << std::endl
+                << "----------------------------------------------------"
+                << std::endl;
+      std::cerr << "Exception on processing: " << std::endl
+                << exc.what() << std::endl
+                << "Aborting!" << std::endl
+                << "----------------------------------------------------"
+                << std::endl;
+
+      return 1;
+    }
+  catch (...)
+    {
+      std::cerr << std::endl << std::endl
+                << "----------------------------------------------------"
+                << std::endl;
+      std::cerr << "Unknown exception!" << std::endl
+                << "Aborting!" << std::endl
+                << "----------------------------------------------------"
+                << std::endl;
+      return 1;
+    }
+
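+  // Release the vectors that are still cached in the GrowingVectorMemory
+  // pools before PETSc is finalized; otherwise their destructors would run
+  // only after PetscFinalize().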
+  dealii::GrowingVectorMemory<dealii::PETScWrappers::MPI::Vector>::release_unused_memory ();
+  dealii::GrowingVectorMemory<dealii::PETScWrappers::Vector>::release_unused_memory ();
+  PetscFinalize();
+
+  return 0;
+}


