[cig-commits] r1394 - in branches/s-wang2: . for_deal.II for_deal.II/examples for_deal.II/examples/step-32 for_deal.II/include for_deal.II/include/deal.II for_deal.II/include/deal.II/lac for_deal.II/source for_deal.II/source/lac for_deal.II/source/numerics include/aspect include/aspect/particle include/aspect/postprocess source source/postprocess source/simulator

s-wang at dealii.org s-wang at dealii.org
Wed Nov 28 21:55:57 PST 2012


Author: s-wang
Date: 2012-11-28 22:55:56 -0700 (Wed, 28 Nov 2012)
New Revision: 1394

Added:
   branches/s-wang2/for_deal.II/
   branches/s-wang2/for_deal.II/examples/
   branches/s-wang2/for_deal.II/examples/step-32/
   branches/s-wang2/for_deal.II/examples/step-32/test-step-32.cc
   branches/s-wang2/for_deal.II/include/
   branches/s-wang2/for_deal.II/include/deal.II/
   branches/s-wang2/for_deal.II/include/deal.II/lac/
   branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_matrix_base.h
   branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_parallel_block_vector.h
   branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_solver.h
   branches/s-wang2/for_deal.II/include/deal.II/lac/trilinos_sparse_matrix.h
   branches/s-wang2/for_deal.II/include/deal.II/lac/trilinos_vector_base.h
   branches/s-wang2/for_deal.II/source/
   branches/s-wang2/for_deal.II/source/lac/
   branches/s-wang2/for_deal.II/source/lac/constraint_matrix.cc
   branches/s-wang2/for_deal.II/source/lac/petsc_matrix_base.cc
   branches/s-wang2/for_deal.II/source/lac/petsc_solver.cc
   branches/s-wang2/for_deal.II/source/lac/trilinos_sparse_matrix.cc
   branches/s-wang2/for_deal.II/source/lac/trilinos_vector_base.cc
   branches/s-wang2/for_deal.II/source/numerics/
   branches/s-wang2/for_deal.II/source/numerics/derivative_approximation.inst.in
   branches/s-wang2/include/aspect/global_trilinos.h
Modified:
   branches/s-wang2/include/aspect/global.h
   branches/s-wang2/include/aspect/particle/integrator.h
   branches/s-wang2/include/aspect/particle/world.h
   branches/s-wang2/include/aspect/postprocess/interface.h
   branches/s-wang2/include/aspect/simulator.h
   branches/s-wang2/source/main.cc
   branches/s-wang2/source/postprocess/composition_statistics.cc
   branches/s-wang2/source/postprocess/temperature_statistics.cc
   branches/s-wang2/source/simulator/assembly.cc
   branches/s-wang2/source/simulator/core.cc
   branches/s-wang2/source/simulator/helper_functions.cc
   branches/s-wang2/source/simulator/initial_conditions.cc
   branches/s-wang2/source/simulator/solver.cc
Log:
merged with new aspect.

Added: branches/s-wang2/for_deal.II/examples/step-32/test-step-32.cc
===================================================================
--- branches/s-wang2/for_deal.II/examples/step-32/test-step-32.cc	                        (rev 0)
+++ branches/s-wang2/for_deal.II/examples/step-32/test-step-32.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,4649 @@
+/* Author: Martin Kronbichler, Uppsala University,
+           Wolfgang Bangerth, Texas A&M University,
+           Timo Heister, University of Goettingen, 2008-2011 */
+/*                                                                */
+/*    Copyright (C) 2008, 2009, 2010, 2011, 2012 by the deal.II authors */
+/*                                                                */
+/*    This file is subject to QPL and may not be  distributed     */
+/*    without copyright and license information. Please refer     */
+/*    to the file deal.II/doc/license.html for the  text  and     */
+/*    further information on this license.                        */
+
+				 // @sect3{Include files}
+
+				 // The first task, as usual, is to
+				 // include the functionality of these
+				 // well-known deal.II library files
+				 // and some C++ header files.
+#include <deal.II/base/quadrature_lib.h>
+#include <deal.II/base/logstream.h>
+#include <deal.II/base/function.h>
+#include <deal.II/base/utilities.h>
+#include <deal.II/base/conditional_ostream.h>
+#include <deal.II/base/work_stream.h>
+#include <deal.II/base/timer.h>
+#include <deal.II/base/parameter_handler.h>
+
+#include <deal.II/lac/full_matrix.h>
+#include <deal.II/lac/solver_bicgstab.h>
+#include <deal.II/lac/solver_cg.h>
+#include <deal.II/lac/solver_gmres.h>
+#include <deal.II/lac/constraint_matrix.h>
+#include <deal.II/lac/block_sparsity_pattern.h>
+#include <deal.II/lac/petsc_parallel_vector.h>
+#include <deal.II/lac/petsc_block_vector.h>
+#include <deal.II/lac/petsc_parallel_block_vector.h>
+#include <deal.II/lac/petsc_sparse_matrix.h>
+#include <deal.II/lac/petsc_parallel_sparse_matrix.h>
+#include <deal.II/lac/petsc_parallel_block_sparse_matrix.h>
+#include <deal.II/lac/petsc_precondition.h>
+#include <deal.II/lac/petsc_solver.h>
+
+#include <deal.II/grid/tria.h>
+#include <deal.II/grid/grid_generator.h>
+#include <deal.II/grid/tria_accessor.h>
+#include <deal.II/grid/tria_iterator.h>
+#include <deal.II/grid/filtered_iterator.h>
+#include <deal.II/grid/tria_boundary_lib.h>
+#include <deal.II/grid/grid_tools.h>
+#include <deal.II/grid/grid_refinement.h>
+
+#include <deal.II/dofs/dof_handler.h>
+#include <deal.II/dofs/dof_renumbering.h>
+#include <deal.II/dofs/dof_accessor.h>
+#include <deal.II/dofs/dof_tools.h>
+
+#include <deal.II/fe/fe_q.h>
+#include <deal.II/fe/fe_dgq.h>
+#include <deal.II/fe/fe_dgp.h>
+#include <deal.II/fe/fe_system.h>
+#include <deal.II/fe/fe_values.h>
+#include <deal.II/fe/mapping_q.h>
+
+#include <deal.II/numerics/vector_tools.h>
+#include <deal.II/numerics/matrix_tools.h>
+#include <deal.II/numerics/data_out.h>
+#include <deal.II/numerics/error_estimator.h>
+#include <deal.II/numerics/solution_transfer.h>
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <limits>
+#include <locale>
+#include <string>
+
+				 // This is the only include file that
+				 // is new: It introduces the
+				 // parallel::distributed::SolutionTransfer
+				 // equivalent of the
+				 // dealii::SolutionTransfer class to
+				 // take a solution from one mesh to
+				 // the next one upon mesh refinement,
+				 // but in the case of parallel
+				 // distributed triangulations:
+#include <deal.II/distributed/solution_transfer.h>
+
+				 // The following classes are used in
+				 // parallel distributed computations
+				 // and have all already been
+				 // introduced in step-40:
+#include <deal.II/base/index_set.h>
+#include <deal.II/distributed/tria.h>
+#include <deal.II/distributed/grid_refinement.h>
+
+
+/**
+ * Utilities to replace Trilinos with PETSc.
+ */
+namespace CIG
+{
+/**
+ * Convert the block partitioning used for the Trilinos data structures
+ * into the block and local size information needed for PETSc.
+ * It is assumed that block_partition.size()==2.
+ */
+void convert_block_partitioning(
+		const std::vector<dealii::IndexSet> &block_partition,
+		int n_u, int n_p,
+		std::vector<unsigned int> &block_sizes,
+		std::vector<unsigned int> &local_sizes)
+{
+	Assert(block_partition.size()==2, dealii::ExcMessage("block_partition must contain exactly two blocks."));
+
+	// init,
+	block_sizes.clear();
+	local_sizes.clear();
+
+	// block_sizes
+	block_sizes.push_back(n_u);
+	block_sizes.push_back(n_p);
+
+	// local_sizes
+	local_sizes.push_back(block_partition[0].n_elements());
+	local_sizes.push_back(block_partition[1].n_elements());
+}
+
+
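+/**
+ * Initialize a 2x2 PETSc block sparse matrix: each block is created on
+ * MPI_COMM_WORLD with the given global and local block sizes, reserving
+ * max_coupling_between_dofs entries per row.
+ */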
+void setup_petsc_matrix(
+		std::vector<unsigned int> &block_sizes,
+		std::vector<unsigned int> &local_sizes,
+		int max_coupling_between_dofs,
+		dealii::PETScWrappers::MPI::BlockSparseMatrix  &matrix)
+{
+	Assert(block_sizes.size()==2, dealii::ExcMessage("block_sizes must contain exactly two blocks."));
+
+	int n_u = block_sizes[0];
+	int n_p = block_sizes[1];
+	matrix.reinit(2,2);
+	matrix.block(0,0).reinit(
+			MPI_COMM_WORLD,n_u,n_u,local_sizes[0],local_sizes[0],max_coupling_between_dofs);
+	matrix.block(0,1).reinit(
+				MPI_COMM_WORLD,n_u,n_p,local_sizes[0],local_sizes[1],max_coupling_between_dofs);
+	matrix.block(1,0).reinit(
+				MPI_COMM_WORLD,n_p,n_u,local_sizes[1],local_sizes[0],max_coupling_between_dofs);
+	matrix.block(1,1).reinit(
+				MPI_COMM_WORLD,n_p,n_p,local_sizes[1],local_sizes[1],max_coupling_between_dofs);
+	matrix.collect_sizes();
+}
+
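+/**
+ * Initialize a two-block PETSc vector with ghost entries: each block is
+ * reinitialized with its locally owned and locally relevant index sets.
+ */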
+void setup_petsc_vector(
+		std::vector<unsigned int> &block_sizes,
+		 std::vector<dealii::IndexSet> &partitioning,
+		 std::vector<dealii::IndexSet> &relevant_partitioning,
+		 dealii::PETScWrappers::MPI::BlockVector &vector)
+{
+	Assert(block_sizes.size()==2, dealii::ExcMessage("block_sizes must contain exactly two blocks."));
+
+	vector.reinit(block_sizes,MPI_COMM_WORLD);
+	vector.block(0).reinit(MPI_COMM_WORLD,partitioning[0],relevant_partitioning[0]);
+	vector.block(1).reinit(MPI_COMM_WORLD,partitioning[1],relevant_partitioning[1]);
+	vector.collect_sizes();
+}
+
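+/**
+ * Round every locally owned entry of a vector down to the nearest
+ * integer; apparently a debugging aid for comparing results across
+ * linear algebra backends.
+ */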
+template <class VectorType>
+void reduce_accuracy(VectorType &vector)
+{
+	std::pair<unsigned int,unsigned int> range = vector.local_range();
+	for(unsigned int i=range.first; i<range.second; i++)
+		vector[i] = std::floor(vector[i]);
+	vector.compress();
+}
+
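+/**
+ * A usage sketch (hypothetical; not called anywhere in this program)
+ * showing how the helpers above fit together for a two-block Stokes
+ * system. All parameter names here are illustrative.
+ */
+void example_setup_stokes_objects(
+		std::vector<dealii::IndexSet> &partitioning,
+		std::vector<dealii::IndexSet> &relevant_partitioning,
+		int n_u, int n_p,
+		int max_coupling_between_dofs,
+		dealii::PETScWrappers::MPI::BlockSparseMatrix &matrix,
+		dealii::PETScWrappers::MPI::BlockVector &vector)
+{
+	// derive global block sizes and per-process local sizes
+	// from the owned index sets,
+	std::vector<unsigned int> block_sizes, local_sizes;
+	convert_block_partitioning(partitioning, n_u, n_p,
+			block_sizes, local_sizes);
+
+	// then build the matching block matrix and ghosted block vector.
+	setup_petsc_matrix(block_sizes, local_sizes,
+			max_coupling_between_dofs, matrix);
+	setup_petsc_vector(block_sizes, partitioning,
+			relevant_partitioning, vector);
+}
+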
+}
+
+				 // The next step is like in all
+				 // previous tutorial programs: We put
+				 // everything into a namespace of its
+				 // own and then import the deal.II
+				 // classes and functions into it:
+namespace Step32
+{
+  using namespace dealii;
+
+				   // @sect3{Equation data}
+
+				   // In the following namespace, we
+				   // define the various pieces of
+				   // equation data that describe the
+				   // problem. This corresponds to the
+				   // various aspects of making the
+				   // problem at least slightly
+				   // realistic and that were
+				   // exhaustively discussed in the
+				   // description of the testcase in
+				   // the introduction.
+				   //
+				   // We start with a few coefficients
+				   // that have constant values (the
+				   // comment after the value
+				   // indicates its physical units):
+  namespace EquationData
+  {
+    const double eta                   = 1e21;    /* Pa s       */
+    const double kappa                 = 1e-6;    /* m^2 / s    */
+    const double reference_density     = 3300;    /* kg / m^3   */
+    const double reference_temperature = 293;     /* K          */
+    const double expansion_coefficient = 2e-5;    /* 1/K        */
+    const double specific_heat         = 1250;    /* J / K / kg */
+    const double radiogenic_heating    = 7.4e-12; /* W / kg     */
+
+
+    const double R0      = 6371000.-2890000.;     /* m          */
+    const double R1      = 6371000.-  35000.;     /* m          */
+
+    const double T0      = 4000+273;              /* K          */
+    const double T1      =  700+273;              /* K          */
+
+
+				     // The next set of definitions
+				     // are for functions that encode
+				     // the density as a function of
+				     // temperature, the gravity
+				     // vector, and the initial values
+				     // for the temperature. Again,
+				     // all of these (along with the
+				     // values they compute) are
+				     // discussed in the introduction:
+    double density (const double temperature)
+    {
+      return (reference_density *
+	      (1 - expansion_coefficient * (temperature -
+					    reference_temperature)));
+    }
+
+
+    template <int dim>
+    Tensor<1,dim> gravity_vector (const Point<dim> &p)
+    {
+      const double r = p.norm();
+      return -(1.245e-6 * r + 7.714e13/r/r) * p / r;
+    }
+
+
+
+    template <int dim>
+    class TemperatureInitialValues : public Function<dim>
+    {
+      public:
+	TemperatureInitialValues () : Function<dim>(1) {}
+
+	virtual double value (const Point<dim>   &p,
+			      const unsigned int  component = 0) const;
+
+	virtual void vector_value (const Point<dim> &p,
+				   Vector<double>   &value) const;
+    };
+
+
+
+    template <int dim>
+    double
+    TemperatureInitialValues<dim>::value (const Point<dim>  &p,
+					  const unsigned int) const
+    {
+      const double r = p.norm();
+      const double h = R1-R0;
+
+      const double s = (r-R0)/h;
+      const double q = (dim==3)?std::max(0.0,cos(numbers::PI*std::fabs(p(2)/R1))):1.0;
+      const double phi   = std::atan2(p(0),p(1));
+      const double tau = s
+			 +
+			 0.2 * s * (1-s) * std::sin(6*phi) * q;
+
+      return T0*(1.0-tau) + T1*tau;
+    }
+
+
+    template <int dim>
+    void
+    TemperatureInitialValues<dim>::vector_value (const Point<dim> &p,
+						 Vector<double>   &values) const
+    {
+      for (unsigned int c=0; c<this->n_components; ++c)
+	values(c) = TemperatureInitialValues<dim>::value (p, c);
+    }
+
+
+				     // As mentioned in the
+				     // introduction we need to
+				     // rescale the pressure to avoid
+				     // the relative ill-conditioning
+				     // of the momentum and mass
+				     // conservation equations. The
+				     // scaling factor is
+				     // $\frac{\eta}{L}$ where $L$ was
+				     // a typical length scale. By
+				     // experimenting it turns out
+				     // that a good length scale is
+				     // the diameter of plumes, which
+				     // is around 10 km:
+    const double pressure_scaling = eta / 10000;
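+				     // With the concrete values above
+				     // this amounts to pressure_scaling
+				     // = 1e21 / 1e4 = 1e17.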
+
+				     // The final number in this
+				     // namespace is a constant that
+				     // denotes the number of seconds
+				     // per (average, tropical)
+				     // year. We use this only when
+				     // generating screen output:
+				     // internally, all computations
+				     // of this program happen in SI
+				     // units (kilogram, meter,
+				     // seconds) but writing
+				     // geological times in seconds
+				     // yields numbers that one can't
+				     // relate to reality, and so we
+				     // convert to years using the
+				     // factor defined here:
+    const double year_in_seconds  = 60*60*24*365.2425;
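+				     // (numerically, year_in_seconds is
+				     // about 3.15569e7, so a simulated
+				     // time t in seconds is reported as
+				     // t/year_in_seconds years)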
+
+  }
+
+
+
+				   // @sect3{Preconditioning the Stokes system}
+
+				   // This namespace implements the
+				   // preconditioner. As discussed in the
+				   // introduction, this preconditioner
+				   // differs in a number of key portions from
+				   // the one used in step-31. Specifically,
+				   // it is a right preconditioner,
+				   // implementing the matrix
+				   // @f{align*}\left(\begin{array}{cc}A^{-1}
+				   // & B^T \\ 0 & S^{-1}\end{array}\right)@f}
+				   // where the two inverse matrix operations
+				   // are approximated by linear solvers or,
+				   // if the right flag is given to the
+				   // constructor of this class, by a single
+				   // AMG V-cycle for the velocity block. The
+				   // three code blocks of the
+				   // <code>vmult</code> function implement
+				   // the multiplications with the three
+				   // blocks of this preconditioner matrix and
+				   // should be self-explanatory if you have
+				   // read through step-31 or the discussion
+				   // of composing solvers in step-20.
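+				   //
+				   // The class is meant to be used along
+				   // the following lines (a sketch only;
+				   // the types named here are the member
+				   // variables declared further down):
+				   // @code
+				   //   const LinearSolvers::BlockSchurPreconditioner
+				   //     <PETScWrappers::PreconditionBoomerAMG,
+				   //      PETScWrappers::PreconditionJacobi>
+				   //     preconditioner (stokes_matrix,
+				   //                     stokes_preconditioner_matrix,
+				   //                     *Mp_preconditioner,
+				   //                     *Amg_preconditioner,
+				   //                     false);
+				   //   SolverFGMRES<PETScWrappers::MPI::BlockVector>
+				   //     solver (solver_control);
+				   //   solver.solve (stokes_matrix, stokes_solution,
+				   //                 stokes_rhs, preconditioner);
+				   // @endcode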
+  namespace LinearSolvers
+  {
+    template <class PreconditionerA, class PreconditionerMp>
+    class BlockSchurPreconditioner : public Subscriptor
+    {
+      public:
+	BlockSchurPreconditioner (const PETScWrappers::MPI::BlockSparseMatrix  &S,
+				  const PETScWrappers::MPI::BlockSparseMatrix  &Spre,
+				  const PreconditionerMp                     &Mppreconditioner,
+				  const PreconditionerA                      &Apreconditioner,
+				  const bool                                  do_solve_A)
+			:
+			stokes_matrix     (&S),
+			stokes_preconditioner_matrix     (&Spre),
+			mp_preconditioner (Mppreconditioner),
+			a_preconditioner  (Apreconditioner),
+			do_solve_A        (do_solve_A)
+	  {}
+
+	void vmult (PETScWrappers::MPI::BlockVector       &dst,
+		    const PETScWrappers::MPI::BlockVector &src) const
+	  {
+	    PETScWrappers::MPI::Vector utmp(src.block(0));
+
+	    {
+	      SolverControl solver_control(5000, 1e-6 * src.block(1).l2_norm());
+
+	      SolverCG<PETScWrappers::MPI::Vector> solver(solver_control);
+
+	      solver.solve(stokes_preconditioner_matrix->block(1,1),
+			   dst.block(1), src.block(1),
+			   mp_preconditioner);
+
+	      dst.block(1) *= -1.0;
+	    }
+
+	    {
+	      stokes_matrix->block(0,1).vmult(utmp, dst.block(1));
+	      utmp*=-1.0;
+	      utmp.add(src.block(0));
+	    }
+
+	    if (do_solve_A == true)
+	      {
+		SolverControl solver_control(5000, utmp.l2_norm()*1e-2);
+		PETScWrappers::SolverCG solver(solver_control);
+		solver.solve(stokes_matrix->block(0,0), dst.block(0), utmp,
+			     a_preconditioner);
+	      }
+	    else
+	      a_preconditioner.vmult (dst.block(0), utmp);
+	  }
+
+      private:
+	const SmartPointer<const PETScWrappers::MPI::BlockSparseMatrix> stokes_matrix;
+	const SmartPointer<const PETScWrappers::MPI::BlockSparseMatrix> stokes_preconditioner_matrix;
+	const PreconditionerMp &mp_preconditioner;
+	const PreconditionerA  &a_preconditioner;
+	const bool do_solve_A;
+    };
+  }
+
+
+
+				   // @sect3{Definition of assembly data structures}
+				   //
+				   // As described in the
+				   // introduction, we will use the
+				   // WorkStream mechanism discussed
+				   // in the @ref threads module to
+				   // parallelize operations among the
+				   // processors of a single
+				   // machine. The WorkStream class
+				   // requires that data is passed
+				   // around in two kinds of data
+				   // structures, one for scratch data
+				   // and one to pass data from the
+				   // assembly function to the
+				   // function that copies local
+				   // contributions into global
+				   // objects.
+				   //
+				   // The following namespace (and the
+				   // two sub-namespaces) contains a
+				   // collection of data structures
+				   // that serve this purpose, one
+				   // pair for each of the four
+				   // operations discussed in the
+				   // introduction that we will want
+				   // to parallelize. Each assembly
+				   // routine gets two sets of data: a
+				   // Scratch array that collects all
+				   // the classes and arrays that are
+				   // used for the calculation of the
+				   // cell contribution, and a
+				   // CopyData array that keeps local
+				   // matrices and vectors which will
+				   // be written into the global
+				   // matrix. Whereas CopyData is a
+				   // container for the final data
+				   // that is written into the global
+				   // matrices and vector (and, thus,
+				   // absolutely necessary), the
+				   // Scratch arrays are merely there
+				   // for performance reasons &mdash;
+				   // it would be much more expensive
+				   // to set up a FEValues object on
+				   // each cell than to create it only
+				   // once and update some
+				   // derivative data.
+				   //
+				   // Step-31 had four assembly
+				   // routines: One for the
+				   // preconditioner matrix of the
+				   // Stokes system, one for the
+				   // Stokes matrix and right hand
+				   // side, one for the temperature
+				   // matrices and one for the right
+				   // hand side of the temperature
+				   // equation. We here organize the
+				   // scratch arrays and CopyData
+				   // objects for each of those four
+				   // assembly components using a
+				   // <code>struct</code> environment
+				   // (since we consider these as
+				   // temporary objects we pass
+				   // around, rather than classes that
+				   // implement functionality of their
+				   // own, though this is a more
+				   // subjective point of view to
+				   // distinguish between
+				   // <code>struct</code>s and
+				   // <code>class</code>es).
+				   //
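+				   // The basic invocation pattern, shown
+				   // here only as a sketch (the actual
+				   // calls appear in the assemble_*
+				   // functions below), looks like this:
+				   // @code
+				   //   WorkStream::run
+				   //     (dof_handler.begin_active(),
+				   //      dof_handler.end(),
+				   //      std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+				   //                       local_assemble_stokes_system,
+				   //                       this,
+				   //                       std_cxx1x::_1,
+				   //                       std_cxx1x::_2,
+				   //                       std_cxx1x::_3),
+				   //      std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+				   //                       copy_local_to_global_stokes_system,
+				   //                       this,
+				   //                       std_cxx1x::_1),
+				   //      Assembly::Scratch::StokesSystem<dim> (...),
+				   //      Assembly::CopyData::StokesSystem<dim> (...));
+				   // @endcode
+				   //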
+				   // Regarding the Scratch objects,
+				   // each struct is equipped with a
+				   // constructor that creates an
+				   // FEValues object for a @ref
+				   // FiniteElement "finite element",
+				   // a @ref Quadrature "quadrature formula",
+				   // the @ref Mapping "mapping" that
+				   // describes the
+				   // interpolation of curved
+				   // boundaries, and some @ref
+				   // UpdateFlags "update flags".
+				   // Moreover, we manually implement
+				   // a copy constructor (since the
+				   // FEValues class is not copyable
+				   // by itself), and provide some
+				   // additional vector fields that
+				   // are used to hold intermediate
+				   // data during the computation of
+				   // local contributions.
+				   //
+				   // Let us start with the scratch
+				   // arrays and, specifically, the
+				   // one used for assembly of the
+				   // Stokes preconditioner:
+  namespace Assembly
+  {
+    namespace Scratch
+    {
+      template <int dim>
+      struct StokesPreconditioner
+      {
+	  StokesPreconditioner (const FiniteElement<dim> &stokes_fe,
+				const Quadrature<dim>    &stokes_quadrature,
+				const Mapping<dim>       &mapping,
+				const UpdateFlags         update_flags);
+
+	  StokesPreconditioner (const StokesPreconditioner &data);
+
+
+	  FEValues<dim>               stokes_fe_values;
+
+	  std::vector<Tensor<2,dim> > grad_phi_u;
+	  std::vector<double>         phi_p;
+      };
+
+      template <int dim>
+      StokesPreconditioner<dim>::
+      StokesPreconditioner (const FiniteElement<dim> &stokes_fe,
+			    const Quadrature<dim>    &stokes_quadrature,
+			    const Mapping<dim>       &mapping,
+			    const UpdateFlags         update_flags)
+		      :
+		      stokes_fe_values (mapping, stokes_fe, stokes_quadrature,
+					update_flags),
+		      grad_phi_u (stokes_fe.dofs_per_cell),
+		      phi_p (stokes_fe.dofs_per_cell)
+      {}
+
+
+
+      template <int dim>
+      StokesPreconditioner<dim>::
+      StokesPreconditioner (const StokesPreconditioner &scratch)
+		      :
+		      stokes_fe_values (scratch.stokes_fe_values.get_mapping(),
+					scratch.stokes_fe_values.get_fe(),
+					scratch.stokes_fe_values.get_quadrature(),
+					scratch.stokes_fe_values.get_update_flags()),
+		      grad_phi_u (scratch.grad_phi_u),
+		      phi_p (scratch.phi_p)
+      {}
+
+
+
+				       // The next one is the scratch object
+				       // used for the assembly of the full
+				       // Stokes system. Observe that we
+				       // derive the StokesSystem scratch
+				       // class from the StokesPreconditioner
+				       // class above. We do this because all the
+				       // objects that are necessary for the
+				       // assembly of the preconditioner are
+				       // also needed for the actual matrix
+				       // system and right hand side, plus
+				       // some extra data. This makes the
+				       // program more compact. Note also that
+				       // the assembly of the Stokes system
+				       // and the temperature right hand side
+				       // further down requires data from
+				       // temperature and velocity,
+				       // respectively, so we actually need
+				       // two FEValues objects for those two
+				       // cases.
+      template <int dim>
+      struct StokesSystem : public StokesPreconditioner<dim>
+      {
+	  StokesSystem (const FiniteElement<dim> &stokes_fe,
+			const Mapping<dim>       &mapping,
+			const Quadrature<dim>    &stokes_quadrature,
+			const UpdateFlags         stokes_update_flags,
+			const FiniteElement<dim> &temperature_fe,
+			const UpdateFlags         temperature_update_flags);
+
+	  StokesSystem (const StokesSystem<dim> &data);
+
+
+	  FEValues<dim>                        temperature_fe_values;
+
+	  std::vector<Tensor<1,dim> >          phi_u;
+	  std::vector<SymmetricTensor<2,dim> > grads_phi_u;
+	  std::vector<double>                  div_phi_u;
+
+	  std::vector<double>                  old_temperature_values;
+      };
+
+
+      template <int dim>
+      StokesSystem<dim>::
+      StokesSystem (const FiniteElement<dim> &stokes_fe,
+		    const Mapping<dim>       &mapping,
+		    const Quadrature<dim>    &stokes_quadrature,
+		    const UpdateFlags         stokes_update_flags,
+		    const FiniteElement<dim> &temperature_fe,
+		    const UpdateFlags         temperature_update_flags)
+		      :
+		      StokesPreconditioner<dim> (stokes_fe, stokes_quadrature,
+						 mapping,
+						 stokes_update_flags),
+		      temperature_fe_values (mapping, temperature_fe, stokes_quadrature,
+					     temperature_update_flags),
+		      phi_u (stokes_fe.dofs_per_cell),
+		      grads_phi_u (stokes_fe.dofs_per_cell),
+		      div_phi_u (stokes_fe.dofs_per_cell),
+		      old_temperature_values (stokes_quadrature.size())
+      {}
+
+
+      template <int dim>
+      StokesSystem<dim>::
+      StokesSystem (const StokesSystem<dim> &scratch)
+		      :
+		      StokesPreconditioner<dim> (scratch),
+		      temperature_fe_values (scratch.temperature_fe_values.get_mapping(),
+					     scratch.temperature_fe_values.get_fe(),
+					     scratch.temperature_fe_values.get_quadrature(),
+					     scratch.temperature_fe_values.get_update_flags()),
+		      phi_u (scratch.phi_u),
+		      grads_phi_u (scratch.grads_phi_u),
+		      div_phi_u (scratch.div_phi_u),
+		      old_temperature_values (scratch.old_temperature_values)
+      {}
+
+
+				       // After defining the objects used in
+				       // the assembly of the Stokes system,
+				       // we do the same for the assembly of
+				       // the matrices necessary for the
+				       // temperature system. The general
+				       // structure is very similar:
+      template <int dim>
+      struct TemperatureMatrix
+      {
+	  TemperatureMatrix (const FiniteElement<dim> &temperature_fe,
+			     const Mapping<dim>       &mapping,
+			     const Quadrature<dim>    &temperature_quadrature);
+
+	  TemperatureMatrix (const TemperatureMatrix &data);
+
+
+	  FEValues<dim>               temperature_fe_values;
+
+	  std::vector<double>         phi_T;
+	  std::vector<Tensor<1,dim> > grad_phi_T;
+      };
+
+
+      template <int dim>
+      TemperatureMatrix<dim>::
+      TemperatureMatrix (const FiniteElement<dim> &temperature_fe,
+			 const Mapping<dim>       &mapping,
+			 const Quadrature<dim>    &temperature_quadrature)
+		      :
+		      temperature_fe_values (mapping,
+					     temperature_fe, temperature_quadrature,
+					     update_values    | update_gradients |
+					     update_JxW_values),
+		      phi_T (temperature_fe.dofs_per_cell),
+		      grad_phi_T (temperature_fe.dofs_per_cell)
+      {}
+
+
+      template <int dim>
+      TemperatureMatrix<dim>::
+      TemperatureMatrix (const TemperatureMatrix &scratch)
+		      :
+		      temperature_fe_values (scratch.temperature_fe_values.get_mapping(),
+					     scratch.temperature_fe_values.get_fe(),
+					     scratch.temperature_fe_values.get_quadrature(),
+					     scratch.temperature_fe_values.get_update_flags()),
+		      phi_T (scratch.phi_T),
+		      grad_phi_T (scratch.grad_phi_T)
+      {}
+
+
+				       // The final scratch object is used in
+				       // the assembly of the right hand side
+				       // of the temperature system. This
+				       // object is significantly larger than
+				       // the ones above because a lot more
+				       // quantities enter the computation of
+				       // the right hand side of the
+				       // temperature equation. In particular,
+				       // the temperature values and gradients
+				       // of the previous two time steps need
+				       // to be evaluated at the quadrature
+				       // points, as well as the velocities
+				       // and the strain rates (i.e. the
+				       // symmetric gradients of the velocity)
+				       // that enter the right hand side as
+				       // friction heating terms. Despite the
+				       // number of terms, the following
+				       // should be rather self explanatory:
+      template <int dim>
+      struct TemperatureRHS
+      {
+	  TemperatureRHS (const FiniteElement<dim> &temperature_fe,
+			  const FiniteElement<dim> &stokes_fe,
+			  const Mapping<dim>       &mapping,
+			  const Quadrature<dim>    &quadrature);
+
+	  TemperatureRHS (const TemperatureRHS &data);
+
+
+	  FEValues<dim>                        temperature_fe_values;
+	  FEValues<dim>                        stokes_fe_values;
+
+	  std::vector<double>                  phi_T;
+	  std::vector<Tensor<1,dim> >          grad_phi_T;
+
+	  std::vector<Tensor<1,dim> >          old_velocity_values;
+	  std::vector<Tensor<1,dim> >          old_old_velocity_values;
+
+	  std::vector<SymmetricTensor<2,dim> > old_strain_rates;
+	  std::vector<SymmetricTensor<2,dim> > old_old_strain_rates;
+
+	  std::vector<double>                  old_temperature_values;
+	  std::vector<double>                  old_old_temperature_values;
+	  std::vector<Tensor<1,dim> >          old_temperature_grads;
+	  std::vector<Tensor<1,dim> >          old_old_temperature_grads;
+	  std::vector<double>                  old_temperature_laplacians;
+	  std::vector<double>                  old_old_temperature_laplacians;
+      };
+
+
+      template <int dim>
+      TemperatureRHS<dim>::
+      TemperatureRHS (const FiniteElement<dim> &temperature_fe,
+		      const FiniteElement<dim> &stokes_fe,
+		      const Mapping<dim>       &mapping,
+		      const Quadrature<dim>    &quadrature)
+		      :
+		      temperature_fe_values (mapping,
+					     temperature_fe, quadrature,
+					     update_values    |
+					     update_gradients |
+					     update_hessians  |
+					     update_quadrature_points |
+					     update_JxW_values),
+		      stokes_fe_values (mapping,
+					stokes_fe, quadrature,
+					update_values | update_gradients),
+		      phi_T (temperature_fe.dofs_per_cell),
+		      grad_phi_T (temperature_fe.dofs_per_cell),
+
+		      old_velocity_values (quadrature.size()),
+		      old_old_velocity_values (quadrature.size()),
+		      old_strain_rates (quadrature.size()),
+		      old_old_strain_rates (quadrature.size()),
+
+		      old_temperature_values (quadrature.size()),
+		      old_old_temperature_values(quadrature.size()),
+		      old_temperature_grads(quadrature.size()),
+		      old_old_temperature_grads(quadrature.size()),
+		      old_temperature_laplacians(quadrature.size()),
+		      old_old_temperature_laplacians(quadrature.size())
+      {}
+
+
+      template <int dim>
+      TemperatureRHS<dim>::
+      TemperatureRHS (const TemperatureRHS &scratch)
+		      :
+		      temperature_fe_values (scratch.temperature_fe_values.get_mapping(),
+					     scratch.temperature_fe_values.get_fe(),
+					     scratch.temperature_fe_values.get_quadrature(),
+					     scratch.temperature_fe_values.get_update_flags()),
+		      stokes_fe_values (scratch.stokes_fe_values.get_mapping(),
+					scratch.stokes_fe_values.get_fe(),
+					scratch.stokes_fe_values.get_quadrature(),
+					scratch.stokes_fe_values.get_update_flags()),
+		      phi_T (scratch.phi_T),
+		      grad_phi_T (scratch.grad_phi_T),
+
+		      old_velocity_values (scratch.old_velocity_values),
+		      old_old_velocity_values (scratch.old_old_velocity_values),
+		      old_strain_rates (scratch.old_strain_rates),
+		      old_old_strain_rates (scratch.old_old_strain_rates),
+
+		      old_temperature_values (scratch.old_temperature_values),
+		      old_old_temperature_values (scratch.old_old_temperature_values),
+		      old_temperature_grads (scratch.old_temperature_grads),
+		      old_old_temperature_grads (scratch.old_old_temperature_grads),
+		      old_temperature_laplacians (scratch.old_temperature_laplacians),
+		      old_old_temperature_laplacians (scratch.old_old_temperature_laplacians)
+      {}
+    }
+
+
+				     // The CopyData objects are even
+				     // simpler than the Scratch
+				     // objects as all they have to do
+				     // is to store the results of
+				     // local computations until they
+				     // can be copied into the global
+				     // matrix or vector
+				     // objects. These structures
+				     // therefore only need to provide
+				     // a constructor, a copy
+				     // operation, and some arrays for
+				     // local matrix, local vectors
+				     // and the relation between local
+				     // and global degrees of freedom
+				     // (a.k.a.
+				     // <code>local_dof_indices</code>). Again,
+				     // we have one such structure for
+				     // each of the four operations we
+				     // will parallelize using the
+				     // WorkStream class:
+    namespace CopyData
+    {
+      template <int dim>
+      struct StokesPreconditioner
+      {
+	  StokesPreconditioner (const FiniteElement<dim> &stokes_fe);
+	  StokesPreconditioner (const StokesPreconditioner &data);
+
+	  FullMatrix<double>          local_matrix;
+	  std::vector<unsigned int>   local_dof_indices;
+      };
+
+      template <int dim>
+      StokesPreconditioner<dim>::
+      StokesPreconditioner (const FiniteElement<dim> &stokes_fe)
+		      :
+		      local_matrix (stokes_fe.dofs_per_cell,
+				    stokes_fe.dofs_per_cell),
+		      local_dof_indices (stokes_fe.dofs_per_cell)
+      {}
+
+      template <int dim>
+      StokesPreconditioner<dim>::
+      StokesPreconditioner (const StokesPreconditioner &data)
+		      :
+		      local_matrix (data.local_matrix),
+		      local_dof_indices (data.local_dof_indices)
+      {}
+
+
+
+      template <int dim>
+      struct StokesSystem : public StokesPreconditioner<dim>
+      {
+	  StokesSystem (const FiniteElement<dim> &stokes_fe);
+	  StokesSystem (const StokesSystem<dim> &data);
+
+	  Vector<double> local_rhs;
+      };
+
+      template <int dim>
+      StokesSystem<dim>::
+      StokesSystem (const FiniteElement<dim> &stokes_fe)
+		      :
+		      StokesPreconditioner<dim> (stokes_fe),
+		      local_rhs (stokes_fe.dofs_per_cell)
+      {}
+
+      template <int dim>
+      StokesSystem<dim>::
+      StokesSystem (const StokesSystem<dim> &data)
+		      :
+		      StokesPreconditioner<dim> (data),
+		      local_rhs (data.local_rhs)
+      {}
+
+
+
+      template <int dim>
+      struct TemperatureMatrix
+      {
+	  TemperatureMatrix (const FiniteElement<dim> &temperature_fe);
+	  TemperatureMatrix (const TemperatureMatrix &data);
+
+	  FullMatrix<double>          local_mass_matrix;
+	  FullMatrix<double>          local_stiffness_matrix;
+	  std::vector<unsigned int>   local_dof_indices;
+      };
+
+      template <int dim>
+      TemperatureMatrix<dim>::
+      TemperatureMatrix (const FiniteElement<dim> &temperature_fe)
+		      :
+		      local_mass_matrix (temperature_fe.dofs_per_cell,
+					 temperature_fe.dofs_per_cell),
+		      local_stiffness_matrix (temperature_fe.dofs_per_cell,
+					      temperature_fe.dofs_per_cell),
+		      local_dof_indices (temperature_fe.dofs_per_cell)
+      {}
+
+      template <int dim>
+      TemperatureMatrix<dim>::
+      TemperatureMatrix (const TemperatureMatrix &data)
+		      :
+		      local_mass_matrix (data.local_mass_matrix),
+		      local_stiffness_matrix (data.local_stiffness_matrix),
+		      local_dof_indices (data.local_dof_indices)
+      {}
+
+
+
+      template <int dim>
+      struct TemperatureRHS
+      {
+	  TemperatureRHS (const FiniteElement<dim> &temperature_fe);
+	  TemperatureRHS (const TemperatureRHS &data);
+
+	  Vector<double>              local_rhs;
+	  std::vector<unsigned int>   local_dof_indices;
+	  FullMatrix<double>          matrix_for_bc;
+      };
+
+      template <int dim>
+      TemperatureRHS<dim>::
+      TemperatureRHS (const FiniteElement<dim> &temperature_fe)
+		      :
+		      local_rhs (temperature_fe.dofs_per_cell),
+		      local_dof_indices (temperature_fe.dofs_per_cell),
+		      matrix_for_bc (temperature_fe.dofs_per_cell,
+				     temperature_fe.dofs_per_cell)
+      {}
+
+      template <int dim>
+      TemperatureRHS<dim>::
+      TemperatureRHS (const TemperatureRHS &data)
+		      :
+		      local_rhs (data.local_rhs),
+		      local_dof_indices (data.local_dof_indices),
+		      matrix_for_bc (data.matrix_for_bc)
+      {}
+    }
+  }
+
+
+
+				   // @sect3{The <code>BoussinesqFlowProblem</code> class template}
+				   //
+				   // This is the declaration of the
+				   // main class. It is very similar
+				   // to step-31 but there are a
+				   // number differences we will
+				   // comment on below.
+				   //
+				   // The top of the class is
+				   // essentially the same as in
+				   // step-31, listing the public
+				   // methods and a set of private
+				   // functions that do the heavy
+				   // lifting. Compared to step-31
+				   // there are only two additions to
+				   // this section: the function
+				   // <code>get_cfl_number()</code>
+				   // that computes the maximum CFL
+				   // number over all cells, from which
+				   // we then compute the global
+				   // time step, and the function
+				   // <code>get_entropy_variation()</code>
+				   // that is used in the computation
+				   // of the entropy stabilization. It
+				   // is akin to the
+				   // <code>get_extrapolated_temperature_range()</code>
+				   // we have used in step-31 for this
+				   // purpose, but works on the
+				   // entropy instead of the
+				   // temperature.
+  template <int dim>
+  class BoussinesqFlowProblem
+  {
+    public:
+      struct Parameters;
+      BoussinesqFlowProblem (Parameters &parameters);
+      void run ();
+
+      int                                       m_myrank;     // for debugging
+
+    private:
+      void setup_dofs ();
+      void assemble_stokes_preconditioner ();
+      void build_stokes_preconditioner ();
+      void assemble_stokes_system ();
+      void assemble_temperature_matrix ();
+      void assemble_temperature_system (const double maximal_velocity);
+      void project_temperature_field ();
+      double get_maximal_velocity () const;
+      double get_cfl_number () const;
+      double get_entropy_variation (const double average_temperature) const;
+      std::pair<double,double> get_extrapolated_temperature_range () const;
+      void solve ();
+      void output_results ();
+      void refine_mesh (const unsigned int max_grid_level);
+
+      double
+      compute_viscosity(const std::vector<double>          &old_temperature,
+			const std::vector<double>          &old_old_temperature,
+			const std::vector<Tensor<1,dim> >  &old_temperature_grads,
+			const std::vector<Tensor<1,dim> >  &old_old_temperature_grads,
+			const std::vector<double>          &old_temperature_laplacians,
+			const std::vector<double>          &old_old_temperature_laplacians,
+			const std::vector<Tensor<1,dim> >  &old_velocity_values,
+			const std::vector<Tensor<1,dim> >  &old_old_velocity_values,
+			const std::vector<SymmetricTensor<2,dim> >  &old_strain_rates,
+			const std::vector<SymmetricTensor<2,dim> >  &old_old_strain_rates,
+			const double                        global_u_infty,
+			const double                        global_T_variation,
+			const double                        average_temperature,
+			const double                        global_entropy_variation,
+			const double                        cell_diameter) const;
+
+    public:
+
+				       // The first significant new
+				       // component is the definition
+				       // of a struct for the
+				       // parameters according to the
+				       // discussion in the
+				       // introduction. This structure
+				       // is initialized by reading
+				       // from a parameter file during
+				       // construction of this object.
+      struct Parameters
+      {
+	  Parameters (const std::string &parameter_filename);
+
+	  static void declare_parameters (ParameterHandler &prm);
+	  void parse_parameters (ParameterHandler &prm);
+
+	  double       end_time;
+
+	  unsigned int initial_global_refinement;
+	  unsigned int initial_adaptive_refinement;
+
+	  bool         generate_graphical_output;
+	  unsigned int graphical_output_interval;
+
+	  unsigned int adaptive_refinement_interval;
+
+	  double       stabilization_alpha;
+	  double       stabilization_c_R;
+	  double       stabilization_beta;
+
+	  unsigned int stokes_velocity_degree;
+	  bool         use_locally_conservative_discretization;
+
+	  unsigned int temperature_degree;
+      };
+
+    private:
+      Parameters                               &parameters;
+
+				       // The <code>pcout</code> (for
+				       // <i>%parallel
+				       // <code>std::cout</code></i>)
+				       // object is used to simplify
+				       // writing output: each MPI
+				       // process can use this to
+				       // generate output as usual,
+				       // but since each of these
+				       // processes will (hopefully)
+				       // produce the same output it
+				       // will just be replicated many
+				       // times over; with the
+				       // ConditionalOStream class,
+				       // only the output generated by
+				       // one MPI process will
+				       // actually be printed to
+				       // screen, whereas the output
+				       // by all the other processes
+				       // will simply be forgotten.
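+				       // A statement such as
+				       // @code
+				       //   pcout << "Time step " << timestep_number
+				       //         << std::endl;
+				       // @endcode
+				       // therefore produces a single copy
+				       // of the output, from process zero.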
+      ConditionalOStream                        pcout;
+
+				       // The following member
+				       // variables will then again be
+				       // similar to those in step-31
+				       // (and to other tutorial
+				       // programs). As mentioned in
+				       // the introduction, we fully
+				       // distribute computations, so
+				       // we will have to use the
+				       // parallel::distributed::Triangulation
+				       // class (see step-40) but the
+				       // remainder of these variables
+				       // is rather standard with two
+				       // exceptions:
+				       //
+				       // - The <code>mapping</code>
+				       // variable is used to denote a
+				       // higher-order polynomial
+				       // mapping. As mentioned in the
+				       // introduction, we use this
+				       // mapping when forming
+				       // integrals through quadrature
+				       // for all cells that are
+				       // adjacent to either the inner
+				       // or outer boundaries of our
+				       // domain where the boundary is
+				       // curved.
+				       //
+				       // - In a bit of naming
+				       // confusion, you will notice
+				       // below that some of the
+				       // variables from namespace
+				       // PETScWrappers are taken
+				       // from namespace
+				       // PETScWrappers::MPI (such
+				       // as the right hand side
+				       // vectors) whereas others are
+				       // not (such as the various
+				       // matrices). For the matrices,
+				       // we happen to use the same
+				       // class names for %parallel
+				       // and sequential data
+				       // structures, i.e., all
+				       // matrices will actually be
+				       // considered %parallel
+				       // below. On the other hand,
+				       // for vectors, only those from
+				       // namespace
+				       // PETScWrappers::MPI are
+				       // actually distributed. In
+				       // particular, we will
+				       // frequently have to query
+				       // velocities and temperatures
+				       // at arbitrary quadrature
+				       // points; consequently, rather
+				       // than importing ghost
+				       // information of a vector
+				       // whenever we need access to
+				       // degrees of freedom that are
+				       // relevant locally but owned
+				       // by another processor, we
+				       // solve linear systems in
+				       // %parallel but then
+				       // immediately initialize a
+				       // vector including ghost
+				       // entries of the solution for
+				       // further processing. The
+				       // various
+				       // <code>*_solution</code>
+				       // vectors are therefore filled
+				       // immediately after solving
+				       // their respective linear
+				       // system in %parallel and will
+				       // always contain values for
+				       // all @ref
+				       // GlossLocallyRelevantDof
+				       // "locally relevant degrees of freedom";
+				       // the fully
+				       // distributed vectors that we
+				       // obtain from the solution
+				       // process and that only ever
+				       // contain the @ref
+				       // GlossLocallyOwnedDof
+				       // "locally owned degrees of freedom"
+				       // are destroyed
+				       // immediately after the
+				       // solution process and after
+				       // we have copied the relevant
+				       // values into the member
+				       // variable vectors.
+      parallel::distributed::Triangulation<dim> triangulation;
+      double                                    global_Omega_diameter;
+
+      const MappingQ<dim>                       mapping;
+
+      const FESystem<dim>                       stokes_fe;
+      DoFHandler<dim>                           stokes_dof_handler;
+      ConstraintMatrix                          stokes_constraints;
+
+      PETScWrappers::MPI::BlockSparseMatrix       stokes_matrix;
+      PETScWrappers::MPI::BlockSparseMatrix       stokes_preconditioner_matrix;
+
+      PETScWrappers::MPI::BlockVector        stokes_solution;
+      PETScWrappers::MPI::BlockVector        old_stokes_solution;
+      PETScWrappers::MPI::BlockVector        stokes_rhs;
+
+
+      FE_Q<dim>                                 temperature_fe;
+      DoFHandler<dim>                           temperature_dof_handler;
+      ConstraintMatrix                          temperature_constraints;
+
+      PETScWrappers::MPI::SparseMatrix       temperature_mass_matrix;
+      PETScWrappers::MPI::SparseMatrix       temperature_stiffness_matrix;
+      PETScWrappers::MPI::SparseMatrix       temperature_matrix;
+
+      PETScWrappers::MPI::Vector             temperature_solution;
+      PETScWrappers::MPI::Vector             old_temperature_solution;
+      PETScWrappers::MPI::Vector             old_old_temperature_solution;
+      PETScWrappers::MPI::Vector             temperature_rhs;
+
+
+      double                                    time_step;
+      double                                    old_time_step;
+      unsigned int                              timestep_number;
+
+      std_cxx1x::shared_ptr<PETScWrappers::PreconditionBoomerAMG>    Amg_preconditioner;
+      std_cxx1x::shared_ptr<PETScWrappers::PreconditionJacobi> Mp_preconditioner;
+      std_cxx1x::shared_ptr<PETScWrappers::PreconditionJacobi> T_preconditioner;
+
+      bool                                      rebuild_stokes_matrix;
+      bool                                      rebuild_stokes_preconditioner;
+      bool                                      rebuild_temperature_matrices;
+      bool                                      rebuild_temperature_preconditioner;
+
+				       // The next member variable,
+				       // <code>computing_timer</code>
+				       // is used to conveniently
+				       // account for compute time
+				       // spent in certain "sections"
+				       // of the code that are
+				       // repeatedly entered. For
+				       // example, we will enter (and
+				       // leave) sections for Stokes
+				       // matrix assembly and would
+				       // like to accumulate the run
+				       // time spent in this section
+				       // over all time steps. Every
+				       // so many time steps as well
+				       // as at the end of the program
+				       // (through the destructor of
+				       // the TimerOutput class) we
+				       // will then produce a nice
+				       // summary of the times spent
+				       // in the different sections
+				       // into which we categorize the
+				       // run-time of this program.
+      TimerOutput                               computing_timer;
+
+				       // After these member variables
+				       // we have a number of
+				       // auxiliary functions that
+				       // have been broken out of the
+				       // ones listed
+				       // above. Specifically, there
+				       // are first three functions
+				       // that we call from
+				       // <code>setup_dofs</code> and
+				       // then the ones that do the
+				       // assembling of linear
+				       // systems:
+      void setup_stokes_matrix (const std::vector<IndexSet> &stokes_partitioning);
+      void setup_stokes_preconditioner (const std::vector<IndexSet> &stokes_partitioning);
+      void setup_temperature_matrices (const IndexSet &temperature_partitioning);
+
+
+				       // Following the @ref
+				       // MTWorkStream
+				       // "task-based parallelization"
+				       // paradigm,
+				       // we split all the assembly
+				       // routines into two parts: a
+				       // first part that can do all
+				       // the calculations on a
+				       // certain cell without taking
+				       // care of other threads, and a
+				       // second part (which is
+				       // writing the local data into
+				       // the global matrices and
+				       // vectors) which can be
+				       // entered by only one thread
+				       // at a time. In order to
+				       // implement that, we provide
+				       // functions for each of those
+				       // two steps for all the four
+				       // assembly routines that we
+				       // use in this program. The
+				       // following eight functions do
+				       // exactly this:
+      void
+      local_assemble_stokes_preconditioner (const typename DoFHandler<dim>::active_cell_iterator &cell,
+					    Assembly::Scratch::StokesPreconditioner<dim> &scratch,
+					    Assembly::CopyData::StokesPreconditioner<dim> &data);
+
+      void
+      copy_local_to_global_stokes_preconditioner (const Assembly::CopyData::StokesPreconditioner<dim> &data);
+
+
+      void
+      local_assemble_stokes_system (const typename DoFHandler<dim>::active_cell_iterator &cell,
+				    Assembly::Scratch::StokesSystem<dim>  &scratch,
+				    Assembly::CopyData::StokesSystem<dim> &data);
+
+      void
+      copy_local_to_global_stokes_system (const Assembly::CopyData::StokesSystem<dim> &data);
+
+
+      void
+      local_assemble_temperature_matrix (const typename DoFHandler<dim>::active_cell_iterator &cell,
+					 Assembly::Scratch::TemperatureMatrix<dim>  &scratch,
+					 Assembly::CopyData::TemperatureMatrix<dim> &data);
+
+      void
+      copy_local_to_global_temperature_matrix (const Assembly::CopyData::TemperatureMatrix<dim> &data);
+
+
+
+      void
+      local_assemble_temperature_rhs (const std::pair<double,double> global_T_range,
+				      const double                   global_max_velocity,
+				      const double                   global_entropy_variation,
+				      const typename DoFHandler<dim>::active_cell_iterator &cell,
+				      Assembly::Scratch::TemperatureRHS<dim> &scratch,
+				      Assembly::CopyData::TemperatureRHS<dim> &data);
+
+      void
+      copy_local_to_global_temperature_rhs (const Assembly::CopyData::TemperatureRHS<dim> &data);
+
+				       // Finally, we forward declare
+				       // a member class that we will
+				       // define later on and that
+				       // will be used to compute a
+				       // number of quantities from
+				       // our solution vectors that
+				       // we'd like to put into the
+				       // output files for
+				       // visualization.
+      class Postprocessor;
+  };
+
+
+				   // @sect3{BoussinesqFlowProblem class implementation}
+
+				   // @sect4{BoussinesqFlowProblem::Parameters}
+				   //
+				   // Here comes the definition of the
+				   // parameters for the Stokes
+				   // problem. We allow setting the end
+				   // time for the simulation, the
+				   // level of refinements (both
+				   // global and adaptive, which in
+				   // the sum specify what maximum
+				   // level the cells are allowed to
+				   // have), and the interval between
+				   // refinements in the time
+				   // stepping.
+				   //
+				   // Then, we let the user specify
+				   // constants for the stabilization
+				   // parameters (as discussed in the
+				   // introduction), the polynomial
+				   // degree for the Stokes velocity
+				   // space, whether to use the
+				   // locally conservative
+				   // discretization based on FE_DGP
+				   // elements for the pressure or not
+				   // (FE_Q elements for pressure),
+				   // and the polynomial degree for
+				   // the temperature interpolation.
+				   //
+				   // The constructor checks for a
+				   // valid input file (if not, a file
+				   // with default parameters for the
+				   // quantities is written), and
+				   // eventually parses the
+				   // parameters.
+  template <int dim>
+  BoussinesqFlowProblem<dim>::Parameters::Parameters (const std::string &parameter_filename)
+		  :
+		  end_time (1e8),
+		  initial_global_refinement (2),
+		  initial_adaptive_refinement (2),
+		  adaptive_refinement_interval (10),
+		  stabilization_alpha (2),
+		  stabilization_c_R (0.11),
+		  stabilization_beta (0.078),
+		  stokes_velocity_degree (2),
+		  use_locally_conservative_discretization (true),
+		  temperature_degree (2)
+  {
+    ParameterHandler prm;
+    BoussinesqFlowProblem<dim>::Parameters::declare_parameters (prm);
+
+    std::ifstream parameter_file (parameter_filename.c_str());
+
+    if (!parameter_file)
+      {
+	parameter_file.close ();
+
+	std::ostringstream message;
+	message << "Input parameter file <"
+		<< parameter_filename << "> not found. Creating a"
+		<< std::endl
+		<< "template file of the same name."
+		<< std::endl;
+
+	std::ofstream parameter_out (parameter_filename.c_str());
+	prm.print_parameters (parameter_out,
+			      ParameterHandler::Text);
+
+	AssertThrow (false, ExcMessage (message.str().c_str()));
+      }
+
+    const bool success = prm.read_input (parameter_file);
+    AssertThrow (success, ExcMessage ("Invalid input parameter file."));
+
+    parse_parameters (prm);
+  }
+
+
+
+				   // Next we have a function that
+				   // declares the parameters that we
+				   // expect in the input file,
+				   // together with their data types,
+				   // default values and a
+				   // description:
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::Parameters::
+  declare_parameters (ParameterHandler &prm)
+  {
+    prm.declare_entry ("End time", "1e8",
+		       Patterns::Double (0),
+		       "The end time of the simulation in years.");
+    prm.declare_entry ("Initial global refinement", "2",
+		       Patterns::Integer (0),
+		       "The number of global refinement steps performed on "
+		       "the initial coarse mesh, before the problem is first "
+		       "solved there.");
+    prm.declare_entry ("Initial adaptive refinement", "2",
+		       Patterns::Integer (0),
+		       "The number of adaptive refinement steps performed after "
+		       "initial global refinement.");
+    prm.declare_entry ("Time steps between mesh refinement", "10",
+		       Patterns::Integer (1),
+		       "The number of time steps after which the mesh is to be "
+		       "adapted based on computed error indicators.");
+    prm.declare_entry ("Generate graphical output", "false",
+		       Patterns::Bool (),
+		       "Whether graphical output is to be generated or not. "
+		       "You may not want to get graphical output if the number "
+		       "of processors is large.");
+    prm.declare_entry ("Time steps between graphical output", "50",
+		       Patterns::Integer (1),
+		       "The number of time steps between each generation of "
+		       "graphical output files.");
+
+    prm.enter_subsection ("Stabilization parameters");
+    {
+      prm.declare_entry ("alpha", "2",
+			 Patterns::Double (1, 2),
+			 "The exponent in the entropy viscosity stabilization.");
+      prm.declare_entry ("c_R", "0.11",
+			 Patterns::Double (0),
+			 "The c_R factor in the entropy viscosity "
+			 "stabilization.");
+      prm.declare_entry ("beta", "0.078",
+			 Patterns::Double (0),
+			 "The beta factor in the artificial viscosity "
+			 "stabilization. An appropriate value for 2d is 0.052 "
+			 "and 0.078 for 3d.");
+    }
+    prm.leave_subsection ();
+
+    prm.enter_subsection ("Discretization");
+    {
+      prm.declare_entry ("Stokes velocity polynomial degree", "2",
+			 Patterns::Integer (1),
+			 "The polynomial degree to use for the velocity variables "
+			 "in the Stokes system.");
+      prm.declare_entry ("Temperature polynomial degree", "2",
+			 Patterns::Integer (1),
+			 "The polynomial degree to use for the temperature variable.");
+      prm.declare_entry ("Use locally conservative discretization", "true",
+			 Patterns::Bool (),
+			 "Whether to use a Stokes discretization that is locally "
+			 "conservative at the expense of a larger number of degrees "
+			 "of freedom, or to go with a cheaper discretization "
+			 "that does not locally conserve mass (although it is "
+			 "globally conservative.");
+    }
+    prm.leave_subsection ();
+  }
+
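+
+				   // A parameter file matching these
+				   // declarations would look, in part,
+				   // like the following sketch:
+				   // @code
+				   //   set End time                  = 1e8
+				   //   set Initial global refinement = 2
+				   //
+				   //   subsection Stabilization parameters
+				   //     set alpha = 2
+				   //     set beta  = 0.078
+				   //     set c_R   = 0.11
+				   //   end
+				   // @endcode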
+
+
+				   // And then we need a function that
+				   // reads the contents of the
+				   // ParameterHandler object we get
+				   // by reading the input file and
+				   // puts the results into variables
+				   // that store the values of the
+				   // parameters we have previously
+				   // declared:
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::Parameters::
+  parse_parameters (ParameterHandler &prm)
+  {
+    end_time                    = prm.get_double ("End time");
+    initial_global_refinement   = prm.get_integer ("Initial global refinement");
+    initial_adaptive_refinement = prm.get_integer ("Initial adaptive refinement");
+
+    adaptive_refinement_interval= prm.get_integer ("Time steps between mesh refinement");
+
+    generate_graphical_output   = prm.get_bool ("Generate graphical output");
+    graphical_output_interval   = prm.get_integer ("Time steps between graphical output");
+
+    prm.enter_subsection ("Stabilization parameters");
+    {
+      stabilization_alpha = prm.get_double ("alpha");
+      stabilization_c_R   = prm.get_double ("c_R");
+      stabilization_beta  = prm.get_double ("beta");
+    }
+    prm.leave_subsection ();
+
+    prm.enter_subsection ("Discretization");
+    {
+      stokes_velocity_degree = prm.get_integer ("Stokes velocity polynomial degree");
+      temperature_degree     = prm.get_integer ("Temperature polynomial degree");
+      use_locally_conservative_discretization
+	= prm.get_bool ("Use locally conservative discretization");
+    }
+    prm.leave_subsection ();
+  }
+
+
+
+
+				   // @sect4{BoussinesqFlowProblem::BoussinesqFlowProblem}
+				   //
+				   // The constructor of the problem
+				   // is very similar to the
+				   // constructor in step-31. What is
+				   // different is the %parallel
+				   // communication: Trilinos uses a
+				   // message passing interface (MPI)
+				   // for data distribution. When
+				   // entering the
+				   // BoussinesqFlowProblem class, we
+				   // have to decide how the
+				   // parallelization is to be done. We
+				   // choose a rather simple strategy
+				   // and let all processors that are
+				   // running the program work
+				   // together, specified by the
+				   // communicator
+				   // <code>MPI_COMM_WORLD</code>. Next,
+				   // we create the output stream (as
+				   // we already did in step-18) that
+				   // only generates output on the
+				   // first MPI process and is
+				   // completely forgetful on all
+				   // others. The implementation of
+				   // this idea is to check the
+				   // process number when
+				   // <code>pcout</code> gets a true
+				   // argument, and it uses the
+				   // <code>std::cout</code> stream
+				   // for output. If we are
+				   // processor five, for instance,
+				   // then we will give a
+				   // <code>false</code> argument to
+				   // <code>pcout</code>, which means
+				   // that the output of that
+				   // processor will not be
+				   // printed. With the exception of
+				   // the mapping object (for which we
+				   // use polynomials of degree 4) all
+				   // but the final member variable
+				   // are exactly the same as in
+				   // step-31.
+				   //
+				   // This final object, the
+				   // TimerOutput object, is then told
+				   // to restrict output to the
+				   // <code>pcout</code> stream
+				   // (processor 0), and then we
+				   // specify that we want to get a
+				   // summary table at the end of the
+				   // program which shows us wallclock
+				   // times (as opposed to CPU
+				   // times). We will manually also
+				   // request intermediate summaries
+				   // every so many time steps in the
+				   // <code>run()</code> function
+				   // below.
+  template <int dim>
+  BoussinesqFlowProblem<dim>::BoussinesqFlowProblem (Parameters &parameters_)
+		  :
+		  parameters (parameters_),
+		  pcout (std::cout,
+			 (Utilities::MPI::this_mpi_process(MPI_COMM_WORLD)
+			  == 0)),
+
+		  triangulation (MPI_COMM_WORLD,
+				 typename Triangulation<dim>::MeshSmoothing
+				 (Triangulation<dim>::smoothing_on_refinement |
+				  Triangulation<dim>::smoothing_on_coarsening)),
+
+		  mapping (4),
+
+		  stokes_fe (FE_Q<dim>(parameters.stokes_velocity_degree),
+			     dim,
+			     (parameters.use_locally_conservative_discretization
+			      ?
+			      static_cast<const FiniteElement<dim> &>
+			      (FE_DGP<dim>(parameters.stokes_velocity_degree-1))
+			      :
+			      static_cast<const FiniteElement<dim> &>
+			      (FE_Q<dim>(parameters.stokes_velocity_degree-1))),
+			     1),
+
+		  stokes_dof_handler (triangulation),
+
+		  temperature_fe (parameters.temperature_degree),
+		  temperature_dof_handler (triangulation),
+
+		  time_step (0),
+		  old_time_step (0),
+		  timestep_number (0),
+		  rebuild_stokes_matrix (true),
+		  rebuild_stokes_preconditioner (true),
+		  rebuild_temperature_matrices (true),
+		  rebuild_temperature_preconditioner (true),
+
+		  computing_timer (pcout,
+				   TimerOutput::summary,
+				   TimerOutput::wall_times)
+  {}
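+
+				   // To briefly illustrate the
+				   // <code>pcout</code> object just
+				   // initialized: it is used exactly
+				   // like <code>std::cout</code>,
+				   // but a (hypothetical) status
+				   // message such as
+				   // @code
+				   //   pcout << "Rebuilding Stokes matrix..." << std::endl;
+				   // @endcode
+				   // appears only once on the
+				   // terminal no matter how many MPI
+				   // processes execute this
+				   // statement; on all processes
+				   // other than the first, the text
+				   // is silently discarded.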
+
+
+
+				   // @sect4{The BoussinesqFlowProblem helper functions}
+				   // @sect5{BoussinesqFlowProblem::get_maximal_velocity}
+				   // Except for two small details,
+				   // the function to compute the
+				   // global maximum of the velocity
+				   // is the same as in step-31. The
+				   // first detail is actually common
+				   // to all functions that implement
+				   // loops over all cells in the
+				   // triangulation: When operating in
+				   // %parallel, each processor can
+				   // only work on a chunk of cells
+				   // since each processor only has a
+				   // certain part of the entire
+				   // triangulation. This chunk of
+				   // cells that we want to work on is
+				   // identified via a so-called
+				   // <code>subdomain_id</code>, as we
+				   // also did in step-18. All we need
+				   // to change is hence to perform
+				   // the cell-related operations only
+				   // on cells that are owned by the
+				   // current process (as opposed to
+				   // ghost or artificial cells),
+				   // i.e. for which the subdomain id
+				   // equals the number of the process
+				   // equals the rank of the current
+				   // process. Since this is a commonly
+				   // shortcut for this operation: we
+				   // can ask whether the cell is
+				   // owned by the current processor
+				   // using
+				   // <code>cell-@>is_locally_owned()</code>.
+				   //
+				   // The second difference is the way
+				   // we calculate the maximum
+				   // value. Before, we could simply
+				   // have a <code>double</code>
+				   // variable that we checked against
+				   // on each quadrature point for
+				   // each cell. Now, we have to be a
+				   // bit more careful since each
+				   // processor only operates on a
+				   // subset of cells. What we do is
+				   // to first let each processor
+				   // calculate the maximum among its
+				   // cells, and then do a global
+				   // communication operation
+				   // <code>Utilities::MPI::max</code>
+				   // that computes the maximum value
+				   // among all the maximum values of
+				   // the individual processors. MPI
+				   // provides such a call, but it's
+				   // even simpler to use the
+				   // respective function in namespace
+				   // Utilities::MPI using the MPI
+				   // communicator object since that
+				   // will do the right thing even if
+				   // we work without MPI and on a
+				   // single machine only. The call to
+				   // <code>Utilities::MPI::max</code>
+				   // needs two arguments, namely the
+				   // local maximum (input) and the
+				   // MPI communicator, which is
+				   // MPI_COMM_WORLD in this example.
+  template <int dim>
+  double BoussinesqFlowProblem<dim>::get_maximal_velocity () const
+  {
+    const QIterated<dim> quadrature_formula (QTrapez<1>(),
+					     parameters.stokes_velocity_degree);
+    const unsigned int n_q_points = quadrature_formula.size();
+
+    FEValues<dim> fe_values (mapping, stokes_fe, quadrature_formula, update_values);
+    std::vector<Tensor<1,dim> > velocity_values(n_q_points);
+
+    const FEValuesExtractors::Vector velocities (0);
+
+    double max_local_velocity = 0;
+
+    typename DoFHandler<dim>::active_cell_iterator
+      cell = stokes_dof_handler.begin_active(),
+      endc = stokes_dof_handler.end();
+    for (; cell!=endc; ++cell)
+      if (cell->is_locally_owned())
+	{
+	  fe_values.reinit (cell);
+	  fe_values[velocities].get_function_values (stokes_solution,
+						     velocity_values);
+
+	  for (unsigned int q=0; q<n_q_points; ++q)
+	    max_local_velocity = std::max (max_local_velocity,
+					   velocity_values[q].norm());
+	}
+
+    return Utilities::MPI::max (max_local_velocity, MPI_COMM_WORLD);
+  }
+
+
+				   // @sect5{BoussinesqFlowProblem::get_cfl_number}
+				   // The next function does something
+				   // similar, but we now compute the
+				   // CFL number, i.e., maximal
+				   // velocity on a cell divided by
+				   // the cell diameter. This number
+				   // is necessary to determine the
+				   // time step size, as we use a
+				   // semi-explicit time stepping
+				   // scheme for the temperature
+				   // equation (see step-31 for a
+				   // discussion). We compute it in
+				   // the same way as above: Compute
+				   // the local maximum over all
+				   // locally owned cells, then
+				   // exchange it via MPI to find the
+				   // global maximum.
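+				   //
+				   // In formulas, the loop below
+				   // computes
+				   // $\mathrm{CFL} = \max_K
+				   // \|\mathbf{u}\|_{\infty,K}/h_K$,
+				   // where $h_K$ denotes the
+				   // diameter of cell $K$; the time
+				   // step size is later chosen
+				   // proportional to the inverse of
+				   // this number.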
+  template <int dim>
+  double BoussinesqFlowProblem<dim>::get_cfl_number () const
+  {
+    const QIterated<dim> quadrature_formula (QTrapez<1>(),
+					     parameters.stokes_velocity_degree);
+    const unsigned int n_q_points = quadrature_formula.size();
+
+    FEValues<dim> fe_values (mapping, stokes_fe, quadrature_formula, update_values);
+    std::vector<Tensor<1,dim> > velocity_values(n_q_points);
+
+    const FEValuesExtractors::Vector velocities (0);
+
+    double max_local_cfl = 0;
+
+    typename DoFHandler<dim>::active_cell_iterator
+      cell = stokes_dof_handler.begin_active(),
+      endc = stokes_dof_handler.end();
+    for (; cell!=endc; ++cell)
+      if (cell->is_locally_owned())
+	{
+	  fe_values.reinit (cell);
+	  fe_values[velocities].get_function_values (stokes_solution,
+						     velocity_values);
+
+	  double max_local_velocity = 1e-10;
+	  for (unsigned int q=0; q<n_q_points; ++q)
+	    max_local_velocity = std::max (max_local_velocity,
+					   velocity_values[q].norm());
+	  max_local_cfl = std::max(max_local_cfl,
+				   max_local_velocity / cell->diameter());
+	}
+
+    return Utilities::MPI::max (max_local_cfl, MPI_COMM_WORLD);
+  }
+
+
+				   // @sect5{BoussinesqFlowProblem::get_entropy_variation}
+				   // Next comes the computation of
+				   // the global entropy variation
+				   // $\|E(T)-\bar{E}(T)\|_\infty$
+				   // where the entropy $E$ is defined
+				   // as discussed in the
+				   // introduction.  This is needed for
+				   // the evaluation of the
+				   // stabilization in the temperature
+				   // equation as explained in the
+				   // introduction. The entropy
+				   // variation is actually only
+				   // needed if we use $\alpha=2$ as a
+				   // power in the residual
+				   // computation. The infinity norm
+				   // is computed by the maxima over
+				   // quadrature points, as usual in
+				   // discrete computations.
+				   //
+				   // In order to compute this quantity, we
+				   // first have to find the space-average
+				   // $\bar{E}(T)$ and then evaluate the
+				   // maximum. However, that means that we
+				   // would need to perform two loops. We can
+				   // avoid the overhead by noting that
+				   // $\|E(T)-\bar{E}(T)\|_\infty =
+				   // \max\big(E_{\textrm{max}}(T)-\bar{E}(T),
+				   // \bar{E}(T)-E_{\textrm{min}}(T)\big)$, i.e., the
+				   // maximum out of the deviation from the
+				   // average entropy in positive and negative
+				   // directions. The four quantities we need
+				   // for the latter formula (maximum entropy,
+				   // minimum entropy, average entropy, area)
+				   // can all be evaluated in the same loop
+				   // over all cells, so we choose this
+				   // simpler variant.
+  template <int dim>
+  double
+  BoussinesqFlowProblem<dim>::get_entropy_variation (const double average_temperature) const
+  {
+    if (parameters.stabilization_alpha != 2)
+      return 1.;
+
+    const QGauss<dim> quadrature_formula (parameters.temperature_degree+1);
+    const unsigned int n_q_points = quadrature_formula.size();
+
+    FEValues<dim> fe_values (temperature_fe, quadrature_formula,
+			     update_values | update_JxW_values);
+    std::vector<double> old_temperature_values(n_q_points);
+    std::vector<double> old_old_temperature_values(n_q_points);
+
+				     // In the two functions above we
+				     // computed the maximum of
+				     // numbers that were all
+				     // non-negative, so we knew that
+				     // zero was certainly a lower
+				     // bound. On the other hand, here
+				     // we need to find the maximum
+				     // deviation from the average
+				     // value, i.e., we will need to
+				     // know the maximal and minimal
+				     // values of the entropy for
+				     // which we don't a priori know
+				     // the sign.
+				     //
+				     // To compute it, we can
+				     // therefore start with the
+				     // largest and smallest possible
+				     // values we can store in a
+				     // double precision number: The
+				     // minimum is initialized with a
+				     // bigger and the maximum with a
+				     // smaller number than any one
+				     // that is going to appear. We
+				     // are then guaranteed that these
+				     // numbers will be overwritten in
+				     // the loop on the first cell or,
+				     // if this processor does not own
+				     // any cells, in the
+				     // communication step at the
+				     // latest. The following loop
+				     // then computes the minimum and
+				     // maximum local entropy as well
+				     // as keeps track of the
+				     // area/volume of the part of the
+				     // domain we locally own and the
+				     // integral over the entropy on
+				     // it:
+    double min_entropy = std::numeric_limits<double>::max(),
+	   max_entropy = -std::numeric_limits<double>::max(),
+		  area = 0,
+    entropy_integrated = 0;
+
+    typename DoFHandler<dim>::active_cell_iterator
+      cell = temperature_dof_handler.begin_active(),
+      endc = temperature_dof_handler.end();
+    for (; cell!=endc; ++cell)
+      if (cell->is_locally_owned())
+	{
+	  fe_values.reinit (cell);
+	  fe_values.get_function_values (old_temperature_solution,
+					 old_temperature_values);
+	  fe_values.get_function_values (old_old_temperature_solution,
+					 old_old_temperature_values);
+	  for (unsigned int q=0; q<n_q_points; ++q)
+	    {
+	      const double T = (old_temperature_values[q] +
+				old_old_temperature_values[q]) / 2;
+	      const double entropy = ((T-average_temperature) *
+				      (T-average_temperature));
+
+	      min_entropy = std::min (min_entropy, entropy);
+	      max_entropy = std::max (max_entropy, entropy);
+	      area += fe_values.JxW(q);
+	      entropy_integrated += fe_values.JxW(q) * entropy;
+	    }
+	}
+
+				     // Now we only need to exchange
+				     // data between processors: we
+				     // need to sum the two integrals
+				     // (<code>area</code>,
+				     // <code>entropy_integrated</code>),
+				     // and get the extrema for
+				     // maximum and minimum. We could
+				     // do this through four different
+				     // data exchanges, but we can do it
+				     // with two: Utilities::MPI::sum
+				     // also exists in a variant that
+				     // takes an array of values that
+				     // are all to be summed up. And
+				     // we can also utilize the
+				     // Utilities::MPI::max function
+				     // by realizing that forming the
+				     // minimum over the minimal
+				     // entropies equals forming the
+				     // negative of the maximum over
+				     // the negative of the minimal
+				     // entropies; this maximum can
+				     // then be combined with forming
+				     // the maximum over the maximal
+				     // entropies.
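+				     //
+				     // In formula form, the trick is
+				     // simply that
+				     // $\min_i x_i = -\max_i (-x_i)$,
+				     // so communicating
+				     // $-E_{\textrm{min}}$ alongside
+				     // $E_{\textrm{max}}$ lets a
+				     // single max-reduction recover
+				     // both extrema: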
+    const double local_sums[2]   = { entropy_integrated, area },
+		 local_maxima[2] = { -min_entropy, max_entropy };
+    double global_sums[2], global_maxima[2];
+
+    Utilities::MPI::sum (local_sums,   MPI_COMM_WORLD, global_sums);
+    Utilities::MPI::max (local_maxima, MPI_COMM_WORLD, global_maxima);
+
+				     // Having computed everything
+				     // this way, we can then compute
+				     // the average entropy and find
+				     // the $L^\infty$ norm by taking
+				     // the larger of the deviation of
+				     // the maximum or minimum from
+				     // the average:
+    const double average_entropy = global_sums[0] / global_sums[1];
+    const double entropy_diff = std::max(global_maxima[1] - average_entropy,
+					 average_entropy - (-global_maxima[0]));
+    return entropy_diff;
+  }
+
+
+
+				   // @sect5{BoussinesqFlowProblem::get_extrapolated_temperature_range}
+				   // The next function computes the
+				   // minimal and maximal value of the
+				   // extrapolated temperature over
+				   // the entire domain. Again, this
+				   // is only a slightly modified
+				   // version of the respective
+				   // function in step-31. As in the
+				   // function above, we collect local
+				   // minima and maxima and then
+				   // compute the global extrema using
+				   // the same trick as above.
+				   //
+				   // As already discussed in step-31, the
+				   // function needs to distinguish between
+				   // the first and all following time steps
+				   // because it uses a higher order
+				   // temperature extrapolation scheme when at
+				   // least two previous time steps are
+				   // available.
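+				   //
+				   // Written out, with current and
+				   // previous time step sizes
+				   // $\Delta t$ and
+				   // $\Delta t_{\textrm{old}}$, the
+				   // extrapolated temperature used
+				   // below is
+				   // $\left(1+\frac{\Delta t}{\Delta t_{\textrm{old}}}\right)
+				   // T^{n-1}
+				   // - \frac{\Delta t}{\Delta t_{\textrm{old}}} T^{n-2}$;
+				   // in the very first time step it
+				   // reduces to $T^{n-1}$ itself.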
+  template <int dim>
+  std::pair<double,double>
+  BoussinesqFlowProblem<dim>::get_extrapolated_temperature_range () const
+  {
+    const QIterated<dim> quadrature_formula (QTrapez<1>(),
+					     parameters.temperature_degree);
+    const unsigned int n_q_points = quadrature_formula.size();
+
+    FEValues<dim> fe_values (mapping, temperature_fe, quadrature_formula,
+			     update_values);
+    std::vector<double> old_temperature_values(n_q_points);
+    std::vector<double> old_old_temperature_values(n_q_points);
+
+    double min_local_temperature = std::numeric_limits<double>::max(),
+	   max_local_temperature = -std::numeric_limits<double>::max();
+
+    if (timestep_number != 0)
+      {
+	typename DoFHandler<dim>::active_cell_iterator
+	  cell = temperature_dof_handler.begin_active(),
+	  endc = temperature_dof_handler.end();
+	for (; cell!=endc; ++cell)
+	  if (cell->is_locally_owned())
+	    {
+	      fe_values.reinit (cell);
+	      fe_values.get_function_values (old_temperature_solution,
+					     old_temperature_values);
+	      fe_values.get_function_values (old_old_temperature_solution,
+					     old_old_temperature_values);
+
+	      for (unsigned int q=0; q<n_q_points; ++q)
+		{
+		  const double temperature =
+		    (1. + time_step/old_time_step) * old_temperature_values[q]-
+		    time_step/old_time_step * old_old_temperature_values[q];
+
+		  min_local_temperature = std::min (min_local_temperature,
+						    temperature);
+		  max_local_temperature = std::max (max_local_temperature,
+						    temperature);
+		}
+	    }
+      }
+    else
+      {
+	typename DoFHandler<dim>::active_cell_iterator
+	  cell = temperature_dof_handler.begin_active(),
+	  endc = temperature_dof_handler.end();
+	for (; cell!=endc; ++cell)
+	  if (cell->is_locally_owned())
+	    {
+	      fe_values.reinit (cell);
+	      fe_values.get_function_values (old_temperature_solution,
+					     old_temperature_values);
+
+	      for (unsigned int q=0; q<n_q_points; ++q)
+		{
+		  const double temperature = old_temperature_values[q];
+
+		  min_local_temperature = std::min (min_local_temperature,
+						    temperature);
+		  max_local_temperature = std::max (max_local_temperature,
+						    temperature);
+		}
+	    }
+      }
+
+    double local_extrema[2] = { -min_local_temperature,
+				max_local_temperature };
+    double global_extrema[2];
+    Utilities::MPI::max (local_extrema, MPI_COMM_WORLD, global_extrema);
+
+    return std::make_pair(-global_extrema[0], global_extrema[1]);
+  }
+
+
+				   // @sect5{BoussinesqFlowProblem::compute_viscosity}
+				   // The function that calculates the
+				   // viscosity is purely local and so needs
+				   // no communication at all. It is mostly
+				   // the same as in step-31 but with an
+				   // updated formulation of the viscosity if
+				   // $\alpha=2$ is chosen.
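+				   // Apart from the special cases
+				   // handled at the beginning and
+				   // end of the function, the
+				   // quantity computed below is
+				   // $\nu_h|_K = \min\big(\beta\,
+				   // \|\mathbf{u}\|_{\infty,K} h_K,\;
+				   // \nu_E|_K\big)$, where for
+				   // $\alpha=2$ the entropy
+				   // viscosity is
+				   // $\nu_E|_K = c_R\, h_K^2\,
+				   // \|R_\alpha(T)\|_{\infty,K} /
+				   // \|E(T)-\bar{E}(T)\|_\infty$: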
+  template <int dim>
+  double
+  BoussinesqFlowProblem<dim>::
+  compute_viscosity (const std::vector<double>          &old_temperature,
+		     const std::vector<double>          &old_old_temperature,
+		     const std::vector<Tensor<1,dim> >  &old_temperature_grads,
+		     const std::vector<Tensor<1,dim> >  &old_old_temperature_grads,
+		     const std::vector<double>          &old_temperature_laplacians,
+		     const std::vector<double>          &old_old_temperature_laplacians,
+		     const std::vector<Tensor<1,dim> >  &old_velocity_values,
+		     const std::vector<Tensor<1,dim> >  &old_old_velocity_values,
+		     const std::vector<SymmetricTensor<2,dim> >  &old_strain_rates,
+		     const std::vector<SymmetricTensor<2,dim> >  &old_old_strain_rates,
+		     const double                        global_u_infty,
+		     const double                        global_T_variation,
+		     const double                        average_temperature,
+		     const double                        global_entropy_variation,
+		     const double                        cell_diameter) const
+  {
+    if (global_u_infty == 0)
+      return 5e-3 * cell_diameter;
+
+    const unsigned int n_q_points = old_temperature.size();
+
+    double max_residual = 0;
+    double max_velocity = 0;
+
+    for (unsigned int q=0; q < n_q_points; ++q)
+      {
+	const Tensor<1,dim> u = (old_velocity_values[q] +
+				 old_old_velocity_values[q]) / 2;
+
+	const SymmetricTensor<2,dim> strain_rate = (old_strain_rates[q] +
+						    old_old_strain_rates[q]) / 2;
+
+	const double T = (old_temperature[q] + old_old_temperature[q]) / 2;
+	const double dT_dt = (old_temperature[q] - old_old_temperature[q])
+			     / old_time_step;
+	const double u_grad_T = u * (old_temperature_grads[q] +
+				     old_old_temperature_grads[q]) / 2;
+
+	const double kappa_Delta_T = EquationData::kappa
+				     * (old_temperature_laplacians[q] +
+					old_old_temperature_laplacians[q]) / 2;
+	const double gamma
+	  = ((EquationData::radiogenic_heating * EquationData::density(T)
+	      +
+	      2 * EquationData::eta * strain_rate * strain_rate) /
+	     (EquationData::density(T) * EquationData::specific_heat));
+
+	double residual
+	  = std::abs(dT_dt + u_grad_T - kappa_Delta_T - gamma);
+	if (parameters.stabilization_alpha == 2)
+	  residual *= std::abs(T - average_temperature);
+
+	max_residual = std::max (residual,        max_residual);
+	max_velocity = std::max (std::sqrt (u*u), max_velocity);
+      }
+
+    const double max_viscosity = (parameters.stabilization_beta *
+				  max_velocity * cell_diameter);
+    if (timestep_number == 0)
+      return max_viscosity;
+    else
+      {
+	Assert (old_time_step > 0, ExcInternalError());
+
+	double entropy_viscosity;
+	if (parameters.stabilization_alpha == 2)
+	  entropy_viscosity = (parameters.stabilization_c_R *
+			       cell_diameter * cell_diameter *
+			       max_residual /
+			       global_entropy_variation);
+	else
+	  entropy_viscosity = (parameters.stabilization_c_R *
+			       cell_diameter * global_Omega_diameter *
+			       max_velocity * max_residual /
+			       (global_u_infty * global_T_variation));
+
+	return std::min (max_viscosity, entropy_viscosity);
+      }
+  }
+
+
+
+  				   // @sect5{BoussinesqFlowProblem::project_temperature_field}
+
+				   // This function is new compared to
+				   // step-31. What it does is to re-implement
+				   // the library function
+				   // <code>VectorTools::project()</code> for
+				   // an MPI-based parallelization, a function
+				   // we used for generating an initial vector
+				   // for temperature based on some initial
+				   // function. The library function only
+				   // works with shared memory but doesn't
+				   // know how to utilize multiple machines
+				   // coupled through MPI to compute the
+				   // projected field. The details of a
+				   // <code>project()</code> function are not
+				   // very difficult. All we do is to use a
+				   // mass matrix and put the evaluation of
+				   // the initial value function on the right
+				   // hand side. The mass matrix for
+				   // temperature we can simply generate using
+				   // the respective assembly function, so all
+				   // we need to do here is to create the
+				   // right hand side and do a CG solve. The
+				   // assembly function does a loop over all
+				   // cells and evaluates the function in the
+				   // <code>EquationData</code> namespace, and
+				   // does this only on cells owned by the
+				   // respective processor. The implementation
+				   // of this assembly differs from the
+				   // assembly we do for the principal
+				   // assembly functions further down (which
+				   // include thread-based parallelization
+				   // with the WorkStream concept). Here we
+				   // chose to keep things simple (keeping in
+				   // mind that this function is also only
+				   // called once at the beginning of the
+				   // program, not in every time step), and
+				   // generating the right hand side is cheap
+				   // anyway so we won't even notice that this
+				   // part is not parallelized by threads.
+				   //
+				   // Regarding the implementation of
+				   // inhomogeneous Dirichlet boundary
+				   // conditions: Since we use the temperature
+				   // ConstraintMatrix, we could apply the
+				   // boundary conditions directly when
+				   // building the respective matrix and right
+				   // hand side. In this case, the boundary
+				   // conditions are inhomogeneous, which
+				   // makes this procedure somewhat tricky
+				   // since we get the matrix from some other
+				   // function that uses its own integration
+				   // and assembly loop. However, the correct
+				   // imposition of boundary conditions needs
+				   // the matrix data we work on plus the
+				   // right hand side simultaneously, since
+				   // the right hand side is created by
+				   // Gaussian elimination on the matrix
+				   // rows. In order to not introduce the
+				   // matrix assembly at this place, but still
+				   // having the matrix data available, we
+				   // choose to create a dummy matrix
+				   // <code>matrix_for_bc</code> that we only
+				   // fill with data when we need it for
+				   // imposing boundary conditions. These
+				   // positions are exactly those where we
+				   // have an inhomogeneous entry in the
+				   // ConstraintMatrix. There are only a few
+				   // such positions (on the boundary DoFs),
+				   // so it is still much cheaper to use this
+				   // function than to create the full matrix
+				   // here. To implement this, we ask the
+				   // constraint matrix whether the DoF under
+				   // consideration is inhomogeneously
+				   // constrained. In that case, we generate
+				   // the respective matrix column that we
+				   // need for creating the correct right hand
+				   // side. Note that this (manually
+				   // generated) matrix entry needs to be
+				   // exactly the entry that we would fill the
+				   // matrix with &mdash; otherwise, this will
+				   // not work.
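+				   //
+				   // Expressed as a linear system,
+				   // the projection below computes
+				   // the coefficient vector $x$
+				   // satisfying $M x = F$, where
+				   // $M_{ij} = \int_\Omega
+				   // \varphi_i \varphi_j \; dx$ is
+				   // the temperature mass matrix
+				   // and $F_i = \int_\Omega
+				   // T_0(\mathbf x) \varphi_i \; dx$
+				   // tests the initial temperature
+				   // $T_0$ against each shape
+				   // function $\varphi_i$.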
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::project_temperature_field ()
+  {
+    assemble_temperature_matrix ();
+
+    QGauss<dim> quadrature(parameters.temperature_degree+2);
+    UpdateFlags update_flags = UpdateFlags(update_values   |
+					   update_quadrature_points |
+					   update_JxW_values);
+    FEValues<dim> fe_values (mapping, temperature_fe, quadrature, update_flags);
+
+    const unsigned int dofs_per_cell = fe_values.dofs_per_cell,
+		       n_q_points    = fe_values.n_quadrature_points;
+
+    std::vector<unsigned int> local_dof_indices (dofs_per_cell);
+    Vector<double> cell_vector (dofs_per_cell);
+    FullMatrix<double> matrix_for_bc (dofs_per_cell, dofs_per_cell);
+
+    std::vector<double> rhs_values(n_q_points);
+
+    PETScWrappers::MPI::Vector
+      rhs (MPI_COMM_WORLD,temperature_mass_matrix.m(),temperature_mass_matrix.local_size()), //rhs (temperature_mass_matrix.row_partitioner()),
+      solution (MPI_COMM_WORLD,temperature_mass_matrix.m(),temperature_mass_matrix.local_size()); //solution (temperature_mass_matrix.row_partitioner());
+
+    const EquationData::TemperatureInitialValues<dim> initial_temperature;
+
+    typename DoFHandler<dim>::active_cell_iterator
+      cell = temperature_dof_handler.begin_active(),
+      endc = temperature_dof_handler.end();
+
+    for (; cell!=endc; ++cell)
+      if (cell->is_locally_owned())
+	{
+	  cell->get_dof_indices (local_dof_indices);
+	  fe_values.reinit (cell);
+
+	  initial_temperature.value_list (fe_values.get_quadrature_points(),
+					  rhs_values);
+
+	  cell_vector = 0;
+	  matrix_for_bc = 0;
+	  for (unsigned int point=0; point<n_q_points; ++point)
+	    for (unsigned int i=0; i<dofs_per_cell; ++i)
+	      {
+		cell_vector(i) += rhs_values[point] *
+				  fe_values.shape_value(i,point) *
+				  fe_values.JxW(point);
+		if (temperature_constraints.is_inhomogeneously_constrained(local_dof_indices[i]))
+		  {
+		    for (unsigned int j=0; j<dofs_per_cell; ++j)
+		      matrix_for_bc(j,i) += fe_values.shape_value(i,point) *
+					    fe_values.shape_value(j,point) *
+					    fe_values.JxW(point);
+		  }
+	      }
+
+	  temperature_constraints.distribute_local_to_global (cell_vector,
+							      local_dof_indices,
+							      rhs,
+							      matrix_for_bc);
+	}
+
+    rhs.compress (); //rhs.compress (Add);
+    solution.compress();
+//    return;
+
+				     // Now that we have the right linear
+				     // system, we solve it using the CG
+				     // method with a simple Jacobi
+				     // preconditioner:
+    SolverControl solver_control(5*rhs.size(), 1e-12*rhs.l2_norm());
+    SolverCG<PETScWrappers::MPI::Vector> cg(solver_control);
+
+    PETScWrappers::PreconditionJacobi preconditioner_mass;
+    preconditioner_mass.initialize(temperature_mass_matrix);
+
+    cg.solve (temperature_mass_matrix, solution, rhs, preconditioner_mass);
+
+    temperature_constraints.distribute (solution);
+
+				     // Having so computed the current
+				     // temperature field, let us set the
+				     // member variable that holds the
+				     // temperature nodes. Strictly speaking,
+				     // we really only need to set
+				     // <code>old_temperature_solution</code>
+				     // since the first thing we will do is to
+				     // compute the Stokes solution that only
+				     // requires the previous time step's
+				     // temperature field. That said, nothing
+				     // good can come from not initializing
+				     // the other vectors as well (especially
+				     // since it's a relatively cheap
+				     // operation and we only have to do it
+				     // once at the beginning of the program)
+				     // if we ever want to extend our
+				     // numerical method or physical model,
+				     // and so we initialize
+				     // <code>temperature_solution</code> and
+				     // <code>old_old_temperature_solution</code>
+				     // as well. As a sidenote, while the
+				     // <code>solution</code> vector is
+				     // strictly distributed (i.e. each
+				     // processor only stores a mutually
+				     // exclusive subset of elements), the
+				     // assignment makes sure that the vectors
+				     // on the left hand side (which were
+				     // initialized to contain ghost elements
+				     // as well) also get the correct ghost
+				     // elements. In other words, the
+				     // assignment here requires communication
+				     // between processors:
+//    CIG::reduce_accuracy(solution);
+
+    temperature_solution = solution;
+    old_temperature_solution = solution;
+    old_old_temperature_solution = solution;
+    temperature_solution.update_ghost_values();
+    old_temperature_solution.update_ghost_values();
+    old_old_temperature_solution.update_ghost_values();
+  }
+
+
+
+
+				   // @sect4{The BoussinesqFlowProblem setup functions}
+
+				   // The following three functions set up the
+				   // Stokes matrix, the matrix used for the
+				   // Stokes preconditioner, and the
+				   // temperature matrix. The code is mostly
+				   // the same as in step-31, but it has been
+				   // broken out into three functions of their
+				   // own for simplicity.
+				   //
+				   // The main functional difference between
+				   // the code here and that in step-31 is
+				   // that the matrices we want to set up are
+				   // distributed across multiple
+				   // processors. Since we still want to build
+				   // up the sparsity pattern first for
+				   // efficiency reasons, we could continue to
+				   // build the <i>entire</i> sparsity pattern
+				   // as a
+				   // BlockCompressedSimpleSparsityPattern, as
+				   // we did in step-31. However, that would
+				   // be inefficient: every processor would
+				   // build the same sparsity pattern, but
+				   // only initialize a small part of the
+				   // matrix using it. It also violates the
+				   // principle that every processor should
+				   // only work on those cells it owns (and,
+				   // if necessary the layer of ghost cells
+				   // around it).
+				   //
+				   // Rather, we use an object of type
+				   // TrilinosWrappers::BlockSparsityPattern,
+				   // which is (obviously) a wrapper around a
+				   // sparsity pattern object provided by
+				   // Trilinos. The advantage is that the
+				   // Trilinos sparsity pattern class can
+				   // communicate across multiple processors:
+				   // if this processor fills in all the
+				   // nonzero entries that result from the
+				   // cells it owns, and every other processor
+				   // does so as well, then at the end after
+				   // some MPI communication initiated by the
+				   // <code>compress()</code> call, we will
+				   // have the globally assembled sparsity
+				   // pattern available with which the global
+				   // matrix can be initialized.
+				   //
+				   // The only other change we need to make is
+				   // to tell the
+				   // DoFTools::make_sparsity_pattern() function
+				   // that it is only supposed to work on a
+				   // subset of cells, namely the ones whose
+				   // <code>subdomain_id</code> equals the
+				   // number of the current processor, and to
+				   // ignore all other cells.
+				   //
+				   // This strategy is replicated across all
+				   // three of the following functions.
+				   //
+				   // Note that Trilinos matrices store the
+				   // information contained in the sparsity
+				   // patterns, so we can safely release the
+				   // <code>sp</code> variable once the matrix
+				   // has been given the sparsity structure.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::
+  setup_stokes_matrix (const std::vector<IndexSet> &stokes_partitioning)
+  {
+	  assert(false);
+
+    stokes_matrix.clear ();
+
+    TrilinosWrappers::BlockSparsityPattern sp (stokes_partitioning,
+					       MPI_COMM_WORLD);
+
+    Table<2,DoFTools::Coupling> coupling (dim+1, dim+1);
+
+    for (unsigned int c=0; c<dim+1; ++c)
+      for (unsigned int d=0; d<dim+1; ++d)
+	if (! ((c==dim) && (d==dim)))
+	  coupling[c][d] = DoFTools::always;
+	else
+	  coupling[c][d] = DoFTools::none;
+
+    DoFTools::make_sparsity_pattern (stokes_dof_handler,
+				     coupling, sp,
+				     stokes_constraints, false,
+				     Utilities::MPI::
+				     this_mpi_process(MPI_COMM_WORLD));
+    sp.compress();
+
+//    stokes_matrix.reinit (sp);
+//    stokes_matrix.reinit(sp.n_block_rows(), sp.n_block_cols());
+//    for (unsigned int r=0; r<sp.n_block_rows(); ++r)
+//    	for (unsigned int c=0; c<sp.n_block_cols(); ++c)
+//    	{
+//    		stokes_matrix.block(r,s).reinit((block_sparsity_pattern.block(r,c));
+//    	}
+  }
+
+
+
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::
+  setup_stokes_preconditioner (const std::vector<IndexSet> &stokes_partitioning)
+  {
+	  assert(false);
+
+    Amg_preconditioner.reset ();
+    Mp_preconditioner.reset ();
+
+    stokes_preconditioner_matrix.clear ();
+
+    TrilinosWrappers::BlockSparsityPattern sp (stokes_partitioning,
+					       MPI_COMM_WORLD);
+
+    Table<2,DoFTools::Coupling> coupling (dim+1, dim+1);
+    for (unsigned int c=0; c<dim+1; ++c)
+      for (unsigned int d=0; d<dim+1; ++d)
+	if (c == d)
+	  coupling[c][d] = DoFTools::always;
+	else
+	  coupling[c][d] = DoFTools::none;
+
+    DoFTools::make_sparsity_pattern (stokes_dof_handler,
+				     coupling, sp,
+				     stokes_constraints, false,
+				     Utilities::MPI::
+				     this_mpi_process(MPI_COMM_WORLD));
+    sp.compress();
+
+//    stokes_preconditioner_matrix.reinit (sp);
+  }
+
+
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::
+  setup_temperature_matrices (const IndexSet &temperature_partitioner)
+  {
+    T_preconditioner.reset ();
+    temperature_mass_matrix.clear ();
+    temperature_stiffness_matrix.clear ();
+    temperature_matrix.clear ();
+
+//    PETScWrappers::SparsityPattern sp (temperature_partitioner,
+//					  MPI_COMM_WORLD);
+//    DoFTools::make_sparsity_pattern (temperature_dof_handler, sp,
+//				     temperature_constraints, false,
+//				     Utilities::MPI::
+//				     this_mpi_process(MPI_COMM_WORLD));
+//    sp.compress();
+    int my_rank = Utilities::MPI::this_mpi_process(MPI_COMM_WORLD);
+    const unsigned int n_local_dofs = temperature_dof_handler.n_locally_owned_dofs();
+    const unsigned int n_dofs = temperature_dof_handler.n_dofs();
+//       = DoFTools::count_dofs_with_subdomain_association (temperature_dof_handler,
+//       		Utilities::MPI::this_mpi_process(MPI_COMM_WORLD));
+
+//    temperature_matrix.reinit (sp);
+//    temperature_mass_matrix.reinit (sp);
+//    temperature_stiffness_matrix.reinit (sp);
+    temperature_matrix.reinit (MPI_COMM_WORLD,
+    		temperature_dof_handler.n_dofs(),
+    		temperature_dof_handler.n_dofs(),
+    		n_local_dofs,
+    		n_local_dofs,
+    		temperature_dof_handler.max_couplings_between_dofs());
+    temperature_mass_matrix.reinit (MPI_COMM_WORLD,
+    		temperature_dof_handler.n_dofs(),
+    		temperature_dof_handler.n_dofs(),
+    		n_local_dofs,
+    		n_local_dofs,
+    		temperature_dof_handler.max_couplings_between_dofs());
+    temperature_stiffness_matrix.reinit (MPI_COMM_WORLD,
+    		temperature_dof_handler.n_dofs(),
+    		temperature_dof_handler.n_dofs(),
+    		n_local_dofs,
+    		n_local_dofs,
+    		temperature_dof_handler.max_couplings_between_dofs());
+  }
+
+
+
+				   // The remainder of the setup function
+				   // (after splitting out the three functions
+				   // above) mostly has to deal with the
+				   // things we need to do for parallelization
+				   // across processors. Because setting all
+				   // of this up is a significant compute time
+				   // expense of the program, we put
+				   // everything we do here into a timer group
+				   // so that we can get summary information
+				   // about the fraction of time spent in this
+				   // part of the program at its end.
+				   //
+				   // At the top as usual we enumerate degrees
+				   // of freedom and sort them by
+				   // component/block, followed by writing
+				   // their numbers to the screen from
+				   // processor zero. The
+				   // DoFHandler::distribute_dofs()
+				   // function, when applied to a
+				   // parallel::distributed::Triangulation
+				   // object, sorts degrees of
+				   // freedom in such a
+				   // way that all degrees of freedom
+				   // associated with subdomain zero come
+				   // before all those associated with
+				   // subdomain one, etc. For the Stokes
+				   // part, this entails, however, that
+				   // velocities and pressures become
+				   // intermixed, but this is trivially
+				   // solved by sorting again by blocks; it
+				   // is worth noting that this latter
+				   // operation leaves the relative ordering
+				   // of all velocities and pressures alone,
+				   // i.e. within the velocity block we will
+				   // still have all those associated with
+				   // subdomain zero before all velocities
+				   // associated with subdomain one,
+				   // etc. This is important since we store
+				   // each of the blocks of this matrix
+				   // distributed across all processors and
+				   // want this to be done in such a way
+				   // that each processor stores that part
+				   // of the matrix that is roughly equal to
+				   // the degrees of freedom located on
+				   // those cells that it will actually work
+				   // on.
+				   //
+				   // When printing the numbers of degrees of
+				   // freedom, note that these numbers are
+				   // going to be large if we use many
+				   // processors. Consequently, we let the
+				   // stream put a comma separator in between
+				   // every three digits. The state of the
+				   // stream, using the locale, is saved from
+				   // before to after this operation. While
+				   // slightly opaque, the code works because
+				   // the default locale (which we get using
+				   // the constructor call
+				   // <code>std::locale("")</code>) implies
+				   // printing numbers with a comma separator
+				   // for every third digit (i.e., thousands,
+				   // millions, billions).
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::setup_dofs ()
+  {
+    computing_timer.enter_section("Setup dof systems");
+
+    std::vector<unsigned int> stokes_sub_blocks (dim+1,0);
+    stokes_sub_blocks[dim] = 1;
+    stokes_dof_handler.distribute_dofs (stokes_fe);
+    DoFRenumbering::component_wise (stokes_dof_handler, stokes_sub_blocks);
+
+    temperature_dof_handler.distribute_dofs (temperature_fe);
+
+    std::vector<unsigned int> stokes_dofs_per_block (2);
+    DoFTools::count_dofs_per_block (stokes_dof_handler, stokes_dofs_per_block,
+				    stokes_sub_blocks);
+
+    const unsigned int n_u = stokes_dofs_per_block[0],
+		       n_p = stokes_dofs_per_block[1],
+		       n_T = temperature_dof_handler.n_dofs();
+
+    std::locale s = pcout.get_stream().getloc();
+    pcout.get_stream().imbue(std::locale(""));
+    pcout << "Number of active cells: "
+	  << triangulation.n_global_active_cells()
+	  << " (on "
+	  << triangulation.n_levels()
+	  << " levels)"
+	  << std::endl
+	  << "Number of degrees of freedom: "
+	  << n_u + n_p + n_T
+	  << " (" << n_u << '+' << n_p << '+'<< n_T <<')'
+	  << std::endl
+	  << std::endl;
+    pcout.get_stream().imbue(s);
+
+
+				     // After this, we have to set up the
+				     // various partitioners (of type
+				     // <code>IndexSet</code>, see the
+				     // introduction) that describe which
+				     // parts of each matrix or vector will be
+				     // stored where, then call the functions
+				     // that actually set up the matrices, and
+				     // at the end also resize the various
+				     // vectors we keep around in this
+				     // program.
+    std::vector<IndexSet> stokes_partitioning, stokes_relevant_partitioning;
+    IndexSet temperature_partitioning (n_T), temperature_relevant_partitioning (n_T);
+    IndexSet stokes_relevant_set;
+    {
+      IndexSet stokes_index_set = stokes_dof_handler.locally_owned_dofs();
+      stokes_partitioning.push_back(stokes_index_set.get_view(0,n_u));
+      stokes_partitioning.push_back(stokes_index_set.get_view(n_u,n_u+n_p));
+
+      DoFTools::extract_locally_relevant_dofs (stokes_dof_handler,
+					       stokes_relevant_set);
+      stokes_relevant_partitioning.push_back(stokes_relevant_set.get_view(0,n_u));
+      stokes_relevant_partitioning.push_back(stokes_relevant_set.get_view(n_u,n_u+n_p));
+
+      temperature_partitioning = temperature_dof_handler.locally_owned_dofs();
+      DoFTools::extract_locally_relevant_dofs (temperature_dof_handler,
+					       temperature_relevant_partitioning);
+    }
+
+				     // Following this, we can compute
+				     // constraints for the solution vectors,
+				     // including hanging node constraints and
+				     // homogeneous and inhomogeneous boundary
+				     // values for the Stokes and temperature
+				     // fields. Note that as for everything
+				     // else, the constraint objects can not
+				     // hold <i>all</i> constraints on every
+				     // processor. Rather, each processor
+				     // needs to store only those that are
+				     // actually necessary for correctness
+				     // given that it only assembles linear
+				     // systems on cells it owns. As discussed
+				     // in
+				     // @ref distributed_paper "this paper",
+				     // the set of constraints we need to know
+				     // about is exactly the set of
+				     // constraints on all locally relevant
+				     // degrees of freedom, so this is what we
+				     // use to initialize the constraint
+				     // objects.
+    {
+      stokes_constraints.clear ();
+      stokes_constraints.reinit (stokes_relevant_set);
+
+      DoFTools::make_hanging_node_constraints (stokes_dof_handler,
+					       stokes_constraints);
+
+      std::vector<bool> velocity_mask (dim+1, true);
+      velocity_mask[dim] = false;
+      VectorTools::interpolate_boundary_values (stokes_dof_handler,
+						0,
+						ZeroFunction<dim>(dim+1),
+						stokes_constraints,
+						velocity_mask);
+
+      std::set<types::boundary_id> no_normal_flux_boundaries;
+      no_normal_flux_boundaries.insert (1);
+      VectorTools::compute_no_normal_flux_constraints (stokes_dof_handler, 0,
+                                                       no_normal_flux_boundaries,
+                                                       stokes_constraints,
+                                                       mapping);
+      stokes_constraints.close ();
+    }
+    {
+      temperature_constraints.clear ();
+      temperature_constraints.reinit (temperature_relevant_partitioning);
+
+      DoFTools::make_hanging_node_constraints (temperature_dof_handler,
+                                               temperature_constraints);
+      VectorTools::interpolate_boundary_values (temperature_dof_handler,
+                                                0,
+                                                EquationData::TemperatureInitialValues<dim>(),
+                                                temperature_constraints);
+      VectorTools::interpolate_boundary_values (temperature_dof_handler,
+                                                1,
+                                                EquationData::TemperatureInitialValues<dim>(),
+                                                temperature_constraints);
+      temperature_constraints.close ();
+    }
+
+                                     // All this done, we can then initialize
+                                     // the various matrix and vector objects
+                                     // to their proper sizes. At the end, we
+                                     // also record that all matrices and
+                                     // preconditioners have to be re-computed
+                                     // at the beginning of the next time
+                                     // step.
+    std::vector<unsigned int> block_sizes, local_sizes;
+    CIG::convert_block_partitioning(stokes_partitioning,n_u,n_p,block_sizes,local_sizes);
+
+//    setup_stokes_matrix (stokes_partitioning);
+    CIG::setup_petsc_matrix(block_sizes,local_sizes,stokes_dof_handler.max_couplings_between_dofs(),stokes_matrix);
+//    setup_stokes_preconditioner (stokes_partitioning);
+    CIG::setup_petsc_matrix(block_sizes,local_sizes,stokes_dof_handler.max_couplings_between_dofs(),stokes_preconditioner_matrix);
+    setup_temperature_matrices (temperature_partitioning);
+
+    stokes_rhs.reinit(block_sizes,MPI_COMM_WORLD,local_sizes);	//    stokes_rhs.reinit (stokes_partitioning, MPI_COMM_WORLD);
+    CIG::setup_petsc_vector(block_sizes, stokes_partitioning,stokes_relevant_partitioning,stokes_solution);
+//    old_stokes_solution.reinit (stokes_solution);
+    CIG::setup_petsc_vector(block_sizes, stokes_partitioning,stokes_relevant_partitioning,old_stokes_solution);
+
+    temperature_rhs.reinit (MPI_COMM_WORLD, temperature_partitioning);
+    temperature_solution.reinit (MPI_COMM_WORLD, temperature_partitioning, temperature_relevant_partitioning);
+    old_temperature_solution.reinit (MPI_COMM_WORLD, temperature_partitioning, temperature_relevant_partitioning);
+    old_old_temperature_solution.reinit(MPI_COMM_WORLD, temperature_partitioning, temperature_relevant_partitioning);
+
+    rebuild_stokes_matrix              = true;
+    rebuild_stokes_preconditioner      = true;
+    rebuild_temperature_matrices       = true;
+    rebuild_temperature_preconditioner = true;
+
+    computing_timer.exit_section();
+  }
+
+
+
+                                   // @sect4{The BoussinesqFlowProblem assembly functions}
+                                   //
+                                   // Following the discussion in the
+                                   // introduction and in the @ref threads
+                                   // module, we split the assembly functions
+                                   // into different parts:
+                                   //
+                                   // <ul> <li> The local calculations of
+                                   // matrices and right hand sides, given a
+                                   // certain cell as input (these functions
+                                   // are named <code>local_assemble_*</code>
+                                   // below). The resulting function is, in
+                                   // other words, essentially the body of the
+                                   // loop over all cells in step-31. Note,
+                                   // however, that these functions store the
+                                   // result from the local calculations in
+                                   // variables of classes from the CopyData
+                                   // namespace.
+                                   //
+                                   // <li>These objects are then given to the
+                                   // second step which writes the local data
+                                   // into the global data structures (these
+                                   // functions are named
+                                   // <code>copy_local_to_global_*</code>
+                                   // below). These functions are pretty
+                                   // trivial.
+                                   //
+                                   // <li>These two subfunctions are then used
+                                   // in the respective assembly routine
+                                   // (called <code>assemble_*</code> below),
+                                   // where a WorkStream object is set up and
+                                   // runs over all the cells that belong to
+                                   // the processor's subdomain.  </ul>
+
+                                   // @sect5{Stokes preconditioner assembly}
+                                   //
+                                   // Let us start with the functions that
+                                   // build the Stokes preconditioner. The
+                                   // first two of these are pretty trivial,
+                                   // given the discussion above. Note in
+                                   // particular that the main point in using
+                                   // the scratch data object is that we want
+                                   // to avoid allocating any objects on the
+                                   // free store (heap) each time we visit a new
+                                   // cell. As a consequence, the assembly
+                                   // function below only has automatic local
+                                   // variables, and everything else is
+                                   // accessed through the scratch data
+                                   // object, which is allocated only once
+                                   // before we start the loop over all cells:
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  local_assemble_stokes_preconditioner (const typename DoFHandler<dim>::active_cell_iterator &cell,
+                                        Assembly::Scratch::StokesPreconditioner<dim> &scratch,
+                                        Assembly::CopyData::StokesPreconditioner<dim> &data)
+  {
+    const unsigned int   dofs_per_cell   = stokes_fe.dofs_per_cell;
+    const unsigned int   n_q_points      = scratch.stokes_fe_values.n_quadrature_points;
+
+    const FEValuesExtractors::Vector velocities (0);
+    const FEValuesExtractors::Scalar pressure (dim);
+
+    scratch.stokes_fe_values.reinit (cell);
+    cell->get_dof_indices (data.local_dof_indices);
+
+    data.local_matrix = 0;
+
+    for (unsigned int q=0; q<n_q_points; ++q)
+      {
+        for (unsigned int k=0; k<dofs_per_cell; ++k)
+          {
+            scratch.grad_phi_u[k] = scratch.stokes_fe_values[velocities].gradient(k,q);
+            scratch.phi_p[k]      = scratch.stokes_fe_values[pressure].value (k, q);
+          }
+
+        for (unsigned int i=0; i<dofs_per_cell; ++i)
+          for (unsigned int j=0; j<dofs_per_cell; ++j)
+            data.local_matrix(i,j) += (EquationData::eta *
+                                       scalar_product (scratch.grad_phi_u[i],
+                                                       scratch.grad_phi_u[j])
+                                       +
+                                       (1./EquationData::eta) *
+                                       EquationData::pressure_scaling *
+                                       EquationData::pressure_scaling *
+                                       (scratch.phi_p[i] * scratch.phi_p[j]))
+                                      * scratch.stokes_fe_values.JxW(q);
+      }
+  }
+
+
+
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  copy_local_to_global_stokes_preconditioner (const Assembly::CopyData::StokesPreconditioner<dim> &data)
+  {
+    stokes_constraints.distribute_local_to_global (data.local_matrix,
+                                                   data.local_dof_indices,
+                                                   stokes_preconditioner_matrix);
+  }
+
+
+                                   // Now for the function that actually puts
+                                   // things together, using the WorkStream
+                                   // functions.  WorkStream::run needs a
+                                   // start and end iterator to enumerate the
+                                   // cells it is supposed to work
+                                   // on. Typically, one would use
+                                   // DoFHandler::begin_active() and
+                                   // DoFHandler::end() for that but here we
+                                   // actually only want the subset of cells
+                                   // that in fact are owned by the current
+                                   // processor. This is where the
+                                   // FilteredIterator class comes into play:
+                                   // you give it a range of cells and it
+                                   // provides an iterator that only iterates
+                                   // over that subset of cells that satisfy a
+                                   // certain predicate (a predicate is a
+                                   // function of one argument that either
+                                   // returns true or false). The predicate we
+                                   // use here is
+                                   // IteratorFilters::LocallyOwnedCell, i.e.,
+                                   // it returns true exactly if the cell is
+                                   // owned by the current processor. The
+                                   // resulting iterator range is then exactly
+                                   // what we need.
+                                   //
+                                   // With this obstacle out of the way, we
+                                   // call the WorkStream::run function with
+                                   // this set of cells, scratch and copy
+                                   // objects, and with pointers to two
+                                   // functions: the local assembly and
+                                   // copy-local-to-global function. These
+                                   // functions need to have very specific
+                                   // signatures: three arguments in the first
+                                   // and one argument in the latter case (see
+                                   // the documentation of the WorkStream::run
+                                   // function for the meaning of these
+                                   // arguments).  Note how we use the
+                                   // construct <code>std_cxx1x::bind</code>
+                                   // to create a function object that
+                                   // satisfies this requirement. It uses
+                                   // placeholders <code>std_cxx1x::_1,
+                                   // std_cxx1x::_2, std_cxx1x::_3</code>
+                                   // for the local assembly
+                                   // function that specify cell, scratch
+                                   // data, and copy data, as well as the
+                                   // placeholder <code>_1</code> for the copy
+                                   // function that expects the data to be
+                                   // written into the global matrix. On the
+                                   // other hand, the implicit zeroth argument
+                                   // of member functions (namely the
+                                   // <code>this</code> pointer of the object
+                                   // on which that member function is to
+                                   // operate) is <i>bound</i> to the
+                                   // <code>this</code> pointer of the current
+                                   // function. The WorkStream::run function,
+                                   // as a consequence, does not need to know
+                                   // anything about the object these
+                                   // functions work on.
+                                   //
+                                   // When the WorkStream is executed, it will
+                                   // create several local assembly routines
+                                   // of the first kind for several cells and
+                                   // let some available processors work on
+                                   // them. The function that needs to be
+                                   // synchronized, i.e., the write operation
+                                   // into the global matrix, however, is
+                                   // executed by only one thread at a time in
+                                   // the prescribed order. Of course, this
+                                   // only holds for the parallelization on a
+                                   // single MPI process. Different MPI
+                                   // processes will have their own WorkStream
+                                   // objects and do that work completely
+                                   // independently (and in different memory
+                                   // spaces). In a distributed calculation,
+                                   // some data will accumulate at degrees of
+                                   // freedom that are not owned by the
+                                   // respective processor. It would be
+                                   // inefficient to send data around every
+                                   // time we encounter such a dof. What
+                                   // happens instead is that the PETSc
+                                   // sparse matrix will keep that data and
+                                   // send it to the owner at the end of
+                                   // assembly, by calling the
+                                   // <code>compress()</code> command.
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::assemble_stokes_preconditioner ()
+  {
+    stokes_preconditioner_matrix = 0;
+
+    const QGauss<dim> quadrature_formula(parameters.stokes_velocity_degree+1);
+
+    typedef
+      FilteredIterator<typename DoFHandler<dim>::active_cell_iterator>
+      CellFilter;
+
+    WorkStream::
+      run (CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       stokes_dof_handler.begin_active()),
+           CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       stokes_dof_handler.end()),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            local_assemble_stokes_preconditioner,
+                            this,
+                            std_cxx1x::_1,
+                            std_cxx1x::_2,
+                            std_cxx1x::_3),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            copy_local_to_global_stokes_preconditioner,
+                            this,
+                            std_cxx1x::_1),
+           Assembly::Scratch::
+           StokesPreconditioner<dim> (stokes_fe, quadrature_formula,
+                                      mapping,
+                                      update_JxW_values |
+                                      update_values |
+                                      update_gradients),
+           Assembly::CopyData::
+           StokesPreconditioner<dim> (stokes_fe));
+
+    stokes_preconditioner_matrix.compress();
+  }
+
+
+
+                                   // The final function in this block
+                                   // initiates assembly of the Stokes
+                                   // preconditioner matrix and then in fact
+                                   // builds the Stokes preconditioner. It is
+                                   // mostly the same as in the serial
+                                   // case. The only difference to step-31 is
+                                   // that we use a Jacobi preconditioner for
+                                   // the pressure mass matrix instead of IC,
+                                   // as discussed in the introduction.
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::build_stokes_preconditioner ()
+  {
+    if (rebuild_stokes_preconditioner == false)
+      return;
+
+    computing_timer.enter_section ("   Build Stokes preconditioner");
+    pcout << "   Rebuilding Stokes preconditioner..." << std::flush;
+
+    assemble_stokes_preconditioner ();
+
+    std::vector<std::vector<bool> > constant_modes;
+    std::vector<bool>  velocity_components (dim+1,true);
+    velocity_components[dim] = false;
+    DoFTools::extract_constant_modes (stokes_dof_handler, velocity_components,
+                                      constant_modes);
+
+    Mp_preconditioner.reset  (new PETScWrappers::PreconditionJacobi());
+    Amg_preconditioner.reset (new PETScWrappers::PreconditionBoomerAMG());
+
+    PETScWrappers::PreconditionBoomerAMG::AdditionalData Amg_data;
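+                                     // The ML-specific settings used by the
+                                     // Trilinos version of this program
+                                     // (constant modes, elliptic,
+                                     // higher_order_elements,
+                                     // smoother_sweeps,
+                                     // aggregation_threshold) have no
+                                     // direct counterpart in BoomerAMG's
+                                     // AdditionalData, so we rely on its
+                                     // defaults; the constant modes
+                                     // extracted above are therefore
+                                     // currently unused.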
+
+    Mp_preconditioner->initialize (stokes_preconditioner_matrix.block(1,1));
+    Amg_preconditioner->initialize (stokes_preconditioner_matrix.block(0,0),
+                                    Amg_data);
+
+    rebuild_stokes_preconditioner = false;
+
+    pcout << std::endl;
+    computing_timer.exit_section();
+  }
+
+
+                                   // @sect5{Stokes system assembly}
+
+                                   // The next three functions implement the
+                                   // assembly of the Stokes system, again
+                                   // split up into a part performing local
+                                   // calculations, one for writing the local
+                                   // data into the global matrix and vector,
+                                   // and one for actually running the loop
+                                   // over all cells with the help of the
+                                   // WorkStream class. Note that the assembly
+                                   // of the Stokes matrix needs only to be
+                                   // done in case we have changed the
+                                   // mesh. Otherwise, just the
+                                   // (temperature-dependent) right hand side
+                                   // needs to be calculated here. Since we
+                                   // are working with distributed matrices
+                                   // and vectors, we have to call the
+                                   // respective <code>compress()</code>
+                                   // functions at the end of the assembly in
+                                   // order to send non-local data to the
+                                   // owner process.
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  local_assemble_stokes_system (const typename DoFHandler<dim>::active_cell_iterator &cell,
+                                Assembly::Scratch::StokesSystem<dim> &scratch,
+                                Assembly::CopyData::StokesSystem<dim> &data)
+  {
+    const unsigned int dofs_per_cell = scratch.stokes_fe_values.get_fe().dofs_per_cell;
+    const unsigned int n_q_points    = scratch.stokes_fe_values.n_quadrature_points;
+
+    const FEValuesExtractors::Vector velocities (0);
+    const FEValuesExtractors::Scalar pressure (dim);
+
+    scratch.stokes_fe_values.reinit (cell);
+
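+                                     // The temperature field lives in its
+                                     // own DoFHandler, so we construct an
+                                     // iterator that points to the same
+                                     // geometric cell but into the
+                                     // temperature DoFHandler; with it we
+                                     // can evaluate the old temperature at
+                                     // the quadrature points of this cell.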
+    typename DoFHandler<dim>::active_cell_iterator
+      temperature_cell (&triangulation,
+                        cell->level(),
+                        cell->index(),
+                        &temperature_dof_handler);
+    scratch.temperature_fe_values.reinit (temperature_cell);
+
+    if (rebuild_stokes_matrix)
+      data.local_matrix = 0;
+    data.local_rhs = 0;
+
+    scratch.temperature_fe_values.get_function_values (old_temperature_solution,
+                                                       scratch.old_temperature_values);
+
+    for (unsigned int q=0; q<n_q_points; ++q)
+      {
+        const double old_temperature = scratch.old_temperature_values[q];
+
+        for (unsigned int k=0; k<dofs_per_cell; ++k)
+          {
+            scratch.phi_u[k] = scratch.stokes_fe_values[velocities].value (k,q);
+            if (rebuild_stokes_matrix)
+              {
+                scratch.grads_phi_u[k] = scratch.stokes_fe_values[velocities].symmetric_gradient(k,q);
+                scratch.div_phi_u[k]   = scratch.stokes_fe_values[velocities].divergence (k, q);
+                scratch.phi_p[k]       = scratch.stokes_fe_values[pressure].value (k, q);
+              }
+          }
+
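+                                     // If the matrix needs rebuilding, add
+                                     // the symmetric Stokes bilinear form
+                                     // (read off the code below):
+                                     //   2 eta (eps(phi_u^i), eps(phi_u^j))
+                                     //   - p_s (div phi_u^i, phi_p^j)
+                                     //   - p_s (phi_p^i, div phi_u^j),
+                                     // where the pressure scaling p_s
+                                     // appears in both coupling terms.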
+        if (rebuild_stokes_matrix == true)
+          for (unsigned int i=0; i<dofs_per_cell; ++i)
+            for (unsigned int j=0; j<dofs_per_cell; ++j)
+              data.local_matrix(i,j) += (EquationData::eta * 2 *
+                                         (scratch.grads_phi_u[i] * scratch.grads_phi_u[j])
+                                         - (EquationData::pressure_scaling *
+                                            scratch.div_phi_u[i] * scratch.phi_p[j])
+                                         - (EquationData::pressure_scaling *
+                                            scratch.phi_p[i] * scratch.div_phi_u[j]))
+                                        * scratch.stokes_fe_values.JxW(q);
+
+        const Tensor<1,dim>
+          gravity = EquationData::gravity_vector (scratch.stokes_fe_values
+                                                  .quadrature_point(q));
+
+        for (unsigned int i=0; i<dofs_per_cell; ++i)
+        {
+          data.local_rhs(i) += (EquationData::density(old_temperature) *
+                                gravity  *
+                                scratch.phi_u[i]) *
+                               scratch.stokes_fe_values.JxW(q);
+        }
+      }
+
+    cell->get_dof_indices (data.local_dof_indices);
+  }
+
+
+
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  copy_local_to_global_stokes_system (const Assembly::CopyData::StokesSystem<dim> &data)
+  {
+    if (rebuild_stokes_matrix == true)
+      stokes_constraints.distribute_local_to_global (data.local_matrix,
+                                                     data.local_rhs,
+                                                     data.local_dof_indices,
+                                                     stokes_matrix,
+                                                     stokes_rhs);
+    else
+      stokes_constraints.distribute_local_to_global (data.local_rhs,
+                                                     data.local_dof_indices,
+                                                     stokes_rhs);
+  }
+
+
+
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::assemble_stokes_system ()
+  {
+    computing_timer.enter_section ("   Assemble Stokes system");
+
+    if (rebuild_stokes_matrix == true)
+      stokes_matrix=0;
+
+    stokes_rhs=0;
+
+    const QGauss<dim> quadrature_formula(parameters.stokes_velocity_degree+1);
+
+    typedef
+      FilteredIterator<typename DoFHandler<dim>::active_cell_iterator>
+      CellFilter;
+
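+                                     // Note that the scratch object below
+                                     // requests gradients from the FEValues
+                                     // object only if we actually rebuild
+                                     // the matrix; when only the right hand
+                                     // side is assembled, values and
+                                     // quadrature points suffice.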
+    WorkStream::
+      run (CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       stokes_dof_handler.begin_active()),
+           CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       stokes_dof_handler.end()),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            local_assemble_stokes_system,
+                            this,
+                            std_cxx1x::_1,
+                            std_cxx1x::_2,
+                            std_cxx1x::_3),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            copy_local_to_global_stokes_system,
+                            this,
+                            std_cxx1x::_1),
+           Assembly::Scratch::
+           StokesSystem<dim> (stokes_fe, mapping, quadrature_formula,
+                              (update_values    |
+                               update_quadrature_points  |
+                               update_JxW_values |
+                               (rebuild_stokes_matrix == true
+                                ?
+                                update_gradients
+                                :
+                                UpdateFlags(0))),
+                              temperature_fe,
+                              update_values),
+           Assembly::CopyData::
+           StokesSystem<dim> (stokes_fe));
+
+    stokes_matrix.compress(dealii::VectorOperation::add);
+    stokes_rhs.compress(dealii::VectorOperation::add);
+
+    rebuild_stokes_matrix = false;
+
+    pcout << std::endl;
+    computing_timer.exit_section();
+  }
+
+
+                                   // @sect5{Temperature matrix assembly}
+
+                                   // The task to be performed by the next
+                                   // three functions is to calculate a mass
+                                   // matrix and a Laplace matrix on the
+                                   // temperature system. These will be
+                                   // combined in order to yield the
+                                   // semi-implicit time stepping matrix that
+                                   // consists of the mass matrix plus a time
+                                   // step-dependent weight factor times the
+                                   // Laplace matrix. This function is again
+                                   // essentially the body of the loop over
+                                   // all cells from step-31.
+                                   //
+                                   // The two following functions perform
+                                   // similar services as the ones above.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::
+  local_assemble_temperature_matrix (const typename DoFHandler<dim>::active_cell_iterator &cell,
+                                     Assembly::Scratch::TemperatureMatrix<dim> &scratch,
+                                     Assembly::CopyData::TemperatureMatrix<dim> &data)
+  {
+    const unsigned int dofs_per_cell = scratch.temperature_fe_values.get_fe().dofs_per_cell;
+    const unsigned int n_q_points    = scratch.temperature_fe_values.n_quadrature_points;
+
+    scratch.temperature_fe_values.reinit (cell);
+    cell->get_dof_indices (data.local_dof_indices);
+
+    data.local_mass_matrix = 0;
+    data.local_stiffness_matrix = 0;
+
+    for (unsigned int q=0; q<n_q_points; ++q)
+      {
+        for (unsigned int k=0; k<dofs_per_cell; ++k)
+          {
+            scratch.grad_phi_T[k] = scratch.temperature_fe_values.shape_grad (k,q);
+            scratch.phi_T[k]      = scratch.temperature_fe_values.shape_value (k, q);
+          }
+
+        for (unsigned int i=0; i<dofs_per_cell; ++i)
+          for (unsigned int j=0; j<dofs_per_cell; ++j)
+            {
+              data.local_mass_matrix(i,j)
+                += (scratch.phi_T[i] * scratch.phi_T[j]
+                    *
+                    scratch.temperature_fe_values.JxW(q));
+              data.local_stiffness_matrix(i,j)
+                += (EquationData::kappa * scratch.grad_phi_T[i] * scratch.grad_phi_T[j]
+                    *
+                    scratch.temperature_fe_values.JxW(q));
+            }
+      }
+  }
+
+
+
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  copy_local_to_global_temperature_matrix (const Assembly::CopyData::TemperatureMatrix<dim> &data)
+  {
+    temperature_constraints.distribute_local_to_global (data.local_mass_matrix,
+                                                        data.local_dof_indices,
+                                                        temperature_mass_matrix);
+    temperature_constraints.distribute_local_to_global (data.local_stiffness_matrix,
+                                                        data.local_dof_indices,
+                                                        temperature_stiffness_matrix);
+  }
+
+
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::assemble_temperature_matrix ()
+  {
+    if (rebuild_temperature_matrices == false)
+      return;
+
+    computing_timer.enter_section ("   Assemble temperature matrices");
+    temperature_mass_matrix = 0;
+    temperature_stiffness_matrix = 0;
+
+    const QGauss<dim> quadrature_formula(parameters.temperature_degree+2);
+
+    typedef
+      FilteredIterator<typename DoFHandler<dim>::active_cell_iterator>
+      CellFilter;
+
+    WorkStream::
+      run (CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       temperature_dof_handler.begin_active()),
+           CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       temperature_dof_handler.end()),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            local_assemble_temperature_matrix,
+                            this,
+                            std_cxx1x::_1,
+                            std_cxx1x::_2,
+                            std_cxx1x::_3),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            copy_local_to_global_temperature_matrix,
+                            this,
+                            std_cxx1x::_1),
+           Assembly::Scratch::
+           TemperatureMatrix<dim> (temperature_fe, mapping, quadrature_formula),
+           Assembly::CopyData::
+           TemperatureMatrix<dim> (temperature_fe));
+
+    temperature_mass_matrix.compress();
+    temperature_stiffness_matrix.compress();
+
+    rebuild_temperature_matrices = false;
+    rebuild_temperature_preconditioner = true;
+
+    computing_timer.exit_section();
+  }
+
+
+                                   // @sect5{Temperature right hand side assembly}
+
+                                   // This is the last assembly function. It
+                                   // calculates the right hand side of the
+                                   // temperature system, which includes the
+                                   // convection and the stabilization
+                                   // terms. It includes a lot of evaluations
+                                   // of old solutions at the quadrature
+                                   // points (which are necessary for
+                                   // calculating the artificial viscosity of
+                                   // stabilization), but is otherwise similar
+                                   // to the other assembly functions. Notice,
+                                   // once again, how we resolve the dilemma
+                                   // of having inhomogeneous boundary
+                                   // conditions, by just making a right hand
+                                   // side at this point (compare the comments
+                                   // for the <code>project()</code> function
+                                   // above): We create some matrix columns
+                                   // with exactly the values that would be
+                                   // entered for the temperature stiffness
+                                   // matrix, in case we have inhomogeneously
+                                   // constrained dofs. That will account for
+                                   // the correct balance of the right hand
+                                   // side vector with the matrix system of
+                                   // temperature.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::
+  local_assemble_temperature_rhs (const std::pair<double,double> global_T_range,
+                                  const double                   global_max_velocity,
+                                  const double                   global_entropy_variation,
+                                  const typename DoFHandler<dim>::active_cell_iterator &cell,
+                                  Assembly::Scratch::TemperatureRHS<dim> &scratch,
+                                  Assembly::CopyData::TemperatureRHS<dim> &data)
+  {
+    const bool use_bdf2_scheme = (timestep_number != 0);
+
+    const unsigned int dofs_per_cell = scratch.temperature_fe_values.get_fe().dofs_per_cell;
+    const unsigned int n_q_points    = scratch.temperature_fe_values.n_quadrature_points;
+
+    const FEValuesExtractors::Vector velocities (0);
+
+    data.local_rhs = 0;
+    data.matrix_for_bc = 0;
+    cell->get_dof_indices (data.local_dof_indices);
+
+    scratch.temperature_fe_values.reinit (cell);
+
+    typename DoFHandler<dim>::active_cell_iterator
+      stokes_cell (&triangulation,
+                   cell->level(),
+                   cell->index(),
+                   &stokes_dof_handler);
+    scratch.stokes_fe_values.reinit (stokes_cell);
+
+    scratch.temperature_fe_values.get_function_values (old_temperature_solution,
+                                                       scratch.old_temperature_values);
+    scratch.temperature_fe_values.get_function_values (old_old_temperature_solution,
+                                                       scratch.old_old_temperature_values);
+
+    scratch.temperature_fe_values.get_function_gradients (old_temperature_solution,
+                                                          scratch.old_temperature_grads);
+    scratch.temperature_fe_values.get_function_gradients (old_old_temperature_solution,
+                                                          scratch.old_old_temperature_grads);
+
+    scratch.temperature_fe_values.get_function_laplacians (old_temperature_solution,
+                                                           scratch.old_temperature_laplacians);
+    scratch.temperature_fe_values.get_function_laplacians (old_old_temperature_solution,
+                                                           scratch.old_old_temperature_laplacians);
+
+    scratch.stokes_fe_values[velocities].get_function_values (stokes_solution,
+                                                              scratch.old_velocity_values);
+    scratch.stokes_fe_values[velocities].get_function_values (old_stokes_solution,
+                                                              scratch.old_old_velocity_values);
+    scratch.stokes_fe_values[velocities].get_function_symmetric_gradients (stokes_solution,
+                                                                           scratch.old_strain_rates);
+    scratch.stokes_fe_values[velocities].get_function_symmetric_gradients (old_stokes_solution,
+                                                                           scratch.old_old_strain_rates);
+
+    const double nu
+      = compute_viscosity (scratch.old_temperature_values,
+                           scratch.old_old_temperature_values,
+                           scratch.old_temperature_grads,
+                           scratch.old_old_temperature_grads,
+                           scratch.old_temperature_laplacians,
+                           scratch.old_old_temperature_laplacians,
+                           scratch.old_velocity_values,
+                           scratch.old_old_velocity_values,
+                           scratch.old_strain_rates,
+                           scratch.old_old_strain_rates,
+                           global_max_velocity,
+                           global_T_range.second - global_T_range.first,
+                           0.5 * (global_T_range.second + global_T_range.first),
+                           global_entropy_variation,
+                           cell->diameter());
+
+    for (unsigned int q=0; q<n_q_points; ++q)
+      {
+        for (unsigned int k=0; k<dofs_per_cell; ++k)
+          {
+            scratch.phi_T[k]      = scratch.temperature_fe_values.shape_value (k, q);
+            scratch.grad_phi_T[k] = scratch.temperature_fe_values.shape_grad (k,q);
+          }
+
+
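+                                     // With BDF-2, old quantities are
+                                     // extrapolated to the current time as
+                                     //   X* = (1 + k/k_old) X^n
+                                     //        - (k/k_old) X^{n-1},
+                                     // with time steps k and k_old, whereas
+                                     // the term that is moved to the right
+                                     // hand side carries the BDF-2 weights
+                                     // (1 + k/k_old) and
+                                     // k^2/(k_old (k + k_old)). In the
+                                     // first time step we simply use the
+                                     // old values.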
+        const double T_term_for_rhs
+          = (use_bdf2_scheme ?
+             (scratch.old_temperature_values[q] *
+              (1 + time_step/old_time_step)
+              -
+              scratch.old_old_temperature_values[q] *
+              (time_step * time_step) /
+              (old_time_step * (time_step + old_time_step)))
+             :
+             scratch.old_temperature_values[q]);
+
+        const double ext_T
+          = (use_bdf2_scheme ?
+             (scratch.old_temperature_values[q] *
+              (1 + time_step/old_time_step)
+              -
+              scratch.old_old_temperature_values[q] *
+              time_step/old_time_step)
+             :
+             scratch.old_temperature_values[q]);
+
+        const Tensor<1,dim> ext_grad_T
+          = (use_bdf2_scheme ?
+             (scratch.old_temperature_grads[q] *
+              (1 + time_step/old_time_step)
+              -
+              scratch.old_old_temperature_grads[q] *
+              time_step/old_time_step)
+             :
+             scratch.old_temperature_grads[q]);
+
+        const Tensor<1,dim> extrapolated_u
+          = (use_bdf2_scheme ?
+             (scratch.old_velocity_values[q] *
+              (1 + time_step/old_time_step)
+              -
+              scratch.old_old_velocity_values[q] *
+              time_step/old_time_step)
+             :
+             scratch.old_velocity_values[q]);
+
+        const SymmetricTensor<2,dim> extrapolated_strain_rate
+          = (use_bdf2_scheme ?
+             (scratch.old_strain_rates[q] *
+              (1 + time_step/old_time_step)
+              -
+              scratch.old_old_strain_rates[q] *
+              time_step/old_time_step)
+             :
+             scratch.old_strain_rates[q]);
+
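+                                     // The forcing term combines radiogenic
+                                     // and shear heating, normalized by the
+                                     // heat capacity:
+                                     //   gamma = (rho(T*) H
+                                     //            + 2 eta eps(u*):eps(u*))
+                                     //           / (rho(T*) c_p).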
+        const double gamma
+          = ((EquationData::radiogenic_heating * EquationData::density(ext_T)
+              +
+              2 * EquationData::eta * extrapolated_strain_rate * extrapolated_strain_rate) /
+             (EquationData::density(ext_T) * EquationData::specific_heat));
+
+        for (unsigned int i=0; i<dofs_per_cell; ++i)
+          {
+            data.local_rhs(i) += (T_term_for_rhs * scratch.phi_T[i]
+                                  -
+                                  time_step *
+                                  extrapolated_u * ext_grad_T * scratch.phi_T[i]
+                                  -
+                                  time_step *
+                                  nu * ext_grad_T * scratch.grad_phi_T[i]
+                                  +
+                                  time_step *
+                                  gamma * scratch.phi_T[i])
+                                 *
+                                 scratch.temperature_fe_values.JxW(q);
+
+            if (temperature_constraints.is_inhomogeneously_constrained(data.local_dof_indices[i]))
+              {
+                for (unsigned int j=0; j<dofs_per_cell; ++j)
+                  data.matrix_for_bc(j,i) += (scratch.phi_T[i] * scratch.phi_T[j] *
+                                              (use_bdf2_scheme ?
+                                               ((2*time_step + old_time_step) /
+                                                (time_step + old_time_step)) : 1.)
+                                              +
+                                              scratch.grad_phi_T[i] *
+                                              scratch.grad_phi_T[j] *
+                                              EquationData::kappa *
+                                              time_step)
+                                             *
+                                             scratch.temperature_fe_values.JxW(q);
+              }
+          }
+      }
+  }
+
+
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::
+  copy_local_to_global_temperature_rhs (const Assembly::CopyData::TemperatureRHS<dim> &data)
+  {
+    temperature_constraints.distribute_local_to_global (data.local_rhs,
+                                                        data.local_dof_indices,
+                                                        temperature_rhs,
+                                                        data.matrix_for_bc);
+  }
+
+
+
+                                   // In the function that runs the WorkStream
+                                   // for actually calculating the right hand
+                                   // side, we also generate the final
+                                   // matrix. As mentioned above, it is a sum
+                                   // of the mass matrix and the Laplace
+                                   // matrix, times some time step-dependent
+                                   // weight. This weight is specified by the
+                                   // BDF-2 time integration scheme, see the
+                                   // introduction in step-31. What is new in
+                                   // this tutorial program (in addition to
+                                   // the use of MPI parallelization and the
+                                   // WorkStream class), is that we now
+                                   // precompute the temperature
+                                   // preconditioner as well. The reason is
+                                   // that the setup of the Jacobi
+                                   // preconditioner takes a noticeable time
+                                   // compared to the solver because we
+                                   // usually only need between 10 and 20
+                                   // iterations for solving the temperature
+                                   // system (this might sound strange, as
+                                   // Jacobi really only consists of a
+                                   // diagonal, but the wrapper classes are
+                                   // derived from a more general framework
+                                   // for point relaxation preconditioners,
+                                   // which adds a bit of overhead). Hence,
+                                   // it is more
+                                   // efficient to precompute the
+                                   // preconditioner, even though the matrix
+                                   // entries may slightly change because the
+                                   // time step might change. This is not too
+                                   // big a problem because we remesh every
+                                   // few time steps (and regenerate the
+                                   // preconditioner then).
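+                                   // Concretely, with time steps k =
+                                   // time_step and k_old = old_time_step,
+                                   // the matrix we build is
+                                   //   ((2k + k_old)/(k + k_old)) M_T + k K_T
+                                   // in the BDF-2 case, and M_T + k K_T in
+                                   // the very first time step, where M_T
+                                   // and K_T denote the temperature mass
+                                   // and stiffness matrices.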
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::assemble_temperature_system (const double maximal_velocity)
+  {
+    const bool use_bdf2_scheme = (timestep_number != 0);
+
+    if (use_bdf2_scheme == true)
+      {
+        temperature_matrix.copy_from (temperature_mass_matrix);
+        temperature_matrix *= (2*time_step + old_time_step) /
+                              (time_step + old_time_step);
+        temperature_matrix.add (time_step, temperature_stiffness_matrix);
+      }
+    else
+      {
+        temperature_matrix.copy_from (temperature_mass_matrix);
+        temperature_matrix.add (time_step, temperature_stiffness_matrix);
+      }
+    temperature_matrix.compress();
+
+    if (rebuild_temperature_preconditioner == true)
+      {
+        T_preconditioner.reset (new PETScWrappers::PreconditionJacobi());
+        T_preconditioner->initialize (temperature_matrix);
+        rebuild_temperature_preconditioner = false;
+      }
+
+                                     // The next part is computing the right
+                                     // hand side vectors.  To do so, we first
+                                     // compute the average temperature $T_m$
+                                     // that we use for evaluating the
+                                     // artificial viscosity stabilization
+                                     // through the residual $E(T) =
+                                     // (T-T_m)^2$. We do this by defining the
+                                     // midpoint between maximum and minimum
+                                     // temperature as average temperature in
+                                     // the definition of the entropy
+                                     // viscosity. An alternative would be to
+                                     // use the integral average, but the
+                                     // results are not very sensitive to this
+                                     // choice. The rest then only requires
+                                     // calling WorkStream::run again, binding
+                                     // the arguments to the
+                                     // <code>local_assemble_temperature_rhs</code>
+                                     // function that are the same in every
+                                     // call to the correct values:
+    temperature_rhs = 0;
+
+    const QGauss<dim> quadrature_formula(parameters.temperature_degree+2);
+    const std::pair<double,double>
+      global_T_range = get_extrapolated_temperature_range();
+
+    const double average_temperature = 0.5 * (global_T_range.first +
+                                              global_T_range.second);
+    const double global_entropy_variation =
+      get_entropy_variation (average_temperature);
+
+    typedef
+      FilteredIterator<typename DoFHandler<dim>::active_cell_iterator>
+      CellFilter;
+
+    WorkStream::
+      run (CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       temperature_dof_handler.begin_active()),
+           CellFilter (IteratorFilters::LocallyOwnedCell(),
+                       temperature_dof_handler.end()),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            local_assemble_temperature_rhs,
+                            this,
+                            global_T_range,
+                            maximal_velocity,
+                            global_entropy_variation,
+                            std_cxx1x::_1,
+                            std_cxx1x::_2,
+                            std_cxx1x::_3),
+           std_cxx1x::bind (&BoussinesqFlowProblem<dim>::
+                            copy_local_to_global_temperature_rhs,
+                            this,
+                            std_cxx1x::_1),
+           Assembly::Scratch::
+           TemperatureRHS<dim> (temperature_fe, stokes_fe, mapping,
+                                quadrature_formula),
+           Assembly::CopyData::
+           TemperatureRHS<dim> (temperature_fe));
+
+    temperature_rhs.compress(dealii::VectorOperation::add);
+  }
+
+
+
+
+                                   // @sect4{BoussinesqFlowProblem::solve}
+
+                                   // This function solves the linear systems
+                                   // in each time step of the Boussinesq
+                                   // problem. First, we
+                                   // work on the Stokes system and then on
+                                   // the temperature system. In essence, it
+                                   // does the same things as the respective
+                                   // function in step-31. However, there are a few
+                                   // changes here.
+                                   //
+                                   // The first change is related to the way
+                                   // we store our solution: we keep the
+                                   // vectors with locally owned degrees of
+                                   // freedom plus ghost nodes on each MPI
+                                   // node. When we enter a solver which is
+                                   // supposed to perform matrix-vector
+                                   // products with a distributed matrix, this
+                                   // is not the appropriate form,
+                                   // though. There, we will want to have the
+                                   // solution vector to be distributed in the
+                                   // same way as the matrix, i.e. without any
+                                   // ghosts. So what we do first is to
+                                   // generate a distributed vector called
+                                   // <code>distributed_stokes_solution</code>
+                                   // and put only the locally owned dofs into
+                                   // that, which is neatly done by the
+                                   // <code>operator=</code> of the PETSc
+                                   // wrapper vector classes.
+                                   //
+                                   // Next, we scale the pressure solution (or
+                                   // rather, the initial guess) for the
+                                   // solver so that it matches with the
+                                   // length scales in the matrices, as
+                                   // discussed in the introduction. We also
+                                   // immediately scale the pressure solution
+                                   // back to the correct units after the
+                                   // solution is completed.  We also need to
+                                   // set the pressure values at hanging nodes
+                                   // to zero. This we also did in step-31 in
+                                   // order not to disturb the Schur
+                                   // complement by some vector entries that
+                                   // actually are irrelevant during the solve
+                                   // stage. As a difference to step-31, here
+                                   // we do it only for the locally owned
+                                   // pressure dofs. After solving for the
+                                   // Stokes solution, each processor copies
+                                   // the distributed solution back into the
+                                   // solution vector that also includes ghost
+                                   // elements.
+                                   //
+                                   // The third and most obvious change is
+                                   // that we have two variants for the Stokes
+                                   // solver: A fast solver that sometimes
+                                   // breaks down, and a robust solver that is
+                                   // slower. This is what we already
+                                   // discussed in the introduction. Here is
+                                   // how we realize it: First, we perform up
+                                   // to 300 iterations with the fast solver,
+                                   // using the simple preconditioner with an
+                                   // AMG V-cycle instead of an approximate
+                                   // solve (this is indicated by the
+                                   // <code>false</code> argument to the
+                                   // <code>LinearSolvers::BlockSchurPreconditioner</code>
+                                   // object). If we converge, everything is
+                                   // fine. If we do not converge, the solver
+                                   // control object will throw an exception
+                                   // SolverControl::NoConvergence. Usually,
+                                   // this would abort the program because we
+                                   // don't catch them in our usual
+                                   // <code>solve()</code> functions. This is
+                                   // certainly not what we want to happen
+                                   // here. Rather, we want to switch to the
+                                   // strong solver and continue the solution
+                                   // process with whatever vector we got so
+                                   // far. Hence, we catch the exception with
+                                   // the C++ try/catch mechanism. We then
+                                   // simply go through the same solver
+                                   // sequence again in the <code>catch</code>
+                                   // clause, this time passing the @p true
+                                   // flag to the preconditioner for the
+                                   // strong solver, signaling an approximate
+                                   // CG solve.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::solve ()
+  {
+    computing_timer.enter_section ("   Solve Stokes system");
+
+    {
+      pcout << "   Solving Stokes system... " << std::flush;
+
+      PETScWrappers::MPI::BlockVector
+        distributed_stokes_solution (stokes_rhs);
+      distributed_stokes_solution = stokes_solution;
+
+      distributed_stokes_solution.block(1) /= EquationData::pressure_scaling;
+
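+                                     // Within the block vector, the global
+                                     // indices of the pressure dofs start
+                                     // right after the velocity block, so
+                                     // we offset the locally owned range of
+                                     // block(1) by the size of block(0) to
+                                     // visit exactly the locally owned
+                                     // pressure entries.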
+      const unsigned int
+        start = (distributed_stokes_solution.block(0).size() +
+                 distributed_stokes_solution.block(1).local_range().first),
+        end   = (distributed_stokes_solution.block(0).size() +
+                 distributed_stokes_solution.block(1).local_range().second);
+      for (unsigned int i=start; i<end; ++i)
+        if (stokes_constraints.is_constrained (i))
+          distributed_stokes_solution(i) = 0;
+
+
+      PrimitiveVectorMemory<PETScWrappers::MPI::BlockVector> mem;
+
+      unsigned int n_iterations = 0;
+      const double solver_tolerance = 1e-8 * stokes_rhs.l2_norm();
+      SolverControl solver_control (300, solver_tolerance);
+
+      try
+        {
+          const LinearSolvers::BlockSchurPreconditioner<PETScWrappers::PreconditionBoomerAMG,
+                                                        PETScWrappers::PreconditionJacobi>
+            preconditioner (stokes_matrix, stokes_preconditioner_matrix,
+                            *Mp_preconditioner, *Amg_preconditioner,
+                            false);
+
+          SolverGMRES<PETScWrappers::MPI::BlockVector>
+            solver (solver_control, mem,
+                    SolverGMRES<PETScWrappers::MPI::BlockVector>::
+                    AdditionalData(300, true));
+          solver.solve(stokes_matrix, distributed_stokes_solution, stokes_rhs,
+                       preconditioner);
+
+          n_iterations = solver_control.last_step();
+        }
+
+      catch (const SolverControl::NoConvergence &)
+        {
+          const LinearSolvers::BlockSchurPreconditioner<PETScWrappers::PreconditionBoomerAMG,
+                                                        PETScWrappers::PreconditionJacobi>
+            preconditioner (stokes_matrix, stokes_preconditioner_matrix,
+                            *Mp_preconditioner, *Amg_preconditioner,
+                            true);
+
+          SolverControl solver_control_refined (stokes_matrix.m(), solver_tolerance);
+          SolverGMRES<PETScWrappers::MPI::BlockVector>
+            solver (solver_control_refined, mem,
+                    SolverGMRES<PETScWrappers::MPI::BlockVector>::
+                    AdditionalData(500, true));
+          solver.solve(stokes_matrix, distributed_stokes_solution, stokes_rhs,
+                       preconditioner);
+
+          n_iterations = (solver_control.last_step() +
+                          solver_control_refined.last_step());
+        }
+
+
+      stokes_constraints.distribute (distributed_stokes_solution);
+
+      distributed_stokes_solution.block(1) *= EquationData::pressure_scaling;
+
+      stokes_solution = distributed_stokes_solution;
+      pcout << n_iterations << " iterations."
+            << std::endl;
+    }
+    computing_timer.exit_section();
+
+
+                                     // Now let's turn to the temperature
+                                     // part: First, we compute the time step
+                                     // size. We found that we need smaller
+                                     // time steps for 3D than for 2D for the
+                                     // shell geometry. This is because the
+                                     // cells are more distorted in that case
+                                     // (it is the smallest edge length that
+                                     // determines the CFL number). Instead of
+                                     // computing the time step from maximum
+                                     // velocity and minimal mesh size as in
+                                     // step-31, we compute local CFL numbers,
+                                     // i.e., on each cell we compute the
+                                     // maximum velocity times the mesh size,
+                                     // and compute the maximum of
+                                     // them. Hence, we need to choose the
+                                     // factor in front of the time step
+                                     // slightly smaller.
+                                     //
+                                     // After temperature right hand side
+                                     // assembly, we solve the linear system
+                                     // for temperature (with fully
+                                     // distributed vectors without any
+                                     // ghosts), apply constraints and copy
+                                     // the vector back to one with ghosts.
+                                     //
+                                     // In the end, we extract the temperature
+                                     // range similarly to step-31 to produce
+                                     // some output (for example in order to
+                                     // help us choose the stabilization
+                                     // constants, as discussed in the
+                                     // introduction). The only difference is
+                                     // that we need to exchange maxima over
+                                     // all processors.
+    computing_timer.enter_section ("   Assemble temperature rhs");
+    {
+      old_time_step = time_step;
+
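+                                       // The new time step follows the
+                                       // local CFL criterion discussed
+                                       // above,
+                                       //   dt = c / (2.1 dim^{3/2} p_T cfl),
+                                       // with the temperature polynomial
+                                       // degree p_T, the maximal cell-wise
+                                       // CFL quantity returned by
+                                       // get_cfl_number(), and a scaling c
+                                       // that we shrink to 0.25 in 3d.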
+      const double scaling = (dim==3 ? 0.25 : 1.0);
+      time_step = (scaling/(2.1*dim*std::sqrt(1.*dim)) /
+                   (parameters.temperature_degree *
+                    get_cfl_number()));
+
+      const double maximal_velocity = get_maximal_velocity();
+      pcout << "   Maximal velocity: "
+            << maximal_velocity * EquationData::year_in_seconds * 100
+            << " cm/year"
+            << std::endl;
+      pcout << "   " << "Time step: "
+            << time_step/EquationData::year_in_seconds
+            << " years"
+            << std::endl;
+
+      temperature_solution = old_temperature_solution;
+      assemble_temperature_system (maximal_velocity);
+    }
+    computing_timer.exit_section ();
+
+    computing_timer.enter_section ("   Solve temperature system");
+    {
+      SolverControl solver_control (temperature_matrix.m(),
+                                    1e-12*temperature_rhs.l2_norm());
+      PETScWrappers::SolverGMRES solver (solver_control, MPI_COMM_WORLD);
+
+      PETScWrappers::MPI::Vector
+        distributed_temperature_solution (temperature_rhs);
+      distributed_temperature_solution = temperature_solution;
+
+      solver.solve (temperature_matrix, distributed_temperature_solution,
+                    temperature_rhs, *T_preconditioner);
+
+      temperature_constraints.distribute (distributed_temperature_solution);
+      temperature_solution = distributed_temperature_solution;
+
+      pcout << "   "
+            << solver_control.last_step()
+            << " GMRES iterations for temperature" << std::endl;
+      computing_timer.exit_section();
+
+      double temperature[2] = { std::numeric_limits<double>::max(),
+                                -std::numeric_limits<double>::max() };
+      double global_temperature[2];
+
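+                                     // To reduce both extrema with a single
+                                     // MPI max operation we use that
+                                     // min(T) = -max(-T): negate the local
+                                     // minimum, take the maximum over all
+                                     // processors, and negate the result
+                                     // again.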
+      temperature[0] = distributed_temperature_solution.min();
+      temperature[1] = distributed_temperature_solution.max();
+
+      temperature[0] *= -1.0;
+      Utilities::MPI::max (temperature, MPI_COMM_WORLD, global_temperature);
+      global_temperature[0] *= -1.0;
+
+      pcout << "   Temperature range: "
+            << global_temperature[0] << ' ' << global_temperature[1]
+            << std::endl;
+    }
+  }
+
+
+                                   // @sect4{BoussinesqFlowProblem::output_results}
+
+                                   // Next comes the function that generates
+                                   // the output. The quantities to output
+                                   // could be introduced manually like we did
+                                   // in step-31. An alternative is to hand
+                                   // this task over to a class PostProcessor
+                                   // that inherits from the class
+                                   // DataPostprocessor, which can be attached
+                                   // to DataOut. This allows us to output
+                                   // derived quantities from the solution,
+                                   // like the friction heating included in
+                                   // this example. It overloads the virtual
+                                   // function
+                                   // DataPostprocessor::compute_derived_quantities_vector,
+                                   // which is then internally called from
+                                   // DataOut::build_patches. We have to give
+                                   // it values of the numerical solution, its
+                                   // derivatives, normals to the cell, the
+                                   // actual evaluation points and any
+                                   // additional quantities. This follows the
+                                   // same procedure as discussed in step-29
+                                   // and other programs.
+  template <int dim>
+  class BoussinesqFlowProblem<dim>::Postprocessor : public DataPostprocessor<dim>
+  {
+    public:
+      Postprocessor (const unsigned int partition,
+                     const double       minimal_pressure);
+
+      virtual
+      void
+      compute_derived_quantities_vector (const std::vector<Vector<double> >              &uh,
+                                         const std::vector<std::vector<Tensor<1,dim> > > &duh,
+                                         const std::vector<std::vector<Tensor<2,dim> > > &dduh,
+                                         const std::vector<Point<dim> >                  &normals,
+                                         const std::vector<Point<dim> >                  &evaluation_points,
+                                         std::vector<Vector<double> >                    &computed_quantities) const;
+
+      virtual std::vector<std::string> get_names () const;
+
+      virtual
+      std::vector<DataComponentInterpretation::DataComponentInterpretation>
+      get_data_component_interpretation () const;
+
+      virtual UpdateFlags get_needed_update_flags () const;
+
+    private:
+      const unsigned int partition;
+      const double       minimal_pressure;
+  };
+
+
+  template <int dim>
+  BoussinesqFlowProblem<dim>::Postprocessor::
+  Postprocessor (const unsigned int partition,
+                 const double       minimal_pressure)
+                  :
+                  partition (partition),
+                  minimal_pressure (minimal_pressure)
+  {}
+
+
+                                   // Here we define the names for the
+                                   // variables we want to output. These are
+                                   // the actual solution values for velocity,
+                                   // pressure, and temperature, as well as
+                                   // the friction heating and, for each
+                                   // cell, the number of the processor
+                                   // that owns it. This allows us to
+                                   // visualize the partitioning of the
+                                   // domain among the
+                                   // processors. Except for the velocity,
+                                   // which is vector-valued, all other
+                                   // quantities are scalar.
+  template <int dim>
+  std::vector<std::string>
+  BoussinesqFlowProblem<dim>::Postprocessor::get_names() const
+  {
+    std::vector<std::string> solution_names (dim, "velocity");
+    solution_names.push_back ("p");
+    solution_names.push_back ("T");
+    solution_names.push_back ("friction_heating");
+    solution_names.push_back ("partition");
+
+    return solution_names;
+  }
+
+
+  template <int dim>
+  std::vector<DataComponentInterpretation::DataComponentInterpretation>
+  BoussinesqFlowProblem<dim>::Postprocessor::
+  get_data_component_interpretation () const
+  {
+    std::vector<DataComponentInterpretation::DataComponentInterpretation>
+      interpretation (dim,
+                      DataComponentInterpretation::component_is_part_of_vector);
+
+    interpretation.push_back (DataComponentInterpretation::component_is_scalar);
+    interpretation.push_back (DataComponentInterpretation::component_is_scalar);
+    interpretation.push_back (DataComponentInterpretation::component_is_scalar);
+    interpretation.push_back (DataComponentInterpretation::component_is_scalar);
+
+    return interpretation;
+  }
+
+
+  template <int dim>
+  UpdateFlags
+  BoussinesqFlowProblem<dim>::Postprocessor::get_needed_update_flags() const
+  {
+    return update_values | update_gradients | update_q_points;
+  }
+
+
+                                   // Now we implement the function that
+                                   // computes the derived quantities. As we
+                                   // also did for the output, we rescale the
+                                   // velocity from its SI units to something
+                                   // more readable, namely cm/year. Next, the
+                                   // pressure is scaled to be between 0 and
+                                   // the maximum pressure. This makes it more
+                                   // easily comparable -- in essence making
+                                   // all pressure variables positive or
+                                   // zero. Temperature is taken as is, and
+                                   // the friction heating is computed as $2
+                                   // \eta \varepsilon(\mathbf{u}) \cdot
+                                   // \varepsilon(\mathbf{u})$.
+                                   //
+                                   // The quantities we output here are more
+                                   // for illustration than for actual
+                                   // scientific value. We come back to this
+                                   // briefly in the results section of this
+                                   // program and explain what one may in fact
+                                   // be interested in.
+  template <int dim>
+  void
+  BoussinesqFlowProblem<dim>::Postprocessor::
+  compute_derived_quantities_vector (const std::vector<Vector<double> >              &uh,
+                                     const std::vector<std::vector<Tensor<1,dim> > > &duh,
+                                     const std::vector<std::vector<Tensor<2,dim> > > &/*dduh*/,
+                                     const std::vector<Point<dim> >                  &/*normals*/,
+                                     const std::vector<Point<dim> >                  &/*evaluation_points*/,
+                                     std::vector<Vector<double> >                    &computed_quantities) const
+  {
+    const unsigned int n_quadrature_points = uh.size();
+    Assert (duh.size() == n_quadrature_points,                  ExcInternalError());
+    Assert (computed_quantities.size() == n_quadrature_points,  ExcInternalError());
+    Assert (uh[0].size() == dim+2,                              ExcInternalError());
+
+    for (unsigned int q=0; q<n_quadrature_points; ++q)
+      {
+        for (unsigned int d=0; d<dim; ++d)
+          computed_quantities[q](d)
+            = (uh[q](d) *  EquationData::year_in_seconds * 100);
+
+        const double pressure = (uh[q](dim)-minimal_pressure);
+        computed_quantities[q](dim) = pressure;
+
+        const double temperature = uh[q](dim+1);
+        computed_quantities[q](dim+1) = temperature;
+
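+        // Reassemble the velocity gradient from the gradients of the
+        // first dim solution components, symmetrize it to obtain the
+        // strain rate, and form the friction heating 2*eta*eps(u):eps(u).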
+        Tensor<2,dim> grad_u;
+        for (unsigned int d=0; d<dim; ++d)
+          grad_u[d] = duh[q][d];
+        const SymmetricTensor<2,dim> strain_rate = symmetrize (grad_u);
+        computed_quantities[q](dim+2) = 2 * EquationData::eta *
+                                        strain_rate * strain_rate;
+
+        computed_quantities[q](dim+3) = partition;
+      }
+  }
+
+
+                                   // The <code>output_results()</code>
+                                   // function does mostly what the
+                                   // corresponding one did in step-31, in
+                                   // particular merging the data from the two
+                                   // DoFHandler objects (for the Stokes and
+                                   // the temperature parts of the problem)
+                                   // into one. There is one minor change: we
+                                   // make sure that each processor only works
+                                   // on the subdomain it owns locally (and
+                                   // not on ghost or artificial cells) when
+                                   // building the joint solution vector. The
+                                   // same will then have to be done in
+                                   // DataOut::build_patches(), but that
+                                   // function does so automatically.
+                                   //
+                                   // What we end up with is a set of patches
+                                   // that we can write using the functions in
+                                   // DataOutBase in a variety of output
+                                   // formats. Here, we then have to pay
+                                   // attention that what each processor
+                                   // writes is really only its own part of
+                                   // the domain, i.e. we will want to write
+                                   // each processor's contribution into a
+                                   // separate file. This we do by adding an
+                                   // additional number to the filename when
+                                   // we write the solution. This is not
+                                   // really new, we did it similarly in
+                                   // step-40. Note that we write in the
+                                   // compressed format @p .vtu instead of
+                                   // plain vtk files, which saves quite some
+                                   // storage.
+                                   //
+                                   // All the rest of the work is done in the
+                                   // Postprocessor class.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::output_results ()
+  {
+    computing_timer.enter_section ("Postprocessing");
+
+    const FESystem<dim> joint_fe (stokes_fe, 1,
+                                  temperature_fe, 1);
+
+    DoFHandler<dim> joint_dof_handler (triangulation);
+    joint_dof_handler.distribute_dofs (joint_fe);
+    Assert (joint_dof_handler.n_dofs() ==
+            stokes_dof_handler.n_dofs() + temperature_dof_handler.n_dofs(),
+            ExcInternalError());
+
+    PETScWrappers::MPI::Vector joint_solution;
+    joint_solution.reinit(MPI_COMM_WORLD, joint_dof_handler.locally_owned_dofs()); //    joint_solution.reinit (joint_dof_handler.locally_owned_dofs(), MPI_COMM_WORLD);
+
+    {
+      std::vector<unsigned int> local_joint_dof_indices (joint_fe.dofs_per_cell);
+      std::vector<unsigned int> local_stokes_dof_indices (stokes_fe.dofs_per_cell);
+      std::vector<unsigned int> local_temperature_dof_indices (temperature_fe.dofs_per_cell);
+
+      typename DoFHandler<dim>::active_cell_iterator
+        joint_cell       = joint_dof_handler.begin_active(),
+        joint_endc       = joint_dof_handler.end(),
+        stokes_cell      = stokes_dof_handler.begin_active(),
+        temperature_cell = temperature_dof_handler.begin_active();
+      for (; joint_cell!=joint_endc;
+           ++joint_cell, ++stokes_cell, ++temperature_cell)
+        if (joint_cell->is_locally_owned())
+          {
+            joint_cell->get_dof_indices (local_joint_dof_indices);
+            stokes_cell->get_dof_indices (local_stokes_dof_indices);
+            temperature_cell->get_dof_indices (local_temperature_dof_indices);
+
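+            // For every joint DoF, system_to_base_index(i).first.first
+            // identifies the base element it belongs to (0: Stokes,
+            // 1: temperature), while .second is the shape function's
+            // index within that base element.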
+            for (unsigned int i=0; i<joint_fe.dofs_per_cell; ++i)
+              if (joint_fe.system_to_base_index(i).first.first == 0)
+                {
+                  Assert (joint_fe.system_to_base_index(i).second
+                          <
+                          local_stokes_dof_indices.size(),
+                          ExcInternalError());
+
+                  joint_solution(local_joint_dof_indices[i])
+                    = stokes_solution(local_stokes_dof_indices
+                                      [joint_fe.system_to_base_index(i).second]);
+                }
+              else
+                {
+                  Assert (joint_fe.system_to_base_index(i).first.first == 1,
+                          ExcInternalError());
+                  Assert (joint_fe.system_to_base_index(i).second
+                          <
+                          local_temperature_dof_indices.size(),
+                          ExcInternalError());
+                  joint_solution(local_joint_dof_indices[i])
+                    = temperature_solution(local_temperature_dof_indices
+                                           [joint_fe.system_to_base_index(i).second]);
+                }
+          }
+    }
+
+//    joint_solution.print(std::cout);
+    joint_solution.compress();
+
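+    // DataOut needs read access to ghost entries as well, so copy the
+    // fully distributed joint solution into a vector that also stores
+    // the locally relevant (ghost) elements.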
+    IndexSet locally_relevant_joint_dofs(joint_dof_handler.n_dofs());
+    DoFTools::extract_locally_relevant_dofs (joint_dof_handler, locally_relevant_joint_dofs);
+    PETScWrappers::MPI::Vector locally_relevant_joint_solution;
+    locally_relevant_joint_solution.reinit (MPI_COMM_WORLD,joint_dof_handler.locally_owned_dofs(),locally_relevant_joint_dofs); //    locally_relevant_joint_solution.reinit (locally_relevant_joint_dofs, MPI_COMM_WORLD);
+    locally_relevant_joint_solution = joint_solution;
+
+    Postprocessor postprocessor (Utilities::MPI::this_mpi_process(MPI_COMM_WORLD),
+                                 stokes_solution.block(1).min()); //stokes_solution.block(1).minimal_value());
+
+    DataOut<dim> data_out;
+    data_out.attach_dof_handler (joint_dof_handler);
+    data_out.add_data_vector (locally_relevant_joint_solution, postprocessor);
+    data_out.build_patches ();
+
+    static int out_index=0;
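+    // The file name encodes both the output step and the processor,
+    // e.g. "solution-00042.0007.vtu" for output step 42 on processor 7.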
+    const std::string filename = ("solution-" +
+                                  Utilities::int_to_string (out_index, 5) +
+                                  "." +
+                                  Utilities::int_to_string
+                                  (triangulation.locally_owned_subdomain(), 4) +
+                                  ".vtu");
+    std::ofstream output (filename.c_str());
+    data_out.write_vtu (output);
+
+
+                                     // At this point, all processors have
+                                     // written their own files to disk. We
+                                     // could visualize them individually in
+                                     // Visit or Paraview, but in reality we
+                                     // of course want to visualize the whole
+                                     // set of files at once. To this end, we
+                                     // create a master file in each of the
+                                     // formats understood by Visit
+                                     // (<code>.visit</code>) and Paraview
+                                     // (<code>.pvtu</code>) on the zeroth
+                                     // processor that describes how the
+                                     // individual files define the
+                                     // global data set.
+    if (Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) == 0)
+      {
+        std::vector<std::string> filenames;
+        for (unsigned int i=0; i<Utilities::MPI::n_mpi_processes(MPI_COMM_WORLD); ++i)
+          filenames.push_back (std::string("solution-") +
+                               Utilities::int_to_string (out_index, 5) +
+                               "." +
+                               Utilities::int_to_string(i, 4) +
+                               ".vtu");
+        const std::string
+          pvtu_master_filename = ("solution-" +
+                                  Utilities::int_to_string (out_index, 5) +
+                                  ".pvtu");
+        std::ofstream pvtu_master (pvtu_master_filename.c_str());
+        data_out.write_pvtu_record (pvtu_master, filenames);
+
+        const std::string
+          visit_master_filename = ("solution-" +
+                                   Utilities::int_to_string (out_index, 5) +
+                                   ".visit");
+        std::ofstream visit_master (visit_master_filename.c_str());
+        data_out.write_visit_record (visit_master, filenames);
+      }
+
+    computing_timer.exit_section ();
+    out_index++;
+  }
+
+
+
+                                   // @sect4{BoussinesqFlowProblem::refine_mesh}
+
+                                   // This function isn't really new
+                                   // either. Since the
+                                   // <code>setup_dofs</code> function that we
+                                   // call in the middle has its own timer
+                                   // section, we split timing this function
+                                   // into two sections. It will also allow us
+                                   // to easily identify which of the two is
+                                   // more expensive.
+                                   //
+                                   // One thing of note, however, is that we
+                                   // only want to compute error indicators on
+                                   // the locally owned subdomain. In order to
+                                   // achieve this, we pass one additional
+                                   // argument to the
+                                   // KellyErrorEstimator::estimate
+                                   // function. Note that the vector for error
+                                   // estimates is resized to the number of
+                                   // active cells present on the current
+                                   // process, which is less than the total
+                                   // number of active cells on all processors
+                                   // (but more than the number of locally
+                                   // owned active cells); each processor only
+                                   // has a few coarse cells around the
+                                   // locally owned ones, as also explained in
+                                   // step-40.
+                                   //
+                                   // The local error estimates are then
+                                   // handed to a %parallel version of
+                                   // GridRefinement (in namespace
+                                   // parallel::distributed::GridRefinement,
+                                   // see also step-40) which looks at the
+                                   // errors and finds the cells that need
+                                   // refinement by comparing the error values
+                                   // across processors. As in step-31, we
+                                   // want to limit the maximum grid level. So
+                                   // in case some cells have been marked that
+                                   // are already at the finest level, we
+                                   // simply clear the refine flags.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::refine_mesh (const unsigned int max_grid_level)
+  {
+    computing_timer.enter_section ("Refine mesh structure, part 1");
+    Vector<float> estimated_error_per_cell (triangulation.n_active_cells());
+
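+                                     // The three arguments preceding the
+                                     // last merely restate default values
+                                     // (empty component mask, no
+                                     // coefficient, default threading);
+                                     // they are spelled out only so that
+                                     // we can reach the final argument,
+                                     // which restricts the estimate to
+                                     // the locally owned subdomain.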
+    KellyErrorEstimator<dim>::estimate (temperature_dof_handler,
+                                        QGauss<dim-1>(parameters.temperature_degree+1),
+                                        typename FunctionMap<dim>::type(),
+                                        temperature_solution,
+                                        estimated_error_per_cell,
+                                        std::vector<bool>(),
+                                        0,
+                                        0,
+                                        triangulation.locally_owned_subdomain());
+
+    parallel::distributed::GridRefinement::
+      refine_and_coarsen_fixed_fraction (triangulation,
+                                         estimated_error_per_cell,
+                                         0.3, 0.1);
+
+    if (triangulation.n_levels() > max_grid_level)
+      for (typename Triangulation<dim>::active_cell_iterator
+             cell = triangulation.begin_active(max_grid_level);
+           cell != triangulation.end(); ++cell)
+        cell->clear_refine_flag ();
+
+                                     // With all flags marked as necessary, we
+                                     // set up the
+                                     // parallel::distributed::SolutionTransfer
+                                     // object to transfer the solutions for
+                                     // the current time level and the next
+                                     // older one. The syntax is similar to
+                                     // the non-%parallel solution transfer
+                                     // (with the exception that here a
+                                     // pointer to the vector entries is
+                                     // enough). The remainder of the function
+                                     // is concerned with setting up the data
+                                     // structures again after mesh refinement
+                                     // and restoring the solution vectors on
+                                     // the new mesh.
+    std::vector<const PETScWrappers::MPI::Vector *> x_temperature (2);
+    x_temperature[0] = &temperature_solution;
+    x_temperature[1] = &old_temperature_solution;
+    std::vector<const PETScWrappers::MPI::BlockVector *> x_stokes (2);
+    x_stokes[0] = &stokes_solution;
+    x_stokes[1] = &old_stokes_solution;
+
+    parallel::distributed::SolutionTransfer<dim,PETScWrappers::MPI::Vector>
+      temperature_trans(temperature_dof_handler);
+    parallel::distributed::SolutionTransfer<dim,PETScWrappers::MPI::BlockVector>
+      stokes_trans(stokes_dof_handler);
+
+    triangulation.prepare_coarsening_and_refinement();
+    temperature_trans.prepare_for_coarsening_and_refinement(x_temperature);
+    stokes_trans.prepare_for_coarsening_and_refinement(x_stokes);
+
+    triangulation.execute_coarsening_and_refinement ();
+    computing_timer.exit_section();
+
+    setup_dofs ();
+
+    computing_timer.enter_section ("Refine mesh structure, part 2");
+
+    {
+      PETScWrappers::MPI::Vector distributed_temp1 (temperature_rhs);
+      PETScWrappers::MPI::Vector distributed_temp2 (temperature_rhs);
+
+      std::vector<PETScWrappers::MPI::Vector *> tmp (2);
+      tmp[0] = &(distributed_temp1);
+      tmp[1] = &(distributed_temp2);
+      temperature_trans.interpolate(tmp);
+
+      temperature_solution     = distributed_temp1;
+      old_temperature_solution = distributed_temp2;
+    }
+
+    {
+      PETScWrappers::MPI::BlockVector distributed_stokes (stokes_rhs);
+      PETScWrappers::MPI::BlockVector old_distributed_stokes (stokes_rhs);
+
+      std::vector<PETScWrappers::MPI::BlockVector *> stokes_tmp (2);
+      stokes_tmp[0] = &(distributed_stokes);
+      stokes_tmp[1] = &(old_distributed_stokes);
+
+      stokes_trans.interpolate (stokes_tmp);
+      stokes_solution     = distributed_stokes;
+      old_stokes_solution = old_distributed_stokes;
+    }
+
+    computing_timer.exit_section();
+  }
+
+
+
+                                   // @sect4{BoussinesqFlowProblem::run}
+
+                                   // This is the final and controlling
+                                   // function in this class. It, in fact,
+                                   // runs the entire rest of the program and
+                                   // is, once more, very similar to
+                                   // step-31. We use a different mesh now (a
+                                   // GridGenerator::hyper_shell instead of a
+                                   // simple cube geometry), and use the
+                                   // <code>project_temperature_field()</code>
+                                   // function instead of the library function
+                                   // <code>VectorTools::project</code>, the
+                                   // rest is as before.
+  template <int dim>
+  void BoussinesqFlowProblem<dim>::run ()
+  {
+    GridGenerator::hyper_shell (triangulation,
+                                Point<dim>(),
+                                EquationData::R0,
+                                EquationData::R1,
+                                (dim==3) ? 96 : 12,
+                                true);
+    static HyperShellBoundary<dim> boundary;
+    triangulation.set_boundary (0, boundary);
+    triangulation.set_boundary (1, boundary);
+
+    global_Omega_diameter = GridTools::diameter (triangulation);
+
+    triangulation.refine_global (parameters.initial_global_refinement);
+
+    setup_dofs();
+
+    unsigned int pre_refinement_step = 0;
+
+    start_time_iteration:
+
+    project_temperature_field ();
+//    temperature_solution.print(std::cout,3,false,false);
+//    pcout << " temperature_solution linfty_norm: " << temperature_solution.linfty_norm() << std::endl;
+//    return;
+
+    timestep_number           = 0;
+    time_step = old_time_step = 0;
+
+    double time = 0;
+
+    do
+      {
+//    	if(timestep_number==3)
+//    		break;
+
+        pcout << "Timestep " << timestep_number
+              << ":  t=" << time/EquationData::year_in_seconds
+              << " years"
+              << std::endl;
+
+        assemble_stokes_system ();
+//        return;
+
+        build_stokes_preconditioner ();
+        assemble_temperature_matrix ();
+
+//        return;
+
+        solve ();
+
+        pcout << std::endl;
+
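+        // During the very first time step we alternate between solving
+        // and adaptive pre-refinement until the requested number of
+        // pre-refinement cycles has been performed; the goto below then
+        // restarts the time loop on the refined mesh.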
+        if ((timestep_number == 0) &&
+            (pre_refinement_step < parameters.initial_adaptive_refinement))
+          {
+            refine_mesh (parameters.initial_global_refinement +
+                         parameters.initial_adaptive_refinement);
+            ++pre_refinement_step;
+            goto start_time_iteration;
+          }
+        else if ((timestep_number > 0)
+                 &&
+                 (timestep_number % parameters.adaptive_refinement_interval == 0))
+          refine_mesh (parameters.initial_global_refinement +
+                       parameters.initial_adaptive_refinement);
+
+        if ((parameters.generate_graphical_output == true)
+            &&
+            (timestep_number % parameters.graphical_output_interval == 0))
+          output_results ();
+
+                                         // In order to speed up linear
+                                         // solvers, we extrapolate the
+                                         // solutions from the old time levels
+                                         // to the new one. This gives a very
+                                         // good initial guess, cutting the
+                                         // number of iterations needed in
+                                         // solvers by more than one half. We
+                                         // do not need to extrapolate in the
+                                         // last iteration, so if we reached
+                                         // the final time, we stop here.
+                                         //
+                                         // As the last thing during a
+                                         // time step (before actually
+                                         // bumping up the number of
+                                         // the time step), we check
+                                         // whether the current time
+                                         // step number is divisible
+                                         // by 100, and if so we let
+                                         // the computing timer print
+                                         // a summary of CPU times
+                                         // spent so far.
+        if (time > parameters.end_time * EquationData::year_in_seconds)
+          break;
+
+        PETScWrappers::MPI::BlockVector old_old_stokes_solution(old_stokes_solution);
+        old_old_stokes_solution      = old_stokes_solution;
+        old_stokes_solution          = stokes_solution;
+        old_old_temperature_solution = old_temperature_solution;
+        old_temperature_solution     = temperature_solution;
+        if (old_time_step > 0)
+          {
+            //Trilinos sadd does not like ghost vectors even as input. Copy into distributed vectors for now:
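+            // Linear extrapolation in time:
+            //   u* = (1 + k_n/k_{n-1}) u^n - (k_n/k_{n-1}) u^{n-1},
+            // where k_n = time_step and k_{n-1} = old_time_step.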
+            {
+              PETScWrappers::MPI::BlockVector distr_solution (stokes_rhs);
+              distr_solution = stokes_solution;
+              PETScWrappers::MPI::BlockVector distr_old_solution (stokes_rhs);
+              distr_old_solution = old_old_stokes_solution;
+              distr_solution .sadd (1.+time_step/old_time_step, -time_step/old_time_step,
+                  distr_old_solution);
+              stokes_solution = distr_solution;
+            }
+            {
+              PETScWrappers::MPI::Vector distr_solution (temperature_rhs);
+              distr_solution = temperature_solution;
+              PETScWrappers::MPI::Vector distr_old_solution (temperature_rhs);
+              distr_old_solution = old_old_temperature_solution;
+              distr_solution .sadd (1.+time_step/old_time_step, -time_step/old_time_step,
+                  distr_old_solution);
+              temperature_solution = distr_solution;
+            }
+          }
+
+        if ((timestep_number > 0) && (timestep_number % 100 == 0))
+          computing_timer.print_summary ();
+
+        time += time_step;
+        ++timestep_number;
+      }
+    while (true);
+
+                                     // If we are generating graphical
+                                     // output, do so also for the last
+                                     // time step unless we had just
+                                     // done so before we left the
+                                     // do-while loop
+    if ((parameters.generate_graphical_output == true)
+        &&
+        !((timestep_number-1) % parameters.graphical_output_interval == 0))
+      output_results ();
+  }
+}
+
+
+
+                                 // @sect3{The <code>main</code> function}
+
+                                 // The main function is short as usual and
+                                 // very similar to the one in step-31. Since
+                                 // we use a parameter file which is specified
+                                 // as an argument in the command line, we
+                                 // have to read it in here and pass it on to
+                                 // the Parameters class for parsing. If no
+                                 // filename is given in the command line, we
+                                 // simply use the <code>step-32.prm</code>
+                                 // file which is distributed together with
+                                 // the program.
+                                 //
+                                 // Because 3d computations are simply
+                                 // very slow unless you throw a lot
+                                 // of processors at them, the program
+                                 // defaults to 2d. You can get the 3d
+                                 // version by changing the constant
+                                 // dimension below to 3.
+int main (int argc, char *argv[])
+{
+  using namespace Step32;
+  using namespace dealii;
+
+  Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv);
+  PetscInitialize(&argc,&argv,0,0);   //  Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv);
+
+  std::cout << "dealii::Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) = "
+            << dealii::Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) << std::endl;
+  try
+    {
+      deallog.depth_console (0);
+
+      std::string parameter_filename;
+      if (argc>=2)
+        parameter_filename = argv[1];
+      else
+        parameter_filename = "step-32.prm";
+
+      const int dim = 2;
+      BoussinesqFlowProblem<dim>::Parameters  parameters(parameter_filename);
+      BoussinesqFlowProblem<dim> flow_problem (parameters);
+      flow_problem.m_myrank = dealii::Utilities::MPI::this_mpi_process(MPI_COMM_WORLD);
+      flow_problem.run ();
+    }
+  catch (std::exception &exc)
+    {
+      std::cerr << std::endl << std::endl
+                << "----------------------------------------------------"
+                << std::endl;
+      std::cerr << "Exception on processing: " << std::endl
+                << exc.what() << std::endl
+                << "Aborting!" << std::endl
+                << "----------------------------------------------------"
+                << std::endl;
+
+      return 1;
+    }
+  catch (...)
+    {
+      std::cerr << std::endl << std::endl
+                << "----------------------------------------------------"
+                << std::endl;
+      std::cerr << "Unknown exception!" << std::endl
+                << "Aborting!" << std::endl
+                << "----------------------------------------------------"
+                << std::endl;
+      return 1;
+    }
+
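+  // Release the vectors pooled by GrowingVectorMemory before calling
+  // PetscFinalize(), since PETSc objects must not be destroyed after
+  // PETSc has been shut down.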
+  dealii::GrowingVectorMemory<dealii::PETScWrappers::MPI::Vector>::release_unused_memory ();
+  dealii::GrowingVectorMemory<dealii::PETScWrappers::Vector>::release_unused_memory ();
+  PetscFinalize();
+
+//  dealii::TrilinosWrappers::BlockVector vec;
+
+  return 0;
+}

Added: branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_matrix_base.h
===================================================================
--- branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_matrix_base.h	                        (rev 0)
+++ branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_matrix_base.h	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,1870 @@
+//---------------------------------------------------------------------------
+//    $Id: petsc_matrix_base.h 27628 2012-11-20 22:49:26Z heister $
+//
+//    Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 by the deal.II authors
+//
+//    This file is subject to QPL and may not be  distributed
+//    without copyright and license information. Please refer
+//    to the file deal.II/doc/license.html for the  text  and
+//    further information on this license.
+//
+//---------------------------------------------------------------------------
+#ifndef __deal2__petsc_matrix_base_h
+#define __deal2__petsc_matrix_base_h
+
+
+#include <deal.II/base/config.h>
+
+#ifdef DEAL_II_USE_PETSC
+
+#  include <deal.II/base/subscriptor.h>
+#  include <deal.II/lac/full_matrix.h>
+#  include <deal.II/lac/exceptions.h>
+#  include <deal.II/lac/vector.h>
+
+#  include <petscmat.h>
+#  include <deal.II/base/std_cxx1x/shared_ptr.h>
+
+#  include <vector>
+#  include <cmath>
+
+DEAL_II_NAMESPACE_OPEN
+
+template <typename Matrix> class BlockMatrixBase;
+
+
+namespace PETScWrappers
+{
+  // forward declarations
+  class VectorBase;
+  class MatrixBase;
+
+  namespace MatrixIterators
+  {
+    /**
+     * STL conforming iterator. This class acts as an iterator walking over the
+     * elements of PETSc matrices. Since PETSc offers a uniform interface for all
+     * types of matrices, this iterator can be used to access both sparse and full
+     * matrices.
+     *
+     * Note that PETSc does not give any guarantees as to the order of elements
+     * within each row. Note also that accessing the elements of a full matrix
+     * surprisingly only shows the nonzero elements of the matrix, not all
+     * elements.
+     *
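+     * A minimal usage sketch, assuming @p matrix is an assembled and
+     * compressed sequential matrix:
+     * @code
+     * for (MatrixBase::const_iterator p = matrix.begin();
+     *      p != matrix.end(); ++p)
+     *   std::cout << p->row() << ' ' << p->column()
+     *             << ' ' << p->value() << std::endl;
+     * @endcode
+     *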
+     * @ingroup PETScWrappers
+     * @author Guido Kanschat, Roy Stogner, Wolfgang Bangerth, 2004
+     */
+    class const_iterator
+    {
+    private:
+      /**
+       * Accessor class for iterators
+       */
+      class Accessor
+      {
+      public:
+        /**
+         * Constructor. Since we use
+         * accessors only for read
+         * access, a const matrix
+         * pointer is sufficient.
+         */
+        Accessor (const MatrixBase    *matrix,
+                  const unsigned int   row,
+                  const unsigned int   index);
+
+        /**
+         * Row number of the element
+         * represented by this
+         * object.
+         */
+        unsigned int row() const;
+
+        /**
+         * Index in row of the element
+         * represented by this
+         * object.
+         */
+        unsigned int index() const;
+
+        /**
+         * Column number of the
+         * element represented by
+         * this object.
+         */
+        unsigned int column() const;
+
+        /**
+         * Value of this matrix entry.
+         */
+        PetscScalar value() const;
+
+        /**
+         * Exception
+         */
+        DeclException0 (ExcBeyondEndOfMatrix);
+        /**
+         * Exception
+         */
+        DeclException3 (ExcAccessToNonlocalRow,
+                        int, int, int,
+                        << "You tried to access row " << arg1
+                        << " of a distributed matrix, but only rows "
+                        << arg2 << " through " << arg3
+                        << " are stored locally and can be accessed.");
+
+      private:
+        /**
+         * The matrix accessed.
+         */
+        mutable MatrixBase *matrix;
+
+        /**
+         * Current row number.
+         */
+        unsigned int a_row;
+
+        /**
+         * Current index in row.
+         */
+        unsigned int a_index;
+
+        /**
+         * Cache where we store the
+         * column indices of the present
+         * row. This is necessary, since
+         * PETSc makes access to the
+         * elements of its matrices
+         * rather hard, and it is much
+         * more efficient to copy all
+         * column entries of a row once
+         * when we enter it than
+         * repeatedly asking PETSc for
+         * individual ones. This also
+         * makes some sense since it is
+         * likely that we will access
+         * them sequentially anyway.
+         *
+         * In order to make copying of
+         * iterators/accessor of
+         * acceptable performance, we
+         * keep a shared pointer to these
+         * entries so that more than one
+         * accessor can access this data
+         * if necessary.
+         */
+        std_cxx1x::shared_ptr<const std::vector<unsigned int> > colnum_cache;
+
+        /**
+         * Similar cache for the values
+         * of this row.
+         */
+        std_cxx1x::shared_ptr<const std::vector<PetscScalar> > value_cache;
+
+        /**
+         * Discard the old row caches
+         * (they may still be used by
+         * other accessors) and generate
+         * new ones for the row pointed
+         * to presently by this accessor.
+         */
+        void visit_present_row ();
+
+        /**
+         * Make enclosing class a
+         * friend.
+         */
+        friend class const_iterator;
+      };
+
+    public:
+
+      /**
+       * Constructor. Create an iterator
+       * into the matrix @p matrix for the
+       * given row and the index within it.
+       */
+      const_iterator (const MatrixBase   *matrix,
+                      const unsigned int  row,
+                      const unsigned int  index);
+
+      /**
+       * Prefix increment.
+       */
+      const_iterator &operator++ ();
+
+      /**
+       * Postfix increment.
+       */
+      const_iterator operator++ (int);
+
+      /**
+       * Dereferencing operator.
+       */
+      const Accessor &operator* () const;
+
+      /**
+       * Dereferencing operator.
+       */
+      const Accessor *operator-> () const;
+
+      /**
+       * Comparison. True, if
+       * both iterators point to
+       * the same matrix
+       * position.
+       */
+      bool operator == (const const_iterator &) const;
+      /**
+       * Inverse of <tt>==</tt>.
+       */
+      bool operator != (const const_iterator &) const;
+
+      /**
+       * Comparison
+       * operator. Result is true
+       * if either the first row
+       * number is smaller or if
+       * the row numbers are
+       * equal and the first
+       * index is smaller.
+       */
+      bool operator < (const const_iterator &) const;
+
+      /**
+       * Exception
+       */
+      DeclException2 (ExcInvalidIndexWithinRow,
+                      int, int,
+                      << "Attempt to access element " << arg2
+                      << " of row " << arg1
+                      << " which doesn't have that many elements.");
+
+    private:
+      /**
+       * Store an object of the
+       * accessor class.
+       */
+      Accessor accessor;
+    };
+
+  }
+
+
+  /**
+   * Base class for all matrix classes that are implemented on top of the PETSc
+   * matrix types. Since in PETSc all matrix types (i.e. sequential and
+   * parallel, sparse, blocked, etc.)  are built by filling the contents of an
+   * abstract object that is only referenced through a pointer of a type that is
+   * independent of the actual matrix type, we can implement almost all
+   * functionality of matrices in this base class. Derived classes will then only
+   * have to provide the functionality to create one or the other kind of
+   * matrix.
+   *
+   * The interface of this class is modeled after the existing
+   * SparseMatrix class in deal.II. It has almost the same member functions
+   * and the two are often exchangeable. However, since PETSc only supports a
+   * single scalar type (either double, float, or a complex data type), it is
+   * not templated, and only works with whatever your PETSc installation has
+   * defined the data type PetscScalar to.
+   *
+   * Note that PETSc only guarantees that operations do what you expect if the
+   * functions @p MatAssemblyBegin and @p MatAssemblyEnd have been called
+   * after matrix assembly. Therefore, you need to call
+   * SparseMatrix::compress() before you actually use the matrix. This also
+   * calls @p MatCompress, which compresses the storage format for sparse
+   * matrices by discarding unused elements. PETSc allows you to continue
+   * assembling the matrix after calls to these functions, but since no free
+   * entries are available any more after that, it is better to only call
+   * SparseMatrix::compress() once at the end of the assembly stage, before
+   * the matrix is actively used.
+   *
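+   * A minimal assembly sketch, with hypothetical indices @p i, @p j, a
+   * value @p v, and vectors @p dst, @p src; the matrix is assumed to have
+   * been initialized with a suitable sparsity layout:
+   * @code
+   * matrix.add (i, j, v);      // accumulate local contributions
+   * matrix.compress ();        // finalize assembly before use
+   * matrix.vmult (dst, src);   // matrix-vector products are now valid
+   * @endcode
+   *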
+   * @ingroup PETScWrappers
+   * @ingroup Matrix1
+   * @author Wolfgang Bangerth, 2004
+   */
+  class MatrixBase : public Subscriptor
+  {
+  public:
+    /**
+     * Declare a typedef for the iterator
+     * class.
+     */
+    typedef MatrixIterators::const_iterator const_iterator;
+
+    /**
+     * Declare a typedef in analogy to all
+     * the other container classes.
+     */
+    typedef PetscScalar value_type;
+
+    /**
+     * Default constructor.
+     */
+    MatrixBase ();
+
+    /**
+     * Destructor. Made virtual so that one
+     * can use pointers to this class.
+     */
+    virtual ~MatrixBase ();
+
+    /**
+     * This operator assigns a scalar to a
+     * matrix. Since this usually does not
+     * make much sense (should we set all
+     * matrix entries to this value? Only
+     * the nonzero entries of the sparsity
+     * pattern?), this operation is only
+     * allowed if the actual value to be
+     * assigned is zero. This operator only
+     * exists to allow for the obvious
+     * notation <tt>matrix=0</tt>, which
+     * sets all elements of the matrix to
+     * zero, but keeps the sparsity pattern
+     * previously used.
+     */
+    MatrixBase &
+    operator = (const value_type d);
+    /**
+     * Release all memory and return
+     * to a state just like after
+     * having called the default
+     * constructor.
+     */
+    void clear ();
+
+    /**
+     * Set the element (<i>i,j</i>) to @p
+     * value.
+     *
+     * If the present object (from a
+     * derived class of this one) happens
+     * to be a sparse matrix, then this
+     * function adds a new entry to the
+     * matrix if it didn't exist before,
+     * very much in contrast to the
+     * SparseMatrix class which throws an
+     * error if the entry does not exist.
+     * If <tt>value</tt> is not a finite
+     * number an exception is thrown.
+     */
+    void set (const unsigned int i,
+              const unsigned int j,
+              const PetscScalar value);
+
+    /**
+     * Set all elements given in a
+     * FullMatrix<double> into the sparse
+     * matrix locations given by
+     * <tt>indices</tt>. In other words,
+     * this function writes the elements
+     * in <tt>full_matrix</tt> into the
+     * calling matrix, using the
+     * local-to-global indexing specified
+     * by <tt>indices</tt> for both the
+     * rows and the columns of the
+     * matrix. This function assumes a
+     * quadratic sparse matrix and a
+     * quadratic full_matrix, the usual
+     * situation in FE calculations.
+     *
+     * If the present object (from a
+     * derived class of this one) happens
+     * to be a sparse matrix, then this
+     * function adds some new entries to
+     * the matrix if they didn't exist
+     * before, very much in contrast to
+     * the SparseMatrix class which
+     * throws an error if the entry does
+     * not exist.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be inserted anyway
+     * or they should be filtered
+     * away. The default value is
+     * <tt>false</tt>, i.e., even zero
+     * values are inserted/replaced.
+     */
+    void set (const std::vector<unsigned int> &indices,
+              const FullMatrix<PetscScalar>   &full_matrix,
+              const bool                       elide_zero_values = false);
+
+    /**
+     * Same function as before, but now
+     * including the possibility to use
+     * rectangular full_matrices and
+     * different local-to-global indexing
+     * on rows and columns, respectively.
+     */
+    void set (const std::vector<unsigned int> &row_indices,
+              const std::vector<unsigned int> &col_indices,
+              const FullMatrix<PetscScalar>   &full_matrix,
+              const bool                       elide_zero_values = false);
+
+    /**
+     * Set several elements in the
+     * specified row of the matrix with
+     * column indices as given by
+     * <tt>col_indices</tt> to the
+     * respective value.
+     *
+     * If the present object (from a
+     * derived class of this one) happens
+     * to be a sparse matrix, then this
+     * function adds some new entries to
+     * the matrix if they didn't exist
+     * before, very much in contrast to
+     * the SparseMatrix class which
+     * throws an error if the entry does
+     * not exist.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be inserted anyway
+     * or they should be filtered
+     * away. The default value is
+     * <tt>false</tt>, i.e., even zero
+     * values are inserted/replaced.
+     */
+    void set (const unsigned int               row,
+              const std::vector<unsigned int> &col_indices,
+              const std::vector<PetscScalar> &values,
+              const bool                       elide_zero_values = false);
+
+    /**
+     * Set several elements to values
+     * given by <tt>values</tt> in a
+     * given row in columns given by
+     * col_indices into the sparse
+     * matrix.
+     *
+     * If the present object (from a
+     * derived class of this one) happens
+     * to be a sparse matrix, then this
+     * function adds some new entries to
+     * the matrix if they didn't exist
+     * before, very much in contrast to
+     * the SparseMatrix class which
+     * throws an error if the entry does
+     * not exist.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be inserted anyway
+     * or they should be filtered
+     * away. The default value is
+     * <tt>false</tt>, i.e., even zero
+     * values are inserted/replaced.
+     */
+    void set (const unsigned int  row,
+              const unsigned int  n_cols,
+              const unsigned int *col_indices,
+              const PetscScalar *values,
+              const bool          elide_zero_values = false);
+
+    /**
+     * Add @p value to the element
+     * (<i>i,j</i>).
+     *
+     * If the present object (from a
+     * derived class of this one) happens
+     * to be a sparse matrix, then this
+     * function adds a new entry to the
+     * matrix if it didn't exist before,
+     * very much in contrast to the
+     * SparseMatrix class which throws an
+     * error if the entry does not exist.
+     * If <tt>value</tt> is not a finite
+     * number an exception is thrown.
+     */
+    void add (const unsigned int i,
+              const unsigned int j,
+              const PetscScalar value);
+
+    /**
+     * Add all elements given in a
+     * FullMatrix<double> into sparse
+     * matrix locations given by
+     * <tt>indices</tt>. In other words,
+     * this function adds the elements in
+     * <tt>full_matrix</tt> to the
+     * respective entries in calling
+     * matrix, using the local-to-global
+     * indexing specified by
+     * <tt>indices</tt> for both the rows
+     * and the columns of the
+     * matrix. This function assumes a
+     * quadratic sparse matrix and a
+     * quadratic full_matrix, the usual
+     * situation in FE calculations.
+     *
+     * If the present object (from a
+     * derived class of this one) happens
+     * to be a sparse matrix, then this
+     * function adds some new entries to
+     * the matrix if they didn't exist
+     * before, very much in contrast to
+     * the SparseMatrix class which
+     * throws an error if the entry does
+     * not exist.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be added anyway or
+     * these should be filtered away and
+     * only non-zero data is added. The
+     * default value is <tt>true</tt>,
+     * i.e., zero values won't be added
+     * into the matrix.
+     */
+    void add (const std::vector<unsigned int> &indices,
+              const FullMatrix<PetscScalar>   &full_matrix,
+              const bool                       elide_zero_values = true);
+
+    /**
+     * Same function as before, but now
+     * including the possibility to use
+     * rectangular full_matrices and
+     * different local-to-global indexing
+     * on rows and columns, respectively.
+     */
+    void add (const std::vector<unsigned int> &row_indices,
+              const std::vector<unsigned int> &col_indices,
+              const FullMatrix<PetscScalar>   &full_matrix,
+              const bool                       elide_zero_values = true);
+
+    /**
+     * Set several elements in the
+     * specified row of the matrix with
+     * column indices as given by
+     * <tt>col_indices</tt> to the
+     * respective value.
+     *
+     * If the present object (from a
+     * derived class of this one) happens
+     * to be a sparse matrix, then this
+     * function adds some new entries to
+     * the matrix if they didn't exist
+     * before, very much in contrast to
+     * the SparseMatrix class which
+     * throws an error if the entry does
+     * not exist.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be added anyway or
+     * these should be filtered away and
+     * only non-zero data is added. The
+     * default value is <tt>true</tt>,
+     * i.e., zero values won't be added
+     * into the matrix.
+     */
+    void add (const unsigned int               row,
+              const std::vector<unsigned int> &col_indices,
+              const std::vector<PetscScalar> &values,
+              const bool                       elide_zero_values = true);
+
+    /**
+     * Add an array of values given by
+     * <tt>values</tt> in the given
+     * global matrix row at columns
+     * specified by col_indices in the
+     * sparse matrix.
+     *
+     * If the present object (from a
+     * derived class of this one) happens
+     * to be a sparse matrix, then this
+     * function adds some new entries to
+     * the matrix if they didn't exist
+     * before, very much in contrast to
+     * the SparseMatrix class which
+     * throws an error if the entry does
+     * not exist.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be added anyway or
+     * these should be filtered away and
+     * only non-zero data is added. The
+     * default value is <tt>true</tt>,
+     * i.e., zero values won't be added
+     * into the matrix.
+     */
+    void add (const unsigned int  row,
+              const unsigned int  n_cols,
+              const unsigned int *col_indices,
+              const PetscScalar *values,
+              const bool          elide_zero_values = true,
+              const bool          col_indices_are_sorted = false);
+
+    /**
+     * Remove all elements from
+     * this <tt>row</tt> by setting
+     * them to zero. The function
+     * does not modify the number
+     * of allocated nonzero
+     * entries, it only sets some
+     * entries to zero. It may drop
+     * them from the sparsity
+     * pattern, though (but retains
+     * the allocated memory in case
+     * new entries are again added
+     * later).
+     *
+     * This operation is used in
+     * eliminating constraints (e.g. due to
+     * hanging nodes) and makes sure that
+     * we can write this modification to
+     * the matrix without having to read
+     * entries (such as the locations of
+     * non-zero elements) from it --
+     * without this operation, removing
+     * constraints on parallel matrices is
+     * a rather complicated procedure.
+     *
+     * The second parameter can be used to
+     * set the diagonal entry of this row
+     * to a value different from zero. The
+     * default is to set it to zero.
+     */
+    void clear_row (const unsigned int row,
+                    const PetscScalar  new_diag_value = 0);
+
+    /**
+     * Same as clear_row(), except that it
+     * works on a number of rows at once.
+     *
+     * The second parameter can be used to
+     * set the diagonal entries of all
+     * cleared rows to something different
+     * from zero. Note that all of these
+     * diagonal entries get the same value
+     * -- if you want different values for
+     * the diagonal entries, you have to
+     * set them by hand.
+     */
+    void clear_rows (const std::vector<unsigned int> &rows,
+                     const PetscScalar                new_diag_value = 0);
+
+    /**
+     * PETSc matrices store their own
+     * sparsity patterns. So, in analogy to
+     * our own SparsityPattern class,
+     * this function compresses the
+     * sparsity pattern and allows the
+     * resulting matrix to be used in all
+     * other operations where before only
+     * assembly functions were
+     * allowed. This function must
+     * therefore be called once you have
+     * assembled the matrix.
+     *
+     * See @ref GlossCompress "Compressing distributed objects"
+     * for more information.
+     */
+    void compress (::dealii::VectorOperation::values operation
+                   =::dealii::VectorOperation::unknown);
+    /**
+     * Return the value of the entry
+     * (<i>i,j</i>).  This may be an
+     * expensive operation and you should
+     * always take care where to call this
+     * function. In contrast to the
+     * respective function in the
+     * @p MatrixBase class, we don't
+     * throw an exception if the respective
+     * entry doesn't exist in the sparsity
+     * pattern of this class, since PETSc
+     * does not transmit this information.
+     *
+     * This function is therefore exactly
+     * equivalent to the <tt>el()</tt> function.
+     */
+    PetscScalar operator () (const unsigned int i,
+                             const unsigned int j) const;
+
+    /**
+     * Return the value of the matrix entry
+     * (<i>i,j</i>). If this entry does not
+     * exist in the sparsity pattern, then
+     * zero is returned. While this may be
+     * convenient in some cases, note that
+     * it is simple to write algorithms
+     * that are slow compared to an optimal
+     * solution, since the sparsity of the
+     * matrix is not used.
+     */
+    PetscScalar el (const unsigned int i,
+                    const unsigned int j) const;
+
+    /**
+     * Return the main diagonal
+     * element in the <i>i</i>th
+     * row. This function throws an
+     * error if the matrix is not
+     * quadratic.
+     *
+     * Since we do not have direct access
+     * to the underlying data structure,
+     * this function is no faster than the
+     * elementwise access using the el()
+     * function. However, we provide this
+     * function for compatibility with the
+     * SparseMatrix class.
+     */
+    PetscScalar diag_element (const unsigned int i) const;
+
+    /**
+     * Return the number of rows in this
+     * matrix.
+     */
+    unsigned int m () const;
+
+    /**
+     * Return the number of columns in this
+     * matrix.
+     */
+    unsigned int n () const;
+
+    /**
+     * Return the local dimension of the
+     * matrix, i.e. the number of rows
+     * stored on the present MPI
+     * process. For sequential matrices,
+     * this number is the same as m(),
+     * but for parallel matrices it may be
+     * smaller.
+     *
+     * To figure out which elements
+     * exactly are stored locally,
+     * use local_range().
+     */
+    unsigned int local_size () const;
+
+    /**
+     * Return a pair of indices
+     * indicating which rows of
+     * this matrix are stored
+     * locally. The first number is
+     * the index of the first
+     * row stored, the second
+     * the index of the one past
+     * the last one that is stored
+     * locally. If this is a
+     * sequential matrix, then the
+     * result will be the pair
+     * (0,m()), otherwise it will be
+     * a pair (i,i+n), where
+     * <tt>n=local_size()</tt>.
+     */
+    std::pair<unsigned int, unsigned int>
+    local_range () const;
+
+    /**
+     * Return whether @p index is
+     * in the local range or not,
+     * see also local_range().
+     */
+    bool in_local_range (const unsigned int index) const;
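+
+    // Sketch of a loop over the locally stored rows (illustration only;
+    // `matrix' is a hypothetical object of a derived class):
+    //
+    //   const std::pair<unsigned int, unsigned int>
+    //     range = matrix.local_range ();
+    //   for (unsigned int row = range.first; row < range.second; ++row)
+    //     Assert (matrix.in_local_range (row), ExcInternalError());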
+
+    /**
+     * Return a reference to the MPI
+     * communicator object in use with this
+     * matrix. This function has to be
+     * implemented in derived classes.
+     */
+    virtual const MPI_Comm &get_mpi_communicator () const = 0;
+
+    /**
+     * Return the number of nonzero
+     * elements of this
+     * matrix. Actually, it returns
+     * the number of entries in the
+     * sparsity pattern; if any of
+     * the entries should happen to
+     * be zero, it is counted anyway.
+     */
+    unsigned int n_nonzero_elements () const;
+
+    /**
+     * Number of entries in a specific row.
+     */
+    unsigned int row_length (const unsigned int row) const;
+
+    /**
+     * Return the l1-norm of the matrix, that is
+     * $|M|_1 = \max_{\text{columns } j} \sum_{\text{rows } i} |M_{ij}|$
+     * (maximum of the column sums).
+     * This is the natural matrix norm that is
+     * compatible with the l1-norm for vectors, i.e.
+     * $|Mv|_1 \leq |M|_1 |v|_1$.
+     * (cf. Haemmerlin-Hoffmann:
+     * Numerische Mathematik)
+     */
+    PetscReal l1_norm () const;
+
+    /**
+     * Return the linfty-norm of the matrix, that is
+     * $|M|_\infty = \max_{\text{rows } i} \sum_{\text{columns } j} |M_{ij}|$
+     * (maximum of the row sums).
+     * This is the natural matrix norm that is
+     * compatible with the linfty-norm of vectors, i.e.
+     * $|Mv|_\infty \leq |M|_\infty |v|_\infty$.
+     * (cf. Haemmerlin-Hoffmann:
+     * Numerische Mathematik)
+     */
+    PetscReal linfty_norm () const;
+
+    /**
+     * Return the Frobenius norm of the
+     * matrix, i.e. the square root of the
+     * sum of squares of all entries in the
+     * matrix.
+     */
+    PetscReal frobenius_norm () const;
+
+
+    /**
+     * Return the square of the norm
+     * of the vector $v$ with respect
+     * to the norm induced by this
+     * matrix,
+     * i.e. $\left(v,Mv\right)$. This
+     * is useful, e.g. in the finite
+     * element context, where the
+     * $L_2$ norm of a function
+     * equals the matrix norm with
+     * respect to the mass matrix of
+     * the vector representing the
+     * nodal values of the finite
+     * element function.
+     *
+     * Obviously, the matrix needs to
+     * be quadratic for this operation.
+     *
+     * The implementation of this function
+     * is not as efficient as the one in
+     * the @p MatrixBase class used in
+     * deal.II (i.e. the original one, not
+     * the PETSc wrapper class) since PETSc
+     * doesn't support this operation and
+     * needs a temporary vector.
+     *
+     * Note that if the current object
+     * represents a parallel distributed
+     * matrix (of type
+     * PETScWrappers::MPI::SparseMatrix),
+     * then the given vector has to be
+     * a distributed vector as
+     * well. Conversely, if the matrix is
+     * not distributed, then neither
+     * may the vector be.
+     */
+    PetscScalar matrix_norm_square (const VectorBase &v) const;
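+
+    // Illustration only: for a hypothetical mass matrix `M' and a
+    // vector `v' of nodal values with matching parallel layout, this
+    // computes the inner product (v, Mv):
+    //
+    //   const PetscScalar norm_square = M.matrix_norm_square (v);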
+
+
+    /**
+     * Compute the matrix scalar
+     * product $\left(u,Mv\right)$.
+     *
+     * The implementation of this function
+     * is not as efficient as the one in
+     * the @p MatrixBase class used in
+     * deal.II (i.e. the original one, not
+     * the PETSc wrapper class) since PETSc
+     * doesn't support this operation and
+     * needs a temporary vector.
+     *
+     * Note that if the current object
+     * represents a parallel distributed
+     * matrix (of type
+     * PETScWrappers::MPI::SparseMatrix),
+     * then both vectors have to be
+     * distributed vectors as
+     * well. Conversely, if the matrix is
+     * not distributed, then neither of the
+     * vectors may be.
+     */
+    PetscScalar matrix_scalar_product (const VectorBase &u,
+                                       const VectorBase &v) const;
+
+
+#if DEAL_II_PETSC_VERSION_GTE(3,1,0)
+    /**
+     * Return the trace of the
+     * matrix, i.e. the sum of all
+     * diagonal entries in the
+     * matrix.
+     */
+    PetscReal trace () const;
+#endif
+
+    /**
+     * Multiply the entire matrix by a
+     * fixed factor.
+     */
+    MatrixBase &operator *= (const PetscScalar factor);
+
+    /**
+     * Divide the entire matrix by a
+     * fixed factor.
+     */
+    MatrixBase &operator /= (const PetscScalar factor);
+
+    /**
+     * Matrix-vector multiplication:
+     * let <i>dst = M*src</i> with
+     * <i>M</i> being this matrix.
+     *
+     * Source and destination must
+     * not be the same vector.
+     *
+     * Note that if the current object
+     * represents a parallel distributed
+     * matrix (of type
+     * PETScWrappers::MPI::SparseMatrix),
+     * then both vectors have to be
+     * distributed vectors as
+     * well. Conversely, if the matrix is
+     * not distributed, then neither of the
+     * vectors may be.
+     */
+    void vmult (VectorBase       &dst,
+                const VectorBase &src) const;
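+
+    // Usage sketch (illustration only; `dst' and `src' are distinct,
+    // hypothetical vectors partitioned like the matrix):
+    //
+    //   matrix.vmult (dst, src);    // dst = M * src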
+
+    /**
+     * Matrix-vector multiplication: let
+     * <i>dst = M<sup>T</sup>*src</i> with
+     * <i>M</i> being this matrix. This
+     * function does the same as vmult()
+     * but takes the transposed matrix.
+     *
+     * Source and destination must
+     * not be the same vector.
+     *
+     * Note that if the current object
+     * represents a parallel distributed
+     * matrix (of type
+     * PETScWrappers::MPI::SparseMatrix),
+     * then both vectors have to be
+     * distributed vectors as
+     * well. Conversely, if the matrix is
+     * not distributed, then neither of the
+     * vectors may be.
+     */
+    void Tvmult (VectorBase       &dst,
+                 const VectorBase &src) const;
+
+    /**
+     * Adding Matrix-vector
+     * multiplication. Add
+     * <i>M*src</i> to <i>dst</i>
+     * with <i>M</i> being this
+     * matrix.
+     *
+     * Source and destination must
+     * not be the same vector.
+     *
+     * Note that if the current object
+     * represents a parallel distributed
+     * matrix (of type
+     * PETScWrappers::MPI::SparseMatrix),
+     * then both vectors have to be
+     * distributed vectors as
+     * well. Conversely, if the matrix is
+     * not distributed, then neither of the
+     * vectors may be.
+     */
+    void vmult_add (VectorBase       &dst,
+                    const VectorBase &src) const;
+
+    /**
+     * Adding Matrix-vector
+     * multiplication. Add
+     * <i>M<sup>T</sup>*src</i> to
+     * <i>dst</i> with <i>M</i> being
+     * this matrix. This function
+     * does the same as vmult_add()
+     * but takes the transposed
+     * matrix.
+     *
+     * Source and destination must
+     * not be the same vector.
+     *
+     * Note that if the current object
+     * represents a parallel distributed
+     * matrix (of type
+     * PETScWrappers::MPI::SparseMatrix),
+     * then both vectors have to be
+     * distributed vectors as
+     * well. Conversely, if the matrix is
+     * not distributed, then neither of the
+     * vectors may be.
+     */
+    void Tvmult_add (VectorBase       &dst,
+                     const VectorBase &src) const;
+
+
+    /**
+     * Compute the residual of an
+     * equation <i>Mx=b</i>, where
+     * the residual is defined to be
+     * <i>r=b-Mx</i>. Write the
+     * residual into
+     * @p dst. The
+     * <i>l<sub>2</sub></i> norm of
+     * the residual vector is
+     * returned.
+     *
+     * Source <i>x</i> and destination
+     * <i>dst</i> must not be the same
+     * vector.
+     *
+     * Note that if the current object
+     * represents a parallel distributed
+     * matrix (of type
+     * PETScWrappers::MPI::SparseMatrix),
+     * then all vectors have to be
+     * distributed vectors as
+     * well. Conversely, if the matrix is
+     * not distributed, then neither of the
+     * vectors may be.
+     */
+    PetscScalar residual (VectorBase       &dst,
+                          const VectorBase &x,
+                          const VectorBase &b) const;
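+
+    // Sketch (illustration only; `r', `x' and `b' are hypothetical
+    // vectors distributed like the matrix):
+    //
+    //   const PetscScalar r_norm = matrix.residual (r, x, b);
+    //   // r now holds b - M*x, and r_norm its l2 norm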
+
+    /**
+     * STL-like iterator pointing to the
+     * first entry of the matrix.
+     */
+    const_iterator begin () const;
+
+    /**
+     * Final iterator.
+     */
+    const_iterator end () const;
+
+    /**
+     * STL-like iterator pointing to the
+     * first entry of row @p r.
+     *
+     * Note that if the given row is empty,
+     * i.e. does not contain any nonzero
+     * entries, then the iterator returned by
+     * this function equals
+     * <tt>end(r)</tt>. Note also that the
+     * iterator may not be dereferencable in
+     * that case.
+     */
+    const_iterator begin (const unsigned int r) const;
+
+    /**
+     * Final iterator of row <tt>r</tt>. It
+     * points to the first element past the
+     * end of line @p r, or past the end of
+     * the entire sparsity pattern.
+     *
+     * Note that the end iterator is not
+     * necessarily dereferencable. This is in
+     * particular the case if it is the end
+     * iterator for the last row of a matrix.
+     */
+    const_iterator end (const unsigned int r) const;
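+
+    // Row traversal sketch (illustration only; `matrix' and `row' are
+    // hypothetical):
+    //
+    //   for (const_iterator p = matrix.begin (row);
+    //        p != matrix.end (row); ++p)
+    //     std::cout << p->column () << ": " << p->value () << std::endl;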
+
+    /**
+     * Conversion operator to gain access
+     * to the underlying PETSc type. If you
+     * do this, you cut this class off from
+     * information it may need, so this
+     * conversion operator should only be
+     * used if you know what you are doing. In
+     * particular, it should only be used
+     * for read-only operations on the
+     * matrix.
+     */
+    operator Mat () const;
+
+    /**
+     * Make an in-place transpose of a
+     * matrix.
+     */
+    void transpose ();
+
+    /**
+     * Test whether a matrix is
+     * symmetric. The default
+     * tolerance is $1000$ times the
+     * 32-bit machine precision.
+     */
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    PetscTruth
+#else
+    PetscBool
+#endif
+    is_symmetric (const double tolerance = 1.e-12);
+
+    /**
+     * Test whether a matrix is
+     * Hermitian, i.e. it is the
+     * complex conjugate of its
+     * transpose. The default
+     * tolerance is $1000$ times the
+     * 32-bit machine precision.
+     */
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    PetscTruth
+#else
+    PetscBool
+#endif
+    is_hermitian (const double tolerance = 1.e-12);
+
+    /**
+     * Print the matrix in PETSc's ASCII
+     * viewer format. Currently this
+     * function simply writes the non-zero
+     * elements of the matrix to the
+     * terminal.
+     */
+    void write_ascii () const;          // shuqiangwang
+
+    /**
+     *  Returns the number of bytes consumed
+     *  by this matrix on this CPU.
+     */
+    std::size_t memory_consumption() const;
+
+    /**
+     * Exception
+     */
+    DeclException1 (ExcPETScError,
+                    int,
+                    << "An error with error number " << arg1
+                    << " occurred while calling a PETSc function");
+    /**
+     * Exception
+     */
+    DeclException0 (ExcSourceEqualsDestination);
+
+    /**
+     * Exception.
+     */
+    DeclException2 (ExcWrongMode,
+                    int, int,
+                    << "You tried to do a "
+                    << (arg1 == 1 ?
+                        "'set'" :
+                        (arg1 == 2 ?
+                         "'add'" : "???"))
+                    << " operation but the matrix is currently in "
+                    << (arg2 == 1 ?
+                        "'set'" :
+                        (arg2 == 2 ?
+                         "'add'" : "???"))
+                    << " mode. You first have to call 'compress()'.");
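+
+    // The mode rules in practice (illustration only; `matrix' is a
+    // hypothetical object of a derived class):
+    //
+    //   matrix.set (i, j, 1.0);                     // 'set' mode
+    //   matrix.compress (VectorOperation::insert);  // flush buffers
+    //   matrix.add (i, j, 1.0);                     // 'add' mode now legal
+    //
+    // Calling add() right after set() without the intermediate
+    // compress() triggers the ExcWrongMode exception above.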
+
+  protected:
+    /**
+     * A generic matrix object in
+     * PETSc. The actual type, a sparse
+     * matrix, is set in the constructor.
+     */
+    Mat matrix;
+
+    /**
+     * PETSc doesn't allow mixing additions
+     * to matrix entries with overwriting
+     * them (to make synchronisation of
+     * parallel computations
+     * simpler). Since the interface of the
+     * existing classes doesn't support the
+     * notion of not interleaving things,
+     * we have to emulate this
+     * ourselves. The way we do it is to
+     * store, for each access operation,
+     * whether it is an insertion or an
+     * addition. If the previous one was of
+     * a different type, then we first have
+     * to flush the PETSc buffers;
+     * otherwise, we can simply go on.
+     *
+     * The following structure and variable
+     * declare and store the previous
+     * state.
+     */
+    struct LastAction
+    {
+      enum Values { none, insert, add };
+    };
+
+    /**
+     * Store whether the last action was a
+     * write or add operation.
+     */
+    LastAction::Values last_action;
+
+    /**
+     * Ensure that the add/set mode that
+     * is required for actions following
+     * this call is compatible with the
+     * current mode.
+     * Should be called from all internal
+     * functions accessing matrix elements.
+     */
+    void prepare_action(const LastAction::Values new_action);
+
+    /**
+     * For some matrix storage
+     * formats, in particular for the
+     * PETSc distributed block matrices,
+     * set and add operations on
+     * individual elements cannot be
+     * freely mixed. Rather, one has
+     * to synchronize operations when
+     * one wants to switch from
+     * setting elements to adding to
+     * elements.
+     * BlockMatrixBase automatically
+     * synchronizes the access by
+     * calling this helper function
+     * for each block.
+     * This function ensures that the
+     * matrix is in a state that
+     * allows adding elements; if it
+     * previously already was in this
+     * state, the function does
+     * nothing.
+     */
+    void prepare_add();
+
+    /**
+     * Same as prepare_add() but
+     * prepare the matrix for setting
+     * elements if the representation
+     * of elements in this class
+     * requires such an operation.
+     */
+    void prepare_set();
+
+
+
+  private:
+    /**
+     * An internal array of integer
+     * values that is used to store the
+     * column indices when
+     * adding/inserting local data into
+     * the (large) sparse matrix.
+     */
+#ifdef PETSC_USE_64BIT_INDICES
+    std::vector<PetscInt> column_indices;
+#else
+    std::vector<int> column_indices;
+#endif
+
+    /**
+     * An internal array of scalar values
+     * that is used to store the matrix
+     * values when adding/inserting local
+     * data into the (large) sparse
+     * matrix.
+     */
+    std::vector<PetscScalar> column_values;
+
+
+    /**
+     *  To allow calling protected
+     *  prepare_add() and
+     *  prepare_set().
+     */
+    template <class> friend class dealii::BlockMatrixBase;
+
+
+  public:   // added by shuqiangwang
+    void copy_from (const MatrixBase &source);
+    void add (double factor, const MatrixBase &source);
+  };
+
+
+
+#ifndef DOXYGEN
+// -------------------------- inline and template functions ----------------------
+
+
+  namespace MatrixIterators
+  {
+
+    inline
+    const_iterator::Accessor::
+    Accessor (const MatrixBase   *matrix,
+              const unsigned int  row,
+              const unsigned int  index)
+      :
+      matrix(const_cast<MatrixBase *>(matrix)),
+      a_row(row),
+      a_index(index)
+    {
+      visit_present_row ();
+    }
+
+
+    inline
+    unsigned int
+    const_iterator::Accessor::row() const
+    {
+      Assert (a_row < matrix->m(), ExcBeyondEndOfMatrix());
+      return a_row;
+    }
+
+
+    inline
+    unsigned int
+    const_iterator::Accessor::column() const
+    {
+      Assert (a_row < matrix->m(), ExcBeyondEndOfMatrix());
+      return (*colnum_cache)[a_index];
+    }
+
+
+    inline
+    unsigned int
+    const_iterator::Accessor::index() const
+    {
+      Assert (a_row < matrix->m(), ExcBeyondEndOfMatrix());
+      return a_index;
+    }
+
+
+    inline
+    PetscScalar
+    const_iterator::Accessor::value() const
+    {
+      Assert (a_row < matrix->m(), ExcBeyondEndOfMatrix());
+      return (*value_cache)[a_index];
+    }
+
+
+    inline
+    const_iterator::
+    const_iterator(const MatrixBase   *matrix,
+                   const unsigned int  row,
+                   const unsigned int  index)
+      :
+      accessor(matrix, row, index)
+    {}
+
+
+
+    inline
+    const_iterator &
+    const_iterator::operator++ ()
+    {
+      Assert (accessor.a_row < accessor.matrix->m(), ExcIteratorPastEnd());
+
+      ++accessor.a_index;
+
+      // if at end of line: do one step, then
+      // cycle until we find a row with a
+      // nonzero number of entries
+      if (accessor.a_index >= accessor.colnum_cache->size())
+        {
+          accessor.a_index = 0;
+          ++accessor.a_row;
+
+          while ((accessor.a_row < accessor.matrix->m())
+                 &&
+                 (accessor.matrix->row_length(accessor.a_row) == 0))
+            ++accessor.a_row;
+
+          accessor.visit_present_row();
+        }
+      return *this;
+    }
+
+
+    inline
+    const_iterator
+    const_iterator::operator++ (int)
+    {
+      const const_iterator old_state = *this;
+      ++(*this);
+      return old_state;
+    }
+
+
+    inline
+    const const_iterator::Accessor &
+    const_iterator::operator* () const
+    {
+      return accessor;
+    }
+
+
+    inline
+    const const_iterator::Accessor *
+    const_iterator::operator-> () const
+    {
+      return &accessor;
+    }
+
+
+    inline
+    bool
+    const_iterator::
+    operator == (const const_iterator &other) const
+    {
+      return (accessor.a_row == other.accessor.a_row &&
+              accessor.a_index == other.accessor.a_index);
+    }
+
+
+    inline
+    bool
+    const_iterator::
+    operator != (const const_iterator &other) const
+    {
+      return ! (*this == other);
+    }
+
+
+    inline
+    bool
+    const_iterator::
+    operator < (const const_iterator &other) const
+    {
+      return (accessor.row() < other.accessor.row() ||
+              (accessor.row() == other.accessor.row() &&
+               accessor.index() < other.accessor.index()));
+    }
+
+  }
+
+
+
+  // Inline the set() and add()
+  // functions, since they will be
+  // called frequently, and the
+  // compiler can optimize away
+  // some unnecessary loops when
+  // the sizes are given at
+  // compile time.
+  inline
+  void
+  MatrixBase::set (const unsigned int i,
+                   const unsigned int j,
+                   const PetscScalar  value)
+  {
+    Assert (numbers::is_finite(value), ExcNumberNotFinite());
+
+    set (i, 1, &j, &value, false);
+  }
+
+
+
+  inline
+  void
+  MatrixBase::set (const std::vector<unsigned int> &indices,
+                   const FullMatrix<PetscScalar>   &values,
+                   const bool                       elide_zero_values)
+  {
+    Assert (indices.size() == values.m(),
+            ExcDimensionMismatch(indices.size(), values.m()));
+    Assert (values.m() == values.n(), ExcNotQuadratic());
+
+    for (unsigned int i=0; i<indices.size(); ++i)
+      set (indices[i], indices.size(), &indices[0], &values(i,0),
+           elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  MatrixBase::set (const std::vector<unsigned int> &row_indices,
+                   const std::vector<unsigned int> &col_indices,
+                   const FullMatrix<PetscScalar>   &values,
+                   const bool                       elide_zero_values)
+  {
+    Assert (row_indices.size() == values.m(),
+            ExcDimensionMismatch(row_indices.size(), values.m()));
+    Assert (col_indices.size() == values.n(),
+            ExcDimensionMismatch(col_indices.size(), values.n()));
+
+    for (unsigned int i=0; i<row_indices.size(); ++i)
+      set (row_indices[i], col_indices.size(), &col_indices[0], &values(i,0),
+           elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  MatrixBase::set (const unsigned int               row,
+                   const std::vector<unsigned int> &col_indices,
+                   const std::vector<PetscScalar> &values,
+                   const bool                       elide_zero_values)
+  {
+    Assert (col_indices.size() == values.size(),
+            ExcDimensionMismatch(col_indices.size(), values.size()));
+
+    set (row, col_indices.size(), &col_indices[0], &values[0],
+         elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  MatrixBase::set (const unsigned int  row,
+                   const unsigned int  n_cols,
+                   const unsigned int *col_indices,
+                   const PetscScalar *values,
+                   const bool          elide_zero_values)
+  {
+    prepare_action(LastAction::insert);
+
+#ifdef PETSC_USE_64BIT_INDICES
+    const PetscInt petsc_i = row;
+    PetscInt *col_index_ptr;
+#else
+    const int petsc_i = row;
+    int *col_index_ptr;
+#endif
+    PetscScalar const *col_value_ptr;
+    int n_columns;
+
+    // If we don't elide zeros, the pointers
+    // are already available...
+#ifndef PETSC_USE_64BIT_INDICES
+    if (elide_zero_values == false)
+      {
+        col_index_ptr = (int *)col_indices;
+        col_value_ptr = values;
+        n_columns = n_cols;
+      }
+    else
+#endif
+      {
+        // Otherwise, extract nonzero values in
+        // each row and get the respective index.
+        if (column_indices.size() < n_cols)
+          {
+            column_indices.resize(n_cols);
+            column_values.resize(n_cols);
+          }
+
+        n_columns = 0;
+        for (unsigned int j=0; j<n_cols; ++j)
+          {
+            const PetscScalar value = values[j];
+            Assert (numbers::is_finite(value), ExcNumberNotFinite());
+            if (value != PetscScalar())
+              {
+                column_indices[n_columns] = col_indices[j];
+                column_values[n_columns] = value;
+                n_columns++;
+              }
+          }
+        Assert(n_columns <= (int)n_cols, ExcInternalError());
+
+        col_index_ptr = &column_indices[0];
+        col_value_ptr = &column_values[0];
+      }
+
+    const int ierr
+      = MatSetValues (matrix, 1, &petsc_i, n_columns, col_index_ptr,
+                      col_value_ptr, INSERT_VALUES);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+  }
+
+
+
+  inline
+  void
+  MatrixBase::add (const unsigned int i,
+                   const unsigned int j,
+                   const PetscScalar  value)
+  {
+    Assert (numbers::is_finite(value), ExcNumberNotFinite());
+
+    if (value == PetscScalar())
+      {
+        // we have to run the Insert/Add mode
+        // checks in any case, in order to be
+        // consistent with the MPI
+        // communication model (see the comments
+        // in the documentation of
+        // TrilinosWrappers::Vector), but we can
+        // save some work if the addend is
+        // zero. These checks are performed
+        // anyway when we pass on to the other
+        // function.
+        prepare_action(LastAction::add);
+
+        return;
+      }
+    else
+      add (i, 1, &j, &value, false);
+  }
+
+
+
+  inline
+  void
+  MatrixBase::add (const std::vector<unsigned int> &indices,
+                   const FullMatrix<PetscScalar>   &values,
+                   const bool                       elide_zero_values)
+  {
+    Assert (indices.size() == values.m(),
+            ExcDimensionMismatch(indices.size(), values.m()));
+    Assert (values.m() == values.n(), ExcNotQuadratic());
+
+    for (unsigned int i=0; i<indices.size(); ++i)
+      add (indices[i], indices.size(), &indices[0], &values(i,0),
+           elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  MatrixBase::add (const std::vector<unsigned int> &row_indices,
+                   const std::vector<unsigned int> &col_indices,
+                   const FullMatrix<PetscScalar>   &values,
+                   const bool                       elide_zero_values)
+  {
+    Assert (row_indices.size() == values.m(),
+            ExcDimensionMismatch(row_indices.size(), values.m()));
+    Assert (col_indices.size() == values.n(),
+            ExcDimensionMismatch(col_indices.size(), values.n()));
+
+    for (unsigned int i=0; i<row_indices.size(); ++i)
+      add (row_indices[i], col_indices.size(), &col_indices[0], &values(i,0),
+           elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  MatrixBase::add (const unsigned int               row,
+                   const std::vector<unsigned int> &col_indices,
+                   const std::vector<PetscScalar> &values,
+                   const bool                       elide_zero_values)
+  {
+    Assert (col_indices.size() == values.size(),
+            ExcDimensionMismatch(col_indices.size(), values.size()));
+
+    add (row, col_indices.size(), &col_indices[0], &values[0],
+         elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  MatrixBase::add (const unsigned int  row,
+                   const unsigned int  n_cols,
+                   const unsigned int *col_indices,
+                   const PetscScalar *values,
+                   const bool          elide_zero_values,
+                   const bool          /*col_indices_are_sorted*/)
+  {
+    prepare_action(LastAction::add);
+
+#ifdef PETSC_USE_64BIT_INDICES
+    const PetscInt petsc_i = row;
+    PetscInt *col_index_ptr;
+#else
+    const int petsc_i = row;
+    int *col_index_ptr;
+#endif
+    PetscScalar const *col_value_ptr;
+    int n_columns;
+
+    // If we don't elide zeros, the pointers
+    // are already available...
+#ifndef PETSC_USE_64BIT_INDICES
+    if (elide_zero_values == false)
+      {
+        col_index_ptr = (int *)col_indices;
+        col_value_ptr = values;
+        n_columns = n_cols;
+      }
+    else
+#endif
+      {
+        // Otherwise, extract nonzero values in
+        // each row and get the respective index.
+        if (column_indices.size() < n_cols)
+          {
+            column_indices.resize(n_cols);
+            column_values.resize(n_cols);
+          }
+
+        n_columns = 0;
+        for (unsigned int j=0; j<n_cols; ++j)
+          {
+            const PetscScalar value = values[j];
+            Assert (numbers::is_finite(value), ExcNumberNotFinite());
+            if (value != PetscScalar())
+              {
+                column_indices[n_columns] = col_indices[j];
+                column_values[n_columns] = value;
+                n_columns++;
+              }
+          }
+        Assert(n_columns <= (int)n_cols, ExcInternalError());
+
+        col_index_ptr = &column_indices[0];
+        col_value_ptr = &column_values[0];
+      }
+
+    const int ierr
+      = MatSetValues (matrix, 1, &petsc_i, n_columns, col_index_ptr,
+                      col_value_ptr, ADD_VALUES);
+    Assert (ierr == 0, ExcPETScError(ierr));
+  }
+
+
+
+  inline
+  PetscScalar
+  MatrixBase::operator() (const unsigned int i,
+                          const unsigned int j) const
+  {
+    return el(i,j);
+  }
+
+
+
+  inline
+  MatrixBase::const_iterator
+  MatrixBase::begin() const
+  {
+    return const_iterator(this, 0, 0);
+  }
+
+
+  inline
+  MatrixBase::const_iterator
+  MatrixBase::end() const
+  {
+    return const_iterator(this, m(), 0);
+  }
+
+
+  inline
+  MatrixBase::const_iterator
+  MatrixBase::begin(const unsigned int r) const
+  {
+    Assert (r < m(), ExcIndexRange(r, 0, m()));
+    if (row_length(r) > 0)
+      return const_iterator(this, r, 0);
+    else
+      return end (r);
+  }
+
+
+  inline
+  MatrixBase::const_iterator
+  MatrixBase::end(const unsigned int r) const
+  {
+    Assert (r < m(), ExcIndexRange(r, 0, m()));
+
+    // place the iterator on the first entry
+    // past this line, or at the end of the
+    // matrix
+    for (unsigned int i=r+1; i<m(); ++i)
+      if (row_length(i) > 0)
+        return const_iterator(this, i, 0);
+
+    // if there is no such line, then take the
+    // end iterator of the matrix
+    return end();
+  }
+
+
+
+  inline
+  bool
+  MatrixBase::in_local_range (const unsigned int index) const
+  {
+#ifdef PETSC_USE_64BIT_INDICES
+    PetscInt begin, end;
+#else
+    int begin, end;
+#endif
+    const int ierr = MatGetOwnershipRange (static_cast<const Mat &>(matrix),
+                                           &begin, &end);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return ((index >= static_cast<unsigned int>(begin)) &&
+            (index < static_cast<unsigned int>(end)));
+  }
+
+
+
+  inline
+  void
+  MatrixBase::prepare_action(const LastAction::Values new_action)
+  {
+    if (last_action == new_action)
+      ;   // nothing to do, mode is unchanged
+    else if (last_action == LastAction::none)
+      last_action = new_action;
+    else
+      Assert (false, ExcWrongMode (last_action, new_action));
+  }
+
+
+
+  inline
+  void
+  MatrixBase::prepare_add()
+  {
+    prepare_action(LastAction::add);
+  }
+
+
+
+  inline
+  void
+  MatrixBase::prepare_set()
+  {
+    prepare_action(LastAction::insert);
+  }
+
+#endif // DOXYGEN
+}
+
+
+DEAL_II_NAMESPACE_CLOSE
+
+
+#endif // DEAL_II_USE_PETSC
+
+
+/*----------------------------   petsc_matrix_base.h     ---------------------------*/
+
+#endif
+/*----------------------------   petsc_matrix_base.h     ---------------------------*/

Added: branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_parallel_block_vector.h
===================================================================
--- branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_parallel_block_vector.h	                        (rev 0)
+++ branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_parallel_block_vector.h	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,528 @@
+//---------------------------------------------------------------------------
+//    $Id: petsc_parallel_block_vector.h 27628 2012-11-20 22:49:26Z heister $
+//
+//    Copyright (C) 2004, 2005, 2006, 2007, 2009, 2010, 2012 by the deal.II authors
+//
+//    This file is subject to QPL and may not be  distributed
+//    without copyright and license information. Please refer
+//    to the file deal.II/doc/license.html for the  text  and
+//    further information on this license.
+//
+//---------------------------------------------------------------------------
+#ifndef __deal2__petsc_parallel_block_vector_h
+#define __deal2__petsc_parallel_block_vector_h
+
+
+#include <deal.II/base/config.h>
+
+#ifdef DEAL_II_USE_PETSC
+
+#  include <deal.II/lac/petsc_parallel_vector.h>
+#  include <deal.II/lac/block_indices.h>
+#  include <deal.II/lac/block_vector_base.h>
+#  include <deal.II/lac/exceptions.h>
+
+DEAL_II_NAMESPACE_OPEN
+
+
+namespace PETScWrappers
+{
+  // forward declaration
+  class BlockVector;
+
+  namespace MPI
+  {
+
+    /*! @addtogroup PETScWrappers
+     *@{
+     */
+
+    /**
+     * An implementation of block vectors based on the parallel vector class
+     * implemented in PETScWrappers. While the base class provides for most of the
+     * interface, this class handles the actual allocation of vectors and provides
+     * functions that are specific to the underlying vector type.
+     *
+     * The model of distribution of data is such that each of the blocks is
+     * distributed across all MPI processes named in the MPI communicator. I.e. we
+     * don't just distribute the whole vector, but each component. In the
+     * constructors and reinit() functions, one therefore not only has to specify
+     * the sizes of the individual blocks, but also the number of elements of each
+     * of these blocks to be stored on the local process.
+     *
+     * @ingroup Vectors
+     * @see @ref GlossBlockLA "Block (linear algebra)"
+     * @author Wolfgang Bangerth, 2004
+     */
+    class BlockVector : public BlockVectorBase<Vector>
+    {
+    public:
+      /**
+       * Typedef the base class for simpler
+       * access to its own typedefs.
+       */
+      typedef BlockVectorBase<Vector> BaseClass;
+
+      /**
+       * Typedef the type of the underlying
+       * vector.
+       */
+      typedef BaseClass::BlockType  BlockType;
+
+      /**
+       * Import the typedefs from the base
+       * class.
+       */
+      typedef BaseClass::value_type      value_type;
+      typedef BaseClass::pointer         pointer;
+      typedef BaseClass::const_pointer   const_pointer;
+      typedef BaseClass::reference       reference;
+      typedef BaseClass::const_reference const_reference;
+      typedef BaseClass::size_type       size_type;
+      typedef BaseClass::iterator        iterator;
+      typedef BaseClass::const_iterator  const_iterator;
+
+      /**
+       * Default constructor. Generate an
+       * empty vector without any blocks.
+       */
+      BlockVector ();
+
+      /**
+       *  Constructor. Generate a block
+       *  vector with @p n_blocks blocks,
+       *  each of which is a parallel
+       *  vector across @p communicator
+       *  with @p block_size elements of
+       *  which @p local_size elements are
+       *  stored on the present process.
+       */
+      explicit BlockVector (const unsigned int  n_blocks,
+                            const MPI_Comm     &communicator,
+                            const unsigned int  block_size,
+                            const unsigned int  local_size);
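+
+      // Construction sketch (illustration only; the numbers are made up):
+      // two blocks, each a parallel vector of global size 100 of which
+      // the present process stores 50 elements.
+      //
+      //   PETScWrappers::MPI::BlockVector v (2, MPI_COMM_WORLD, 100, 50);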
+
+      /**
+       * Copy-Constructor. Set all the
+       * properties of the parallel vector
+       * to those of the given argument and
+       * copy the elements.
+       */
+      BlockVector (const BlockVector &V);
+
+      /**
+       * Constructor. Set the number of
+       * blocks to
+       * <tt>block_sizes.size()</tt> and
+       * initialize each block with
+       * <tt>block_sizes[i]</tt> zero
+       * elements. The individual blocks
+       * are distributed across the given
+       * communicator, and each store
+       * <tt>local_elements[i]</tt>
+       * elements on the present process.
+       */
+      BlockVector (const std::vector<unsigned int> &block_sizes,
+                   const MPI_Comm                  &communicator,
+                   const std::vector<unsigned int> &local_elements);
+
+      /**
+       * Destructor. Clears memory.
+       */
+      ~BlockVector ();
+
+      /**
+       * Copy operator: fill all components
+       * of the vector that are locally
+       * stored with the given scalar value.
+       */
+      BlockVector &operator = (const value_type s);
+
+      /**
+       * Copy operator for arguments of the
+       * same type.
+       */
+      BlockVector &
+      operator= (const BlockVector &V);
+
+      /**
+       * Copy the given sequential
+       * (non-distributed) block vector
+       * into the present parallel block
+       * vector. It is assumed that they
+       * have the same size, and this
+       * operation does not change the
+       * partitioning of the parallel
+       * vectors by which its elements are
+       * distributed across several MPI
+       * processes. What this operation
+       * therefore does is to copy that
+       * chunk of the given vector @p v
+       * that corresponds to elements of
+       * the target vector that are stored
+       * locally, and copies them, for each
+       * of the individual blocks of this
+       * object. Elements that are not
+       * stored locally are not touched.
+       *
+       * This being a parallel vector, you
+       * must make sure that @em all
+       * processes call this function at
+       * the same time. It is not possible
+       * to change the local part of a
+       * parallel vector on only one
+       * process, independent of what other
+       * processes do, with this function.
+       */
+      BlockVector &
+      operator = (const PETScWrappers::BlockVector &v);
+
+      /**
+       * Reinitialize the BlockVector to
+       * contain @p n_blocks of size @p
+       * block_size, each of which stores
+       * @p local_size elements
+       * argument denotes the MPI
+       * communicator over which each of
+       * these blocks shall communicate.
+       * communicate.
+       *
+       * If <tt>fast==false</tt>, the vector
+       * is filled with zeros.
+       */
+      void reinit (const unsigned int  n_blocks,
+                   const MPI_Comm     &communicator,
+                   const unsigned int  block_size,
+                   const unsigned int  local_size,
+                   const bool fast = false);
+
+      /**
+       * Reinitialize the BlockVector such
+       * that it contains
+       * <tt>block_sizes.size()</tt>
+       * blocks. Each block is
+       * reinitialized to dimension
+       * <tt>block_sizes[i]</tt>. Each of
+       * them stores
+       * <tt>local_sizes[i]</tt> elements
+       * on the present process.
+       *
+       * If the number of blocks is the
+       * same as before this function
+       * was called, all vectors remain
+       * the same and reinit() is
+       * called for each vector.
+       *
+       * If <tt>fast==false</tt>, the vector
+       * is filled with zeros.
+       *
+       * Note that you must call this
+       * function (or one of the other
+       * reinit() functions) rather
+       * than calling the reinit()
+       * functions of an individual
+       * block, to allow the block
+       * vector to update its caches of
+       * vector sizes. If you call
+       * reinit() of one of the
+       * blocks, then subsequent
+       * actions on this object may
+       * yield unpredictable results
+       * since they may be routed to
+       * the wrong block.
+       */
+      void reinit (const std::vector<unsigned int> &block_sizes,
+                   const MPI_Comm                  &communicator,
+                   const std::vector<unsigned int> &local_sizes,
+                   const bool                       fast=false);
+
+
+      /**
+       * Reinitialize the block structure
+       * to <tt>block_sizes.size()</tt>
+       * blocks, without touching the
+       * individual blocks themselves.
+       */
+      void reinit (const std::vector<unsigned int> &block_sizes,
+                   const MPI_Comm                  &communicator);   // added by shuqiangwang
+
+      /**
+       * Change the dimension to that
+       * of the vector <tt>V</tt>. The same
+       * applies as for the other
+       * reinit() function.
+       *
+       * The elements of <tt>V</tt> are not
+       * copied, i.e.  this function is
+       * the same as calling <tt>reinit
+       * (V.size(), fast)</tt>.
+       *
+       * Note that you must call this
+       * function (or one of the other
+       * reinit() functions) rather
+       * than calling the reinit()
+       * functions of an individual
+       * block, to allow the block
+       * vector to update its caches of
+       * vector sizes. If you call
+       * reinit() on one of the
+       * blocks, then subsequent
+       * actions on this object may
+       * yield unpredictable results
+       * since they may be routed to
+       * the wrong block.
+       */
+      void reinit (const BlockVector &V,
+                   const bool         fast=false);
+
+      /**
+       * Return a reference to the MPI
+       * communicator object in use with
+       * this vector.
+       */
+      const MPI_Comm &get_mpi_communicator () const;
+
+      /**
+       * Swap the contents of this
+       * vector and the other vector
+       * <tt>v</tt>. One could do this
+       * operation with a temporary
+       * variable and copying over the
+       * data elements, but this
+       * function is significantly more
+       * efficient since it only swaps
+       * the pointers to the data of
+       * the two vectors and therefore
+       * does not need to allocate
+       * temporary storage and move
+       * data around.
+       *
+       * Limitation: right now this
+       * function only works if both
+       * vectors have the same number
+       * of blocks. If needed, the
+       * numbers of blocks should be
+       * exchanged, too.
+       *
+       * This function is analogous to
+       * the swap() function of all C++
+       * standard containers. Also,
+       * there is a global function
+       * swap(u,v) that simply calls
+       * <tt>u.swap(v)</tt>, again in analogy
+       * to standard functions.
+       */
+      void swap (BlockVector &v);
+
+      /**
+       * Print to a stream.
+       */
+      void print (std::ostream       &out,
+                  const unsigned int  precision = 3,
+                  const bool          scientific = true,
+                  const bool          across = true) const;
+
+      /**
+       * Exception
+       */
+      DeclException0 (ExcIteratorRangeDoesNotMatchVectorSize);
+      /**
+       * Exception
+       */
+      DeclException0 (ExcNonMatchingBlockVectors);
+    };
+
+    /*@}*/
+
+    /*----------------------- Inline functions ----------------------------------*/
+
+
+    inline
+    BlockVector::BlockVector ()
+    {}
+
+
+
+    inline
+    BlockVector::BlockVector (const unsigned int  n_blocks,
+                              const MPI_Comm     &communicator,
+                              const unsigned int  block_size,
+                              const unsigned int  local_size)
+    {
+      reinit (n_blocks, communicator, block_size, local_size);
+    }
+
+
+
+    inline
+    BlockVector::BlockVector (const std::vector<unsigned int> &block_sizes,
+                              const MPI_Comm     &communicator,
+                              const std::vector<unsigned int> &local_elements)
+    {
+      reinit (block_sizes, communicator, local_elements, false);
+    }
+
+
+    inline
+    BlockVector::BlockVector (const BlockVector &v)
+      :
+      BlockVectorBase<Vector > ()
+    {
+      this->components.resize (v.n_blocks());
+      this->block_indices = v.block_indices;
+
+      for (unsigned int i=0; i<this->n_blocks(); ++i)
+        this->components[i] = v.components[i];
+
+      collect_sizes();     // shuqiangwang
+    }
+
+
+
+    inline
+    BlockVector &
+    BlockVector::operator = (const value_type s)
+    {
+      BaseClass::operator = (s);
+      return *this;
+    }
+
+
+
+    inline
+    BlockVector &
+    BlockVector::operator = (const BlockVector &v)
+    {
+      BaseClass::operator = (v);
+      return *this;
+    }
+
+
+
+    inline
+    BlockVector::~BlockVector ()
+    {}
+
+
+    inline
+    void
+    BlockVector::reinit (const unsigned int  n_blocks,
+                         const MPI_Comm     &communicator,
+                         const unsigned int  block_size,
+                         const unsigned int  local_size,
+                         const bool fast)
+    {
+      reinit(std::vector<unsigned int>(n_blocks, block_size),
+             communicator,
+             std::vector<unsigned int>(n_blocks, local_size),
+             fast);
+    }
+
+
+
+    inline
+    void
+    BlockVector::reinit (const std::vector<unsigned int> &block_sizes,
+                         const MPI_Comm                  &communicator,
+                         const std::vector<unsigned int> &local_sizes,
+                         const bool                       fast)
+    {
+      this->block_indices.reinit (block_sizes);
+      if (this->components.size() != this->n_blocks())
+        this->components.resize(this->n_blocks());
+
+      for (unsigned int i=0; i<this->n_blocks(); ++i)
+        this->components[i].reinit(communicator, block_sizes[i],
+                                   local_sizes[i], fast);
+
+      collect_sizes();     // shuqiangwang
+    }
+
+    inline
+    void
+    BlockVector::reinit (const std::vector<unsigned int> &block_sizes,
+                         const MPI_Comm                  &communicator)   // added by shuqiangwang
+    {
+      this->block_indices.reinit (block_sizes);
+      if (this->components.size() != this->n_blocks())
+        this->components.resize(this->n_blocks());
+
+      collect_sizes();     // shuqiangwang
+    }
+
+    inline
+    void
+    BlockVector::reinit (const BlockVector &v,
+                         const bool fast)
+    {
+      this->block_indices = v.get_block_indices();
+      if (this->components.size() != this->n_blocks())
+        this->components.resize(this->n_blocks());
+
+      for (unsigned int i=0; i<this->n_blocks(); ++i)
+        block(i).reinit(v.block(i), fast);
+
+      collect_sizes();     // shuqiangwang
+    }
+
+
+
+    inline
+    const MPI_Comm &
+    BlockVector::get_mpi_communicator () const
+    {
+      return block(0).get_mpi_communicator();
+    }
+
+
+
+    inline
+    void
+    BlockVector::swap (BlockVector &v)
+    {
+      Assert (this->n_blocks() == v.n_blocks(),
+              ExcDimensionMismatch(this->n_blocks(), v.n_blocks()));
+
+      for (unsigned int i=0; i<this->n_blocks(); ++i)
+        this->components[i].swap (v.components[i]);
+      ::dealii::swap (this->block_indices, v.block_indices);
+    }
+
+
+
+    inline
+    void
+    BlockVector::print (std::ostream       &out,
+                        const unsigned int  precision,
+                        const bool          scientific,
+                        const bool          across) const
+    {
+      for (unsigned int i=0; i<this->n_blocks(); ++i)
+        {
+          if (across)
+            out << 'C' << i << ':';
+          else
+            out << "Component " << i << std::endl;
+          this->components[i].print(out, precision, scientific, across);
+        }
+    }
+
+
+
+    /**
+     * Global function which overloads the default implementation
+     * of the C++ standard library which uses a temporary object. The
+     * function simply exchanges the data of the two vectors.
+     *
+     * @relates PETScWrappers::MPI::BlockVector
+     * @author Wolfgang Bangerth, 2000
+     */
+    inline
+    void swap (BlockVector &u,
+               BlockVector &v)
+    {
+      u.swap (v);
+    }
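+
+    // Usage sketch (illustration only):
+    //
+    //   PETScWrappers::MPI::BlockVector u, v;
+    //   ...initialize u and v with the same block structure...
+    //   swap (u, v);   // same effect as u.swap(v); no element data is copied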
+
+  }
+
+}
+
+DEAL_II_NAMESPACE_CLOSE
+
+#endif  // DEAL_II_USE_PETSC
+
+#endif

Added: branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_solver.h
===================================================================
--- branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_solver.h	                        (rev 0)
+++ branches/s-wang2/for_deal.II/include/deal.II/lac/petsc_solver.h	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,1272 @@
+//---------------------------------------------------------------------------
+//    $Id: petsc_solver.h 27666 2012-11-21 22:05:49Z bangerth $
+//
+//    Copyright (C) 2004, 2005, 2006, 2007, 2009, 2010, 2012 by the deal.II authors
+//
+//    This file is subject to QPL and may not be  distributed
+//    without copyright and license information. Please refer
+//    to the file deal.II/doc/license.html for the  text  and
+//    further information on this license.
+//
+//---------------------------------------------------------------------------
+#ifndef __deal2__petsc_solver_h
+#define __deal2__petsc_solver_h
+
+
+#include <deal.II/base/config.h>
+
+#ifdef DEAL_II_USE_PETSC
+
+#  include <deal.II/lac/exceptions.h>
+#  include <deal.II/lac/solver_control.h>
+#  include <deal.II/base/std_cxx1x/shared_ptr.h>
+
+#  include <petscksp.h>
+
+DEAL_II_NAMESPACE_OPEN
+
+
+namespace PETScWrappers
+{
+  // forward declarations
+  class MatrixBase;
+  class VectorBase;
+  class PreconditionerBase;
+
+
+  /**
+   * Base class for solver classes using the PETSc solvers. Since solvers in
+   * PETSc are selected based on flags passed to a generic solver object,
+   * basically all the actual solver calls happen in this class, and derived
+   * classes simply set the right flags to select one solver or another, or to
+   * set certain parameters for individual solvers.
+   *
+   * Optionally, the user can create a solver derived from the
+   * SolverBase class and can set the default arguments necessary to
+   * solve the linear system of equations with SolverControl. These
+   * default options can be overridden by specifying command line
+   * arguments of the form @p -ksp_*. For example,
+   * @p -ksp_monitor_true_residual prints out true residual norm
+   * (unpreconditioned) at each iteration and @p -ksp_view provides
+   * information about the linear solver and the preconditioner used in
+   * the current context. The type of the solver can also be changed
+   * during runtime by specifying @p -ksp_type {richardson, cg, gmres,
+   * fgmres, ..} to dynamically test the optimal solver along with a
+   * suitable preconditioner set using @p -pc_type {jacobi, bjacobi,
+   * ilu, lu, ..}. There are several other command line options
+   * available to modify the behavior of the PETSc linear solver; they
+   * are described in the <a
+   * href="http://www.mcs.anl.gov/petsc">documentation and manual
+   * pages</a>.
+   *
+   * @note Repeated calls to solve() on a solver object with a Preconditioner
+   * must be used with care. The preconditioner is initialized in the first call
+   * to solve() and subsequent calls reuse the solver and preconditioner
+   * object. This is done for performance reasons. The solver and preconditioner
+   * can be reset by calling reset().
+   *
+   * One of the gotchas of PETSc is that -- in particular in MPI mode -- it
+   * often does not produce very helpful error messages. In order to save
+   * other users some time in searching a hard to track down error, here is
+   * one situation and the error message one gets there:
+   * when you don't specify an MPI communicator to your solver's constructor. In
+   * this case, you will get an error of the following form from each of your
+   * parallel processes:
+   * @verbatim
+   *   [1]PETSC ERROR: PCSetVector() line 1173 in src/ksp/pc/interface/precon.c
+   *   [1]PETSC ERROR:   Arguments must have same communicators!
+   *   [1]PETSC ERROR:   Different communicators in the two objects: Argument # 1 and 2!
+   *   [1]PETSC ERROR: KSPSetUp() line 195 in src/ksp/ksp/interface/itfunc.c
+   * @endverbatim
+   *
+   * This error, on which one can spend a very long time figuring out
+   * what exactly goes wrong, results from not specifying an MPI
+   * communicator. Note that the communicator @em must match that of the
+   * matrix and all vectors in the linear system which we want to
+   * solve. Aggravating the situation is the fact that the default
+   * argument to the solver classes, @p PETSC_COMM_SELF, is the
+   * appropriate argument for the sequential case (which is why it is
+   * the default argument), so this error only shows up in parallel
+   * mode.
+   *
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverBase
+  {
+  public:
+    /**
+     * Constructor. Takes the solver
+     * control object and the MPI
+     * communicator over which parallel
+     * computations are to happen.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverBase (SolverControl &cn,
+                const MPI_Comm &mpi_communicator);
+
+    /**
+     * Destructor.
+     */
+    virtual ~SolverBase ();
+
+    /**
+     * Solve the linear system
+     * <tt>Ax=b</tt>. Depending on the
+     * information provided by derived
+     * classes and the object passed as a
+     * preconditioner, one of the linear
+     * solvers and preconditioners of PETSc
+     * is chosen.  Repeated calls to
+     * solve() do not reconstruct the
+     * preconditioner for performance
+     * reasons. See the class documentation.
+     */
+    void
+    solve (const MatrixBase         &A,
+           VectorBase               &x,
+           const VectorBase         &b,
+           const PreconditionerBase &preconditioner);
+
+
+    /**
+     * Resets the contained preconditioner
+     * and solver object. See class
+     * description for more details.
+     */
+    virtual void reset();
+
+
+    /**
+     * Sets a prefix name for the solver
+     * object. Useful when customizing the
+     * PETSc KSP object with command-line
+     * options.
+     */
+    void set_prefix(const std::string &prefix);
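+
+    // Sketch (illustration only): with a prefix set, PETSc command line
+    // options qualified by that prefix steer this solver, e.g.
+    //
+    //   solver.set_prefix ("stokes_");
+    //   // ...and run the executable with:  -stokes_ksp_type gmres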
+
+
+    /**
+     * Access to object that controls
+     * convergence.
+     */
+    SolverControl &control() const;
+
+    /**
+     * Exception
+     */
+    DeclException1 (ExcPETScError,
+                    int,
+                    << "An error with error number " << arg1
+                    << " occurred while calling a PETSc function");
+
+  protected:
+
+    /**
+     * Reference to the object that
+     * controls convergence of the
+     * iterative solver. In fact, for these
+     * PETSc wrappers, PETSc does so
+     * itself, but we copy the data from
+     * this object before starting the
+     * solution process, and copy the data
+     * back into it afterwards.
+     */
+    SolverControl &solver_control;
+
+    /**
+     * Copy of the MPI communicator object
+     * to be used for the solver.
+     */
+    const MPI_Comm mpi_communicator;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * requested by the derived class.
+     */
+    virtual void set_solver_type (KSP &ksp) const = 0;
+
+    /**
+     * Solver prefix name to qualify options
+     * specific to the PETSc KSP object in the
+     * current context.
+     * Note: A hyphen (-) must NOT be given
+     * at the beginning of the prefix name.
+     * The first character of all runtime
+     * options is AUTOMATICALLY the hyphen.
+     */
+    std::string prefix_name;
+
+  private:
+    /**
+     * A function that is used in PETSc as
+     * a callback to check on
+     * convergence. It takes the
+     * information provided from PETSc and
+     * checks it against deal.II's own
+     * SolverControl objects to see if
+     * convergence has been reached.
+     */
+    static
+#ifdef PETSC_USE_64BIT_INDICES
+    PetscErrorCode
+#else
+    int
+#endif
+    convergence_test (KSP                 ksp,
+#ifdef PETSC_USE_64BIT_INDICES
+                      const PetscInt      iteration,
+#else
+                      const int           iteration,
+#endif
+                      const PetscReal     residual_norm,
+                      KSPConvergedReason *reason,
+                      void               *solver_control);
+
+    /**
+     * A structure that contains the PETSc
+     * solver and preconditioner
+     * objects. This object is preserved
+     * between subsequent calls to the
+     * solver if the same preconditioner is
+     * used as in the previous solver
+     * step. This may save some computation
+     * time, if setting up a preconditioner
+     * is expensive, such as in the case of
+     * an ILU for example.
+     *
+     * The actual declaration of this class
+     * is complicated by the fact that
+     * PETSc changed its solver interface
+     * completely and incompatibly between
+     * versions 2.1.6 and 2.2.0 :-(
+     *
+     * Objects of this type are explicitly
+     * created, but are destroyed when the
+     * surrounding solver object goes out
+     * of scope, or when we assign a new
+     * value to the pointer to this
+     * object. The respective *Destroy
+     * functions are therefore written into
+     * the destructor of this object, even
+     * though the object does not have a
+     * constructor.
+     */
+    struct SolverData
+    {
+      /**
+       * Destructor
+       */
+      ~SolverData ();
+
+      /**
+       * Objects for Krylov subspace
+       * solvers and preconditioners.
+       */
+      KSP  ksp;
+      PC   pc;
+    };
+
+    /**
+     * Pointer to an object that stores the
+     * solver context. This is recreated in
+     * the main solver routine if
+     * necessary.
+     */
+    std_cxx1x::shared_ptr<SolverData> solver_data;
+  };
+
+
+
+  /**
+   * An implementation of the solver interface using the PETSc Richardson
+   * solver.
+   *
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverRichardson : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {
+      /**
+       * Constructor. By default,
+       * set the damping parameter
+       * to one.
+       */
+      AdditionalData (const double omega = 1);
+
+      /**
+       * Relaxation parameter.
+       */
+      double omega;
+    };
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverRichardson (SolverControl        &cn,
+                      const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+                      const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+
+
+  /**
+   * An implementation of the solver interface using the PETSc Chebychev
+   * solver.
+   *
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverChebychev : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {};
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverChebychev (SolverControl        &cn,
+                     const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+                     const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+
+
+  /**
+   * An implementation of the solver interface using the PETSc CG
+   * solver.
+   *
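+   * A hedged usage sketch:
+   * @code
+   *   // solver_control, mpi_communicator, system_matrix, solution and
+   *   // system_rhs are assumed to exist and share one communicator
+   *   PETScWrappers::SolverCG solver (solver_control, mpi_communicator);
+   *   PETScWrappers::PreconditionBlockJacobi preconditioner (system_matrix);
+   *   solver.solve (system_matrix, solution, system_rhs, preconditioner);
+   * @endcode
+   *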
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverCG : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {};
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverCG (SolverControl        &cn,
+              const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+              const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+
+
+  /**
+   * An implementation of the solver interface using the PETSc BiCG
+   * solver.
+   *
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverBiCG : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {};
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverBiCG (SolverControl        &cn,
+                const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+                const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+
+
+  /**
+   * An implementation of the solver interface using the PETSc GMRES
+   * solver.
+   *
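+   * A hedged sketch of adjusting the restart length through
+   * AdditionalData (other names assumed as in the examples above):
+   * @code
+   *   PETScWrappers::SolverGMRES::AdditionalData
+   *     data (50,    // restart every 50 iterations
+   *           true); // use right preconditioning
+   *   PETScWrappers::SolverGMRES solver (solver_control,
+   *                                      mpi_communicator, data);
+   * @endcode
+   *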
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverGMRES : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {
+      /**
+       * Constructor. By default, set the
+       * number of temporary vectors to
+       * 30, i.e. do a restart every 30
+       * iterations.
+       */
+      AdditionalData (const unsigned int restart_parameter = 30,
+                      const bool right_preconditioning = false);
+
+      /**
+       * Maximum number of
+       * tmp vectors.
+       */
+      unsigned int restart_parameter;
+
+      /**
+       * Flag for right
+       * preconditioning.
+       */
+      bool right_preconditioning;
+    };
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverGMRES (SolverControl        &cn,
+                 const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+                 const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+
+
+  /**
+   * An implementation of the solver interface using the PETSc BiCGStab
+   * solver.
+   *
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverBicgstab : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {};
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverBicgstab (SolverControl        &cn,
+                    const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+                    const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+  /**
+   * An implementation of the solver interface using the PETSc CG Squared
+   * solver.
+   *
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverCGS : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {};
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverCGS (SolverControl        &cn,
+               const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+               const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+
+
+  /**
+   * An implementation of the solver interface using the PETSc TFQMR
+   * solver.
+   *
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverTFQMR : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {};
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverTFQMR (SolverControl        &cn,
+                 const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+                 const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+
+
+
+  /**
+   * An implementation of the solver interface using the PETSc TFQMR-2 solver
+   * (called TCQMR in PETSc). Note that this solver had a serious bug in
+   * versions up to and including PETSc 2.1.6, in that it did not check
+   * convergence and always returned an error code. Thus, this class will abort
+   * with an error indicating failure to converge with PETSc 2.1.6 and
+   * prior. This should be fixed in later versions of PETSc, though.
+   *
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverTCQMR : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {};
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverTCQMR (SolverControl        &cn,
+                 const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+                 const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+
+
+  /**
+   * An implementation of the solver interface using the PETSc CR
+   * solver.
+   *
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverCR : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {};
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverCR (SolverControl        &cn,
+              const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+              const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+
+
+  /**
+   * An implementation of the solver interface using the PETSc Least Squares
+   * solver.
+   *
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004
+   */
+  class SolverLSQR : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {};
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverLSQR (SolverControl        &cn,
+                const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+                const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+
+  /**
+   * An implementation of the solver interface using the PETSc PREONLY
+   * solver. Strictly speaking, this is NOT a real solution algorithm: solve() only
+   * applies the preconditioner once and returns immediately. Its only purpose
+   * is to provide a solver object for cases where the preconditioner should be
+   * used as a real solver. It is very useful in conjunction with the complete LU
+   * decomposition preconditioner <tt>PreconditionLU</tt>, which in
+   * conjunction with this solver class becomes a direct solver.
+   *
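+   * A hedged sketch of a direct solve (assuming a matrix and vectors
+   * that live on a single process):
+   * @code
+   *   SolverControl control (1, 0);  // a single application suffices
+   *   PETScWrappers::SolverPreOnly solver (control);
+   *   PETScWrappers::PreconditionLU lu (system_matrix);
+   *   solver.solve (system_matrix, solution, system_rhs, lu);
+   * @endcode
+   *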
+   * @ingroup PETScWrappers
+   * @author Wolfgang Bangerth, 2004, Oliver Kayser-Herold, 2004
+   */
+  class SolverPreOnly : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data struct to
+     * pipe additional data to the
+     * solver.
+     */
+    struct AdditionalData
+    {};
+
+    /**
+     * Constructor. In contrast to
+     * deal.II's own solvers, there is no
+     * need to give a vector memory
+     * object. However, PETSc solvers want
+     * to have an MPI communicator context
+     * over which computations are
+     * parallelized. By default,
+     * @p PETSC_COMM_SELF is used here,
+     * but you can change this. Note that
+     * for single processor (non-MPI)
+     * versions, this parameter does not
+     * have any effect.
+     *
+     * The last argument takes a structure
+     * with additional, solver dependent
+     * flags for tuning.
+     *
+     * Note that the communicator used here
+     * must match the communicator used in
+     * the system matrix, solution, and
+     * right hand side object of the solve
+     * to be done with this
+     * solver. Otherwise, PETSc will
+     * generate hard to track down errors,
+     * see the documentation of the
+     * SolverBase class.
+     */
+    SolverPreOnly (SolverControl        &cn,
+                   const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+                   const AdditionalData &data = AdditionalData());
+
+  protected:
+    /**
+     * Store a copy of the flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    /**
+     * Function that takes a Krylov
+     * Subspace Solver context object, and
+     * sets the type of solver that is
+     * appropriate for this class.
+     */
+    virtual void set_solver_type (KSP &ksp) const;
+  };
+
+  /**
+   * An implementation of the solver interface using the sparse direct MUMPS
+   * solver through PETSc. This class has the usual interface of all other
+   * solver classes but it is of course different in that it doesn't implement
+   * an iterative solver. As a consequence, things like the SolverControl object
+   * have no particular meaning here.
+   *
+   * MUMPS allows one to make use of symmetry in the matrix. In this class this is
+   * made possible by the set_symmetric_mode() function. If your matrix is
+   * symmetric, you can use this class as follows:
+   * @code
+   *    SolverControl cn;
+   *    PETScWrappers::SparseDirectMUMPS solver(cn, mpi_communicator);
+   *    solver.set_symmetric_mode(true);
+   *    solver.solve(system_matrix, solution, system_rhs);
+   * @endcode
+   *
+   * @note The class internally calls KSPSetFromOptions, so you can use
+   * all PETSc run-time parameters of the MATSOLVERMUMPS package.
+   * See http://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/Mat/MATSOLVERMUMPS.html
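+   * For example, a (hypothetical) run could pass
+   * <tt>-mat_mumps_icntl_4 2</tt> on the command line to raise the
+   * verbosity of the MUMPS factorization.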
+   *
+   * @ingroup PETScWrappers
+   * @author Daniel Brauss, Alexander Grayver, 2012
+   */
+  class SparseDirectMUMPS : public SolverBase
+  {
+  public:
+    /**
+     * Standardized data structure
+     * to pipe additional data to
+     * the solver.
+     */
+    struct AdditionalData
+    {};
+    /**
+     * Constructor
+     */
+    SparseDirectMUMPS (SolverControl        &cn,
+                       const MPI_Comm       &mpi_communicator = PETSC_COMM_SELF,
+                       const AdditionalData &data = AdditionalData());
+
+    /**
+     * The method to solve the
+     * linear system.
+     */
+    void solve (const MatrixBase &A,
+                VectorBase       &x,
+                const VectorBase &b);
+
+    /**
+     * This method allows one to take
+     * advantage of a symmetric system
+     * matrix by using an LDL^T
+     * decomposition instead of the more
+     * expensive LU. The argument
+     * indicates whether the matrix is
+     * symmetric or not.
+     */
+    void set_symmetric_mode (const bool flag);
+
+  protected:
+    /**
+     * Store a copy of flags for this
+     * particular solver.
+     */
+    const AdditionalData additional_data;
+
+    virtual void set_solver_type (KSP &ksp) const;
+
+  private:
+    /**
+     * A function that is used in PETSc
+     * as a callback to check convergence.
+     * It takes the information provided
+     * from PETSc and checks it against
+     * deal.II's own SolverControl objects
+     * to see if convergence has been reached.
+     */
+    static
+#ifdef PETSC_USE_64BIT_INDICES
+    PetscErrorCode
+#else
+    int
+#endif
+    convergence_test (KSP                ksp,
+#ifdef PETSC_USE_64BIT_INDICES
+                      const PetscInt     iteration,
+#else
+                      const int          iteration,
+#endif
+                      const PetscReal    residual_norm,
+                      KSPConvergedReason *reason,
+                      void               *solver_control);
+    /**
+     * A structure that contains the
+     * PETSc solver and preconditioner
+     * objects. Since the solve() member
+     * function of the base class is not
+     * used here, the private SolverData
+     * struct of the base class cannot be
+     * used either.
+     */
+    struct SolverDataMUMPS
+    {
+      KSP ksp;
+      PC  pc;
+    };
+
+    std_cxx1x::shared_ptr<SolverDataMUMPS> solver_data;
+
+    /**
+     * Flag that specifies whether the
+     * matrix being factorized is symmetric
+     * or not. It influences the type of
+     * preconditioner used
+     * (PCLU or PCCHOLESKY).
+     */
+    bool symmetric_mode;
+  };
+}
+
+DEAL_II_NAMESPACE_CLOSE
+
+#endif // DEAL_II_USE_PETSC
+
+/*----------------------------   petsc_solver.h     ---------------------------*/
+
+#endif
+/*----------------------------   petsc_solver.h     ---------------------------*/

Added: branches/s-wang2/for_deal.II/include/deal.II/lac/trilinos_sparse_matrix.h
===================================================================
--- branches/s-wang2/for_deal.II/include/deal.II/lac/trilinos_sparse_matrix.h	                        (rev 0)
+++ branches/s-wang2/for_deal.II/include/deal.II/lac/trilinos_sparse_matrix.h	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,3483 @@
+//---------------------------------------------------------------------------
+//    $Id: trilinos_sparse_matrix.h 27628 2012-11-20 22:49:26Z heister $
+//
+//    Copyright (C) 2008, 2009, 2010, 2011, 2012 by the deal.II authors
+//
+//    This file is subject to QPL and may not be  distributed
+//    without copyright and license information. Please refer
+//    to the file deal.II/doc/license.html for the  text  and
+//    further information on this license.
+//
+//---------------------------------------------------------------------------
+#ifndef __deal2__trilinos_sparse_matrix_h
+#define __deal2__trilinos_sparse_matrix_h
+
+
+#include <deal.II/base/config.h>
+
+#ifdef DEAL_II_USE_TRILINOS
+
+#  include <deal.II/base/std_cxx1x/shared_ptr.h>
+#  include <deal.II/base/subscriptor.h>
+#  include <deal.II/base/index_set.h>
+#  include <deal.II/lac/full_matrix.h>
+#  include <deal.II/lac/exceptions.h>
+#  include <deal.II/lac/trilinos_vector_base.h>
+#  include <deal.II/lac/parallel_vector.h>
+
+#  include <vector>
+#  include <cmath>
+#  include <memory>
+
+#  define TrilinosScalar double
+#  include <Epetra_FECrsMatrix.h>
+#  include <Epetra_Map.h>
+#  include <Epetra_CrsGraph.h>
+#  include <Epetra_Vector.h>
+#  ifdef DEAL_II_COMPILER_SUPPORTS_MPI
+#    include <Epetra_MpiComm.h>
+#    include "mpi.h"
+#  else
+#    include "Epetra_SerialComm.h"
+#  endif
+
+DEAL_II_NAMESPACE_OPEN
+
+// forward declarations
+template <typename MatrixType> class BlockMatrixBase;
+
+template <typename number> class SparseMatrix;
+class SparsityPattern;
+
+namespace TrilinosWrappers
+{
+  // forward declarations
+  class VectorBase;
+  class SparseMatrix;
+  class SparsityPattern;
+
+  /**
+   * Iterators for Trilinos matrices
+   */
+  namespace MatrixIterators
+  {
+    /**
+     * STL conforming iterator. This class acts as an iterator walking
+     * over the elements of Trilinos matrices. The implementation of this
+     * class is similar to the one for PETSc matrices.
+     *
+     * Note that Trilinos stores the elements within each row in ascending
+     * order. This is opposed to the deal.II sparse matrix style where the
+     * diagonal element (if it exists) is stored before all other values, and
+     * the PETSc sparse matrices, where one can't guarantee a certain order of
+     * the elements.
+     *
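+     * A hedged sketch of a typical traversal (assuming @p matrix is an
+     * assembled TrilinosWrappers::SparseMatrix):
+     * @code
+     *   for (TrilinosWrappers::SparseMatrix::const_iterator
+     *          p = matrix.begin(); p != matrix.end(); ++p)
+     *     std::cout << p->row() << ' ' << p->column()
+     *               << ' ' << p->value() << std::endl;
+     * @endcode
+     *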
+     * @ingroup TrilinosWrappers
+     * @author Martin Kronbichler, Wolfgang Bangerth, 2008
+     */
+    class const_iterator
+    {
+    private:
+      /**
+       * Accessor class for iterators
+       */
+      class Accessor
+      {
+      public:
+        /**
+         * Constructor. Since we use
+         * accessors only for read
+         * access, a const matrix
+         * pointer is sufficient.
+         */
+        Accessor (const SparseMatrix *matrix,
+                  const unsigned int  row,
+                  const unsigned int  index);
+
+        /**
+         * Row number of the element
+         * represented by this object.
+         */
+        unsigned int row() const;
+
+        /**
+         * Index in row of the element
+         * represented by this object.
+         */
+        unsigned int index() const;
+
+        /**
+         * Column number of the element
+         * represented by this object.
+         */
+        unsigned int column() const;
+
+        /**
+         * Value of this matrix entry.
+         */
+        TrilinosScalar value() const;
+
+        /**
+         * Exception
+         */
+        DeclException0 (ExcBeyondEndOfMatrix);
+
+        /**
+         * Exception
+         */
+        DeclException3 (ExcAccessToNonlocalRow,
+                        int, int, int,
+                        << "You tried to access row " << arg1
+                        << " of a distributed matrix, but only rows "
+                        << arg2 << " through " << arg3
+                        << " are stored locally and can be accessed.");
+
+      private:
+        /**
+         * The matrix accessed.
+         */
+        mutable SparseMatrix *matrix;
+
+        /**
+         * Current row number.
+         */
+        unsigned int a_row;
+
+        /**
+         * Current index in row.
+         */
+        unsigned int a_index;
+
+        /**
+         * Cache where we store the
+         * column indices of the
+         * present row. This is
+         * necessary, since Trilinos
+         * makes access to the elements
+         * of its matrices rather hard,
+         * and it is much more
+         * efficient to copy all column
+         * entries of a row once when
+         * we enter it than repeatedly
+         * asking Trilinos for
+         * individual ones. This also
+         * makes some sense since it is
+         * likely that we will access
+         * them sequentially anyway.
+         *
+         * In order to make copying of
+         * iterators/accessor of
+         * acceptable performance, we
+         * keep a shared pointer to
+         * these entries so that more
+         * than one accessor can access
+         * this data if necessary.
+         */
+        std_cxx1x::shared_ptr<std::vector<unsigned int> > colnum_cache;
+
+        /**
+         * Similar cache for the values
+         * of this row.
+         */
+        std_cxx1x::shared_ptr<std::vector<TrilinosScalar> > value_cache;
+
+        /**
+         * Discard the old row caches
+         * (they may still be used by
+         * other accessors) and
+         * generate new ones for the
+         * row pointed to presently by
+         * this accessor.
+         */
+        void visit_present_row ();
+
+        /**
+         * Make enclosing class a
+         * friend.
+         */
+        friend class const_iterator;
+      };
+
+    public:
+
+      /**
+       * Constructor. Create an
+       * iterator into the matrix @p
+       * matrix for the given row and
+       * the index within it.
+       */
+      const_iterator (const SparseMatrix *matrix,
+                      const unsigned int  row,
+                      const unsigned int  index);
+
+      /**
+       * Prefix increment.
+       */
+      const_iterator &operator++ ();
+
+      /**
+       * Postfix increment.
+       */
+      const_iterator operator++ (int);
+
+      /**
+       * Dereferencing operator.
+       */
+      const Accessor &operator* () const;
+
+      /**
+       * Dereferencing operator.
+       */
+      const Accessor *operator-> () const;
+
+      /**
+       * Comparison. True, if both
+       * iterators point to the same
+       * matrix position.
+       */
+      bool operator == (const const_iterator &) const;
+
+      /**
+       * Inverse of <tt>==</tt>.
+       */
+      bool operator != (const const_iterator &) const;
+
+      /**
+       * Comparison operator. Result
+       * is true if either the first
+       * row number is smaller or if
+       * the row numbers are equal
+       * and the first index is
+       * smaller.
+       */
+      bool operator < (const const_iterator &) const;
+
+      /**
+       * Exception
+       */
+      DeclException2 (ExcInvalidIndexWithinRow,
+                      int, int,
+                      << "Attempt to access element " << arg2
+                      << " of row " << arg1
+                      << " which doesn't have that many elements.");
+
+    private:
+      /**
+       * Store an object of the
+       * accessor class.
+       */
+      Accessor accessor;
+    };
+
+  }
+
+
+  /**
+   * This class implements a wrapper to use the Trilinos distributed
+   * sparse matrix class Epetra_FECrsMatrix. This is precisely the kind of
+   * matrix we deal with all the time - we most likely get it from some
+   * assembly process, where also entries not locally owned might need to
+   * be written and hence need to be forwarded to the owner process.  This
+   * class is designed to be used in a distributed memory architecture
+   * with MPI underneath, but it works equally well for
+   * serial processes. The only requirement for this class to work is that
+   * Trilinos has been installed with the same compiler as is used for
+   * generating deal.II.
+   *
+   * The interface of this class is modeled after the existing
+   * SparseMatrix class in deal.II. It has almost the same member
+   * functions, and is often exchangeable. However, since Trilinos only
+   * supports a single scalar type (double), it is not templated, and only
+   * works with doubles.
+   *
+   * Note that Trilinos only guarantees that operations do what you expect
+   * if the function @p GlobalAssemble has been called after matrix
+   * assembly. Therefore, you need to call SparseMatrix::compress()
+   * before you actually use the matrix. This also calls @p FillComplete,
+   * which compresses the storage format for sparse matrices by discarding
+   * unused elements. Trilinos allows one to continue assembling the
+   * matrix after calls to these functions, though.
+   *
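+   * A hedged assembly sketch (assuming @p sparsity is an already
+   * compressed SparsityPattern and @p dst, @p src are suitably sized
+   * vectors):
+   * @code
+   *   TrilinosWrappers::SparseMatrix matrix;
+   *   matrix.reinit (sparsity);
+   *   matrix.set (0, 0, 2.0);   // write some entries ...
+   *   matrix.add (0, 1, -1.0);
+   *   matrix.compress ();       // calls GlobalAssemble/FillComplete
+   *   matrix.vmult (dst, src);  // only safe after compress()
+   * @endcode
+   *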
+   * @ingroup TrilinosWrappers
+   * @ingroup Matrix1
+   * @author Martin Kronbichler, Wolfgang Bangerth, 2008, 2009
+   */
+  class SparseMatrix : public Subscriptor
+  {
+  public:
+    /**
+     * A structure that describes
+     * some of the traits of this
+     * class in terms of its run-time
+     * behavior. Some other classes
+     * (such as the block matrix
+     * classes) that take one or
+     * other of the matrix classes as
+     * its template parameters can
+     * tune their behavior based on
+     * the variables in this class.
+     */
+    struct Traits
+    {
+      /**
+       * It is safe to elide additions
+       * of zeros to individual
+       * elements of this matrix.
+       */
+      static const bool zero_addition_can_be_elided = true;
+    };
+
+    /**
+     * Declare a typedef for the
+     * iterator class.
+     */
+    typedef MatrixIterators::const_iterator const_iterator;
+
+    /**
+     * Declare a typedef in analogy
+     * to all the other container
+     * classes.
+     */
+    typedef TrilinosScalar value_type;
+
+    /**
+     * @name Constructors and initialization.
+     */
+//@{
+    /**
+     * Default constructor. Generates
+     * an empty (zero-size) matrix.
+     */
+    SparseMatrix ();
+
+    /**
+     * Generate a matrix that is completely
+     * stored locally, having #m rows and
+     * #n columns.
+     *
+     * The number of column entries per
+     * row is specified via the maximum
+     * number of entries argument.
+     */
+    SparseMatrix (const unsigned int  m,
+                  const unsigned int  n,
+                  const unsigned int  n_max_entries_per_row);
+
+    /**
+     * Generate a matrix that is completely
+     * stored locally, having #m rows and
+     * #n columns.
+     *
+     * The vector
+     * <tt>n_entries_per_row</tt>
+     * specifies the number of entries in
+     * each row.
+     */
+    SparseMatrix (const unsigned int               m,
+                  const unsigned int               n,
+                  const std::vector<unsigned int> &n_entries_per_row);
+
+    /**
+     * Generate a matrix from a Trilinos
+     * sparsity pattern object.
+     */
+    SparseMatrix (const SparsityPattern &InputSparsityPattern);
+
+    /**
+     * Copy constructor. Sets the
+     * calling matrix to be the same
+     * as the input matrix, i.e.,
+     * using the same sparsity
+     * pattern and entries.
+     */
+    SparseMatrix (const SparseMatrix &InputMatrix);
+
+    /**
+     * Destructor. Made virtual so
+     * that one can use pointers to
+     * this class.
+     */
+    virtual ~SparseMatrix ();
+
+    /**
+     * This function initializes the
+     * Trilinos matrix with a deal.II
+     * sparsity pattern, i.e. it makes
+     * the Trilinos Epetra matrix know
+     * the position of nonzero entries
+     * according to the sparsity
+     * pattern. This function is meant
+     * for use in serial programs, where
+     * there is no need to specify how
+     * the matrix is going to be
+     * distributed among different
+     * processors. This function works in
+     * %parallel, too, but it is
+     * recommended to manually specify
+     * the %parallel partitioning of the
+     * matrix using an Epetra_Map. When
+     * run in %parallel, it is currently
+     * necessary that each processor
+     * holds the sparsity_pattern
+     * structure because each processor
+     * sets its rows.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * dead lock.
+     */
+    template<typename SparsityType>
+    void reinit (const SparsityType &sparsity_pattern);
+
+    /**
+     * This function reinitializes the
+     * Trilinos sparse matrix from a
+     * (possibly distributed) Trilinos
+     * sparsity pattern.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * dead lock.
+     */
+    void reinit (const SparsityPattern &sparsity_pattern);
+
+    /**
+     * This function copies the content
+     * in <tt>sparse_matrix</tt> to the
+     * calling matrix.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * dead lock.
+     */
+    void reinit (const SparseMatrix &sparse_matrix);
+
+    /**
+     * This function initializes the
+     * Trilinos matrix using the deal.II
+     * sparse matrix and the entries
+     * stored therein. It uses a
+     * threshold to copy only elements
+     * with modulus larger than the
+     * threshold (so zeros in the deal.II
+     * matrix can be filtered away).
+     *
+     * The optional parameter
+     * <tt>copy_values</tt> decides
+     * whether only the sparsity
+     * structure of the input matrix
+     * should be used or the matrix
+     * entries should be copied, too.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * dead lock.
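+     *
+     * A hedged sketch (assuming
+     * @p deal_matrix is an assembled
+     * dealii::SparseMatrix<double>):
+     * @code
+     *   TrilinosWrappers::SparseMatrix matrix;
+     *   matrix.reinit (deal_matrix,
+     *                  1e-13); // drop entries below 1e-13 in modulus
+     * @endcode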
+     */
+    template <typename number>
+    void reinit (const ::dealii::SparseMatrix<number> &dealii_sparse_matrix,
+                 const double                          drop_tolerance=1e-13,
+                 const bool                            copy_values=true,
+                 const ::dealii::SparsityPattern      *use_this_sparsity=0);
+
+    /**
+     * This reinit function takes as
+     * input a Trilinos Epetra_CrsMatrix
+     * and copies its sparsity
+     * pattern. If so requested, even the
+     * content (values) will be copied.
+     */
+    void reinit (const Epetra_CrsMatrix &input_matrix,
+                 const bool              copy_values = true);
+//@}
+    /**
+     * @name Constructors and initialization using an Epetra_Map description
+     */
+//@{
+    /**
+     * Constructor using an Epetra_Map to
+     * describe the %parallel
+     * partitioning. The parameter @p
+     * n_max_entries_per_row sets the
+     * number of nonzero entries in each
+     * row that will be allocated. Note
+     * that this number does not need to
+     * be exact, and it is even allowed
+     * that the actual matrix structure
+     * has more nonzero entries than
+     * specified in the
+     * constructor. However, it is still
+     * advantageous to provide good
+     * estimates here since this will
+     * considerably increase the
+     * performance of the matrix
+     * setup. Note that there is no effect
+     * on the performance of
+     * matrix-vector products, since
+     * Trilinos reorganizes the matrix
+     * memory prior to use (in the
+     * compress() step).
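+     *
+     * A hedged sketch (distributing
+     * @p n_rows rows across
+     * MPI_COMM_WORLD; all names are
+     * illustrative):
+     * @code
+     *   Epetra_Map map (static_cast<int>(n_rows), 0,
+     *                   Epetra_MpiComm (MPI_COMM_WORLD));
+     *   TrilinosWrappers::SparseMatrix matrix (map, 5);
+     * @endcode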
+     */
+    SparseMatrix (const Epetra_Map   &parallel_partitioning,
+                  const unsigned int  n_max_entries_per_row = 0);
+
+    /**
+     * Same as before, but now set a
+     * value of nonzeros for each matrix
+     * row. Since we know the number of
+     * elements in the matrix exactly in
+     * this case, we can already allocate
+     * the right amount of memory, which
+     * makes the creation process
+     * including the insertion of nonzero
+     * elements by the respective
+     * SparseMatrix::reinit call
+     * considerably faster.
+     */
+    SparseMatrix (const Epetra_Map                &parallel_partitioning,
+                  const std::vector<unsigned int> &n_entries_per_row);
+
+    /**
+     * This constructor is similar to the
+     * one above, but it now takes two
+     * different Epetra maps for rows and
+     * columns. This interface is meant
+     * to be used for generating
+     * rectangular matrices, where one
+     * map describes the %parallel
+     * partitioning of the dofs
+     * associated with the matrix rows
+     * and the other one the partitioning
+     * of dofs in the matrix
+     * columns. Note that there is no
+     * real parallelism along the columns
+     * &ndash; the processor that owns a
+     * certain row always owns all the
+     * column elements, no matter how far
+     * they might be spread out. The
+     * second Epetra_Map is only used to
+     * specify the number of columns and
+     * for internal arrangements when
+     * doing matrix-vector products with
+     * vectors based on that column map.
+     *
+     * The integer input @p
+     * n_max_entries_per_row defines the
+     * number of column entries per row
+     * that will be allocated.
+     */
+    SparseMatrix (const Epetra_Map   &row_parallel_partitioning,
+                  const Epetra_Map   &col_parallel_partitioning,
+                  const unsigned int  n_max_entries_per_row = 0);
+
+    /**
+     * This constructor is similar to the
+     * one above, but it now takes two
+     * different Epetra maps for rows and
+     * columns. This interface is meant
+     * to be used for generating
+     * rectangular matrices, where one
+     * map specifies the %parallel
+     * distribution of degrees of freedom
+     * associated with matrix rows and
+     * the second one specifies the
+     * %parallel distribution of the dofs
+     * associated with columns in the
+     * matrix. The second map also
+     * provides information for the
+     * internal arrangement in matrix
+     * vector products (i.e., the
+     * distribution of the vectors this matrix
+     * is to be multiplied with), but is
+     * not used for the distribution of
+     * the columns &ndash; rather, all
+     * column elements of a row are
+     * stored on the same processor in
+     * any case. The vector
+     * <tt>n_entries_per_row</tt>
+     * specifies the number of entries in
+     * each row of the newly generated
+     * matrix.
+     */
+    SparseMatrix (const Epetra_Map                &row_parallel_partitioning,
+                  const Epetra_Map                &col_parallel_partitioning,
+                  const std::vector<unsigned int> &n_entries_per_row);
+
+    /**
+     * This function initializes the
+     * Trilinos Epetra matrix according to
+     * the specified sparsity_pattern, and
+     * also reassigns the matrix rows to
+     * different processes according to a
+     * user-supplied Epetra map. In
+     * programs following the style of the
+     * tutorial programs, this function
+     * (and the respective call for a
+     * rectangular matrix) are the natural
+     * way to initialize the matrix size,
+     * its distribution among the MPI
+     * processes (if run in %parallel) as
+     * well as the location of non-zero
+     * elements. Trilinos stores the
+     * sparsity pattern internally, so it
+     * won't be needed any more after this
+     * call, in contrast to the deal.II own
+     * object. The optional argument @p
+     * exchange_data can be used for
+     * reinitialization with a sparsity
+     * pattern that is not fully
+     * constructed. This feature is only
+     * implemented for input sparsity
+     * patterns of type
+     * CompressedSimpleSparsityPattern. If
+     * the flag is not set, each processor
+     * just sets the elements in the
+     * sparsity pattern that belong to its
+     * rows.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * dead lock.
+     */
+    template<typename SparsityType>
+    void reinit (const Epetra_Map    &parallel_partitioning,
+                 const SparsityType &sparsity_pattern,
+                 const bool          exchange_data = false);
+
+    /**
+     * This function is similar to the
+     * other initialization function
+     * above, but now also reassigns the
+     * matrix rows and columns according
+     * to two user-supplied Epetra maps.
+     * To be used for rectangular
+     * matrices. The optional argument @p
+     * exchange_data can be used for
+     * reinitialization with a sparsity
+     * pattern that is not fully
+     * constructed. This feature is only
+     * implemented for input sparsity
+     * patterns of type
+     * CompressedSimpleSparsityPattern.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * dead lock.
+     */
+    template<typename SparsityType>
+    void reinit (const Epetra_Map    &row_parallel_partitioning,
+                 const Epetra_Map    &col_parallel_partitioning,
+                 const SparsityType &sparsity_pattern,
+                 const bool          exchange_data = false);
+
+    /**
+     * This function initializes the
+     * Trilinos matrix using the deal.II
+     * sparse matrix and the entries
+     * stored therein. It uses a
+     * threshold to copy only elements
+     * with modulus larger than the
+     * threshold (so zeros in the deal.II
+     * matrix can be filtered away). In
+     * contrast to the other reinit
+     * function with deal.II sparse
+     * matrix argument, this function
+     * takes a %parallel partitioning
+     * specified by the user instead of
+     * internally generating it.
+     *
+     * The optional parameter
+     * <tt>copy_values</tt> decides
+     * whether only the sparsity
+     * structure of the input matrix
+     * should be used or the matrix
+     * entries should be copied, too.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * dead lock.
+     */
+    template <typename number>
+    void reinit (const Epetra_Map                     &parallel_partitioning,
+                 const ::dealii::SparseMatrix<number> &dealii_sparse_matrix,
+                 const double                          drop_tolerance=1e-13,
+                 const bool                            copy_values=true,
+                 const ::dealii::SparsityPattern      *use_this_sparsity=0);
+
+    /**
+     * This function is similar to the
+     * other initialization function with
+     * deal.II sparse matrix input above,
+     * but now takes Epetra maps for both
+     * the rows and the columns of the
+     * matrix. Chosen for rectangular
+     * matrices.
+     *
+     * The optional parameter
+     * <tt>copy_values</tt> decides
+     * whether only the sparsity
+     * structure of the input matrix
+     * should be used or the matrix
+     * entries should be copied, too.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * dead lock.
+     */
+    template <typename number>
+    void reinit (const Epetra_Map                      &row_parallel_partitioning,
+                 const Epetra_Map                      &col_parallel_partitioning,
+                 const ::dealii::SparseMatrix<number> &dealii_sparse_matrix,
+                 const double                           drop_tolerance=1e-13,
+                 const bool                             copy_values=true,
+                 const ::dealii::SparsityPattern      *use_this_sparsity=0);
+//@}
+    /**
+     * @name Constructors and initialization using an IndexSet description
+     */
+//@{
+    /**
+     * Constructor using an IndexSet and
+     * an MPI communicator to describe
+     * the %parallel partitioning. The
+     * parameter @p n_max_entries_per_row
+     * sets the number of nonzero entries
+     * in each row that will be
+     * allocated. Note that this number
+     * does not need to be exact, and it
+     * is even allowed that the actual
+     * matrix structure has more nonzero
+     * entries than specified in the
+     * constructor. However, it is still
+     * advantageous to provide good
+     * estimates here since this will
+     * considerably increase the
+     * performance of the matrix
+     * setup. Note that there is no effect
+     * on the performance of
+     * matrix-vector products, since
+     * Trilinos reorganizes the matrix
+     * memory prior to use (in the
+     * compress() step).
+     */
+    SparseMatrix (const IndexSet     &parallel_partitioning,
+                  const MPI_Comm     &communicator = MPI_COMM_WORLD,
+                  const unsigned int  n_max_entries_per_row = 0);
+
+    /**
+     * Same as before, but now set the
+     * number of nonzeros in each matrix
+     * row separately. Since we know the
+     * number of elements in the matrix
+     * exactly in this case, we can
+     * already allocate the right amount
+     * of memory, which makes the
+     * creation process including the
+     * insertion of nonzero elements by
+     * the respective
+     * SparseMatrix::reinit call
+     * considerably faster.
+     */
+    SparseMatrix (const IndexSet                  &parallel_partitioning,
+                  const MPI_Comm                  &communicator,
+                  const std::vector<unsigned int> &n_entries_per_row);
+
+    /**
+     * This constructor is similar to the
+     * one above, but it now takes two
+     * different IndexSet partitions for
+     * row and columns. This interface is
+     * meant to be used for generating
+     * rectangular matrices, where the
+     * first index set describes the
+     * %parallel partitioning of the
+     * degrees of freedom associated with
+     * the matrix rows and the second one
+     * the partitioning of the matrix
+     * columns. The second index set
+     * specifies the partitioning of the
+     * vectors this matrix is to be
+     * multiplied with, not the
+     * distribution of the elements that
+     * actually appear in the matrix.
+     *
+     * The parameter @p
+     * n_max_entries_per_row defines how
+     * much memory will be allocated for
+     * each row. This number does not
+     * need to be accurate, as the
+     * structure is reorganized in the
+     * compress() call.
+     */
+    SparseMatrix (const IndexSet     &row_parallel_partitioning,
+                  const IndexSet     &col_parallel_partitioning,
+                  const MPI_Comm     &communicator = MPI_COMM_WORLD,
+                  const unsigned int  n_max_entries_per_row = 0);
+
+    /**
+     * This constructor is similar to the
+     * one above, but it now takes two
+     * different IndexSet objects for rows and
+     * columns. This interface is meant
+     * to be used for generating
+     * rectangular matrices, where one
+     * map specifies the %parallel
+     * distribution of degrees of freedom
+     * associated with matrix rows and
+     * the second one specifies the
+     * %parallel distribution of the dofs
+     * associated with columns in the
+     * matrix. The second index set also
+     * provides information for the
+     * internal arrangement in matrix
+     * vector products (i.e., the
+     * distribution of the vectors this matrix
+     * is to be multiplied with), but is
+     * not used for the distribution of
+     * the columns &ndash; rather, all
+     * column elements of a row are
+     * stored on the same processor in
+     * any case. The vector
+     * <tt>n_entries_per_row</tt>
+     * specifies the number of entries in
+     * each row of the newly generated
+     * matrix.
+     */
+    SparseMatrix (const IndexSet                  &row_parallel_partitioning,
+                  const IndexSet                  &col_parallel_partitioning,
+                  const MPI_Comm                  &communicator,
+                  const std::vector<unsigned int> &n_entries_per_row);
+
+    /**
+     * This function initializes the
+     * Trilinos Epetra matrix according
+     * to the specified sparsity_pattern,
+     * and also reassigns the matrix rows
+     * to different processes according
+     * to a user-supplied index set and
+     * %parallel communicator. In
+     * programs following the style of
+     * the tutorial programs, this
+     * function (and the respective call
+     * for a rectangular matrix) is the
+     * natural way to initialize the
+     * matrix size, its distribution
+     * among the MPI processes (if run in
+     * %parallel) as well as the location
+     * of non-zero elements. Trilinos
+     * stores the sparsity pattern
+     * internally, so it won't be needed
+     * any more after this call, in
+     * contrast to deal.II's own
+     * objects. The optional argument @p
+     * exchange_data can be used for
+     * reinitialization with a sparsity
+     * pattern that is not fully
+     * constructed. This feature is only
+     * implemented for input sparsity
+     * patterns of type
+     * CompressedSimpleSparsityPattern. If
+     * the flag is not set, each
+     * processor just sets the elements
+     * in the sparsity pattern that
+     * belong to its rows.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * deadlock.
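+     *
+     * A sketch in the style of the
+     * tutorial programs (@p dof_handler
+     * and @p locally_owned are assumed
+     * to exist in the surrounding
+     * program):
+     * @code
+     * TrilinosWrappers::SparseMatrix matrix;
+     * CompressedSimpleSparsityPattern csp (dof_handler.n_dofs(),
+     *                                      dof_handler.n_dofs());
+     * DoFTools::make_sparsity_pattern (dof_handler, csp);
+     * matrix.reinit (locally_owned, csp, MPI_COMM_WORLD, true);
+     * @endcode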
+     */
+    template<typename SparsityType>
+    void reinit (const IndexSet      &parallel_partitioning,
+                 const SparsityType  &sparsity_pattern,
+                 const MPI_Comm      &communicator = MPI_COMM_WORLD,
+                 const bool           exchange_data = false);
+
+    /**
+     * This function is similar to the
+     * other initialization function
+     * above, but now also reassigns the
+     * matrix rows and columns according
+     * to two user-supplied index sets.
+     * To be used for rectangular
+     * matrices. The optional argument @p
+     * exchange_data can be used for
+     * reinitialization with a sparsity
+     * pattern that is not fully
+     * constructed. This feature is only
+     * implemented for input sparsity
+     * patterns of type
+     * CompressedSimpleSparsityPattern.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * deadlock.
+     */
+    template<typename SparsityType>
+    void reinit (const IndexSet      &row_parallel_partitioning,
+                 const IndexSet      &col_parallel_partitioning,
+                 const SparsityType  &sparsity_pattern,
+                 const MPI_Comm      &communicator = MPI_COMM_WORLD,
+                 const bool           exchange_data = false);
+
+    /**
+     * This function initializes the
+     * Trilinos matrix using the deal.II
+     * sparse matrix and the entries
+     * stored therein. It uses a
+     * threshold to copy only elements
+     * with modulus larger than the
+     * threshold (so zeros in the deal.II
+     * matrix can be filtered away). In
+     * contrast to the other reinit
+     * function with deal.II sparse
+     * matrix argument, this function
+     * takes a %parallel partitioning
+     * specified by the user instead of
+     * internally generating it.
+     *
+     * The optional parameter
+     * <tt>copy_values</tt> decides
+     * whether only the sparsity
+     * structure of the input matrix
+     * should be used or the matrix
+     * entries should be copied, too.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * deadlock.
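+     *
+     * A sketch of distributing an
+     * existing deal.II matrix (@p
+     * serial_matrix and @p locally_owned
+     * are assumed for illustration):
+     * @code
+     * TrilinosWrappers::SparseMatrix distributed_matrix;
+     * distributed_matrix.reinit (locally_owned,
+     *                            serial_matrix,
+     *                            MPI_COMM_WORLD,
+     *                            1e-13);
+     * @endcode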
+     */
+    template <typename number>
+    void reinit (const IndexSet                       &parallel_partitioning,
+                 const ::dealii::SparseMatrix<number> &dealii_sparse_matrix,
+                 const MPI_Comm                       &communicator = MPI_COMM_WORLD,
+                 const double                          drop_tolerance=1e-13,
+                 const bool                            copy_values=true,
+                 const ::dealii::SparsityPattern      *use_this_sparsity=0);
+
+    /**
+     * This function is similar to the
+     * other initialization function with
+     * deal.II sparse matrix input above,
+     * but now takes index sets for both
+     * the rows and the columns of the
+     * matrix. To be used for rectangular
+     * matrices.
+     *
+     * The optional parameter
+     * <tt>copy_values</tt> decides
+     * whether only the sparsity
+     * structure of the input matrix
+     * should be used or the matrix
+     * entries should be copied, too.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * deadlock.
+     */
+    template <typename number>
+    void reinit (const IndexSet                        &row_parallel_partitioning,
+                 const IndexSet                        &col_parallel_partitioning,
+                 const ::dealii::SparseMatrix<number>  &dealii_sparse_matrix,
+                 const MPI_Comm                        &communicator = MPI_COMM_WORLD,
+                 const double                           drop_tolerance=1e-13,
+                 const bool                             copy_values=true,
+                 const ::dealii::SparsityPattern       *use_this_sparsity=0);
+//@}
+    /**
+     * @name Information on the matrix
+     */
+//@{
+
+    /**
+     * Return the number of rows in
+     * this matrix.
+     */
+    unsigned int m () const;
+
+    /**
+     * Return the number of columns
+     * in this matrix.
+     */
+    unsigned int n () const;
+
+    /**
+     * Return the local dimension
+     * of the matrix, i.e. the
+     * number of rows stored on the
+     * present MPI process. For
+     * sequential matrices, this
+     * number is the same as m(),
+     * but for %parallel matrices it
+     * may be smaller.
+     *
+     * To figure out which elements
+     * exactly are stored locally,
+     * use local_range().
+     */
+    unsigned int local_size () const;
+
+    /**
+     * Return a pair of indices
+     * indicating which rows of
+     * this matrix are stored
+     * locally. The first number is
+     * the index of the first row
+     * stored, the second the index
+     * of the one past the last one
+     * that is stored locally. If
+     * this is a sequential matrix,
+     * then the result will be the
+     * pair (0,m()), otherwise it
+     * will be a pair (i,i+n),
+     * where
+     * <tt>n=local_size()</tt>.
+     */
+    std::pair<unsigned int, unsigned int>
+    local_range () const;
+
+    /**
+     * Return whether @p index is
+     * in the local range or not,
+     * see also local_range().
+     */
+    bool in_local_range (const unsigned int index) const;
+
+    /**
+     * Return the number of nonzero
+     * elements of this matrix.
+     */
+    unsigned int n_nonzero_elements () const;
+
+    /**
+     * Return the number of entries in a
+     * specific row.
+     */
+    unsigned int row_length (const unsigned int row) const;
+
+    /**
+     * Returns the state of the matrix,
+     * i.e., whether compress() needs to
+     * be called after an operation
+     * requiring data exchange. A call to
+     * compress() is also needed when the
+     * method set() has been called (even
+     * when working in serial).
+     */
+    bool is_compressed () const;
+
+    /**
+     * Determine an estimate for the memory
+     * consumption (in bytes) of this
+     * object. Note that only the memory
+     * reserved on the current processor is
+     * returned in case this is called in
+     * an MPI-based program.
+     */
+    std::size_t memory_consumption () const;
+
+//@}
+    /**
+     * @name Modifying entries
+     */
+//@{
+
+    /**
+     * This operator assigns a scalar to
+     * a matrix. Since this usually does
+     * not make much sense (should we set
+     * all matrix entries to this value?
+     * Only the nonzero entries of the
+     * sparsity pattern?), this operation
+     * is only allowed if the actual
+     * value to be assigned is zero. This
+     * operator only exists to allow for
+     * the obvious notation
+     * <tt>matrix=0</tt>, which sets all
+     * elements of the matrix to zero,
+     * but keeps the sparsity pattern
+     * previously used.
+     */
+    SparseMatrix &
+    operator = (const double d);
+
+    /**
+     * Release all memory and return to a
+     * state just like after having
+     * called the default constructor.
+     *
+     * This is a collective operation
+     * that needs to be called on all
+     * processors in order to avoid a
+     * deadlock.
+     */
+    void clear ();
+
+    /**
+     * This command does two things:
+     * <ul>
+     * <li> If the matrix was initialized
+     * without a sparsity pattern,
+     * elements have been added manually
+     * using the set() command. When this
+     * process is completed, a call to
+     * compress() reorganizes the
+     * internal data structures (sparsity
+     * pattern) so that fast access to
+     * data is possible in matrix-vector
+     * products.
+     * <li> If the matrix structure has
+     * already been fixed (either by
+     * initialization with a sparsity
+     * pattern or by calling compress()
+     * during the setup phase), this
+     * command does the %parallel
+     * exchange of data. This is
+     * necessary when we perform assembly
+     * on more than one (MPI) process,
+     * because then some non-local row
+     * data will accumulate on the current
+     * processor for rows that are
+     * actually held by another
+     * one. This command is usually
+     * called after all elements have
+     * been traversed.
+     * </ul>
+     *
+     * In both cases, this function
+     * compresses the data structures and
+     * allows the resulting matrix to be
+     * used in all other operations like
+     * matrix-vector products. This is a
+     * collective operation, i.e., it
+     * needs to be run on all processors
+     * when used in %parallel.
+     *
+     * See @ref GlossCompress "Compressing distributed objects"
+     * for more information.
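+     *
+     * A sketched assembly loop ending in
+     * the required compress() call
+     * (assuming, for illustration, that
+     * the diagonal entries are part of
+     * the sparsity pattern):
+     * @code
+     * for (unsigned int i=0; i<matrix.m(); ++i)
+     *   if (matrix.in_local_range (i))
+     *     matrix.add (i, i, 1.0);
+     * matrix.compress (VectorOperation::add);
+     * @endcode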
+     */
+    void compress (::dealii::VectorOperation::values operation
+                   =::dealii::VectorOperation::unknown);
+
+    /**
+     * Set the element (<i>i,j</i>)
+     * to @p value.
+     *
+     * This function is able to insert new
+     * elements into the matrix as long as
+     * compress() has not been called, so
+     * the sparsity pattern will be
+     * extended. When compress() is called
+     * for the first time, then this is no
+     * longer possible and an insertion of
+     * elements at positions which have not
+     * been initialized will throw an
+     * exception. Note that in case
+     * elements need to be inserted, it is
+     * mandatory that elements are inserted
+     * only once. Otherwise, the elements
+     * will actually be added in the end
+     * (since it is not possible to
+     * efficiently find values set earlier
+     * for the same entry before compress()
+     * has been called). If an element
+     * needs to be set more than once,
+     * initialize the matrix with a
+     * sparsity pattern first.
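+     *
+     * For example, a sketch of building
+     * a small serial matrix without a
+     * sparsity pattern, inserting each
+     * entry exactly once:
+     * @code
+     * IndexSet all_rows (2);
+     * all_rows.add_range (0, 2);
+     * TrilinosWrappers::SparseMatrix m (all_rows, MPI_COMM_SELF);
+     * m.set (0, 0,  2.0);
+     * m.set (0, 1, -1.0);
+     * m.set (1, 1,  2.0);
+     * m.compress (VectorOperation::insert);
+     * @endcode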
+     */
+    void set (const unsigned int i,
+              const unsigned int j,
+              const TrilinosScalar value);
+
+    /**
+     * Set all elements given in a
+     * FullMatrix<double> into the sparse
+     * matrix locations given by
+     * <tt>indices</tt>. In other words,
+     * this function writes the elements
+     * in <tt>full_matrix</tt> into the
+     * calling matrix, using the
+     * local-to-global indexing specified
+     * by <tt>indices</tt> for both the
+     * rows and the columns of the
+     * matrix. This function assumes a
+     * quadratic sparse matrix and a
+     * quadratic full_matrix, the usual
+     * situation in FE calculations.
+     *
+     * This function is able to insert
+     * new elements into the matrix as
+     * long as compress() has not been
+     * called, so the sparsity pattern
+     * will be extended. When compress()
+     * is called for the first time, then
+     * this is no longer possible and an
+     * insertion of elements at positions
+     * which have not been initialized
+     * will throw an exception.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be inserted anyway
+     * or they should be filtered
+     * away. The default value is
+     * <tt>false</tt>, i.e., even zero
+     * values are inserted/replaced.
+     */
+    void set (const std::vector<unsigned int> &indices,
+              const FullMatrix<TrilinosScalar> &full_matrix,
+              const bool                        elide_zero_values = false);
+
+    /**
+     * Same function as before, but now
+     * including the possibility to use
+     * rectangular full_matrices and
+     * different local-to-global indexing
+     * on rows and columns, respectively.
+     */
+    void set (const std::vector<unsigned int> &row_indices,
+              const std::vector<unsigned int> &col_indices,
+              const FullMatrix<TrilinosScalar> &full_matrix,
+              const bool                        elide_zero_values = false);
+
+    /**
+     * Set several elements in the
+     * specified row of the matrix with
+     * column indices as given by
+     * <tt>col_indices</tt> to the
+     * respective value.
+     *
+     * This function is able to insert
+     * new elements into the matrix as
+     * long as compress() has not been
+     * called, so the sparsity pattern
+     * will be extended. When compress()
+     * is called for the first time, then
+     * this is no longer possible and an
+     * insertion of elements at positions
+     * which have not been initialized
+     * will throw an exception.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be inserted anyway
+     * or they should be filtered
+     * away. The default value is
+     * <tt>false</tt>, i.e., even zero
+     * values are inserted/replaced.
+     */
+    void set (const unsigned int                row,
+              const std::vector<unsigned int>   &col_indices,
+              const std::vector<TrilinosScalar> &values,
+              const bool                         elide_zero_values = false);
+
+    /**
+     * Set several elements to values
+     * given by <tt>values</tt> in a
+     * given row in columns given by
+     * col_indices into the sparse
+     * matrix.
+     *
+     * This function is able to insert
+     * new elements into the matrix as
+     * long as compress() has not been
+     * called, so the sparsity pattern
+     * will be extended. When compress()
+     * is called for the first time, then
+     * this is no longer possible and an
+     * insertion of elements at positions
+     * which have not been initialized
+     * will throw an exception.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be inserted anyway
+     * or they should be filtered
+     * away. The default value is
+     * <tt>false</tt>, i.e., even zero
+     * values are inserted/replaced.
+     */
+    void set (const unsigned int    row,
+              const unsigned int    n_cols,
+              const unsigned int   *col_indices,
+              const TrilinosScalar *values,
+              const bool            elide_zero_values = false);
+
+    /**
+     * Add @p value to the element
+     * (<i>i,j</i>).
+     *
+     * Just as the respective call in
+     * deal.II SparseMatrix<Number>
+     * class (but in contrast to the
+     * situation for PETSc based
+     * matrices), this function
+     * throws an exception if an
+     * entry does not exist in the
+     * sparsity pattern. Moreover, if
+     * <tt>value</tt> is not a finite
+     * number an exception is thrown.
+     */
+    void add (const unsigned int i,
+              const unsigned int j,
+              const TrilinosScalar value);
+
+    /**
+     * Add all elements given in a
+     * FullMatrix<double> into sparse
+     * matrix locations given by
+     * <tt>indices</tt>. In other words,
+     * this function adds the elements in
+     * <tt>full_matrix</tt> to the
+     * respective entries in calling
+     * matrix, using the local-to-global
+     * indexing specified by
+     * <tt>indices</tt> for both the rows
+     * and the columns of the
+     * matrix. This function assumes a
+     * quadratic sparse matrix and a
+     * quadratic full_matrix, the usual
+     * situation in FE calculations.
+     *
+     * Just as the respective call in
+     * deal.II SparseMatrix<Number>
+     * class (but in contrast to the
+     * situation for PETSc based
+     * matrices), this function
+     * throws an exception if an
+     * entry does not exist in the
+     * sparsity pattern.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be added anyway or
+     * these should be filtered away and
+     * only non-zero data is added. The
+     * default value is <tt>true</tt>,
+     * i.e., zero values won't be added
+     * into the matrix.
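+     *
+     * A sketch of the usual cell-wise
+     * assembly (the value of @p
+     * dofs_per_cell and the filling of
+     * the two local objects are assumed
+     * to come from the surrounding
+     * code):
+     * @code
+     * FullMatrix<TrilinosScalar> cell_matrix (dofs_per_cell,
+     *                                         dofs_per_cell);
+     * std::vector<unsigned int> local_dof_indices (dofs_per_cell);
+     * // ... fill cell_matrix and local_dof_indices on each cell ...
+     * matrix.add (local_dof_indices, cell_matrix);
+     * @endcode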
+     */
+    void add (const std::vector<unsigned int> &indices,
+              const FullMatrix<TrilinosScalar> &full_matrix,
+              const bool                        elide_zero_values = true);
+
+    /**
+     * Same function as before, but now
+     * including the possibility to use
+     * rectangular full_matrices and
+     * different local-to-global indexing
+     * on rows and columns, respectively.
+     */
+    void add (const std::vector<unsigned int> &row_indices,
+              const std::vector<unsigned int> &col_indices,
+              const FullMatrix<TrilinosScalar> &full_matrix,
+              const bool                        elide_zero_values = true);
+
+    /**
+     * Add the values given in
+     * <tt>values</tt> to the elements in
+     * the specified row of the matrix at
+     * the column indices given by
+     * <tt>col_indices</tt>.
+     *
+     * Just as the respective call in
+     * deal.II SparseMatrix<Number>
+     * class (but in contrast to the
+     * situation for PETSc based
+     * matrices), this function
+     * throws an exception if an
+     * entry does not exist in the
+     * sparsity pattern.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be added anyway or
+     * these should be filtered away and
+     * only non-zero data is added. The
+     * default value is <tt>true</tt>,
+     * i.e., zero values won't be added
+     * into the matrix.
+     */
+    void add (const unsigned int                row,
+              const std::vector<unsigned int>   &col_indices,
+              const std::vector<TrilinosScalar> &values,
+              const bool                         elide_zero_values = true);
+
+    /**
+     * Add an array of values given by
+     * <tt>values</tt> in the given
+     * global matrix row at columns
+     * specified by col_indices in the
+     * sparse matrix.
+     *
+     * Just as the respective call in
+     * deal.II SparseMatrix<Number> class
+     * (but in contrast to the situation
+     * for PETSc based matrices), this
+     * function throws an exception if an
+     * entry does not exist in the
+     * sparsity pattern.
+     *
+     * The optional parameter
+     * <tt>elide_zero_values</tt> can be
+     * used to specify whether zero
+     * values should be added anyway or
+     * these should be filtered away and
+     * only non-zero data is added. The
+     * default value is <tt>true</tt>,
+     * i.e., zero values won't be added
+     * into the matrix.
+     */
+    void add (const unsigned int    row,
+              const unsigned int    n_cols,
+              const unsigned int   *col_indices,
+              const TrilinosScalar *values,
+              const bool            elide_zero_values = true,
+              const bool            col_indices_are_sorted = false);
+
+    /**
+     * Multiply the entire matrix
+     * by a fixed factor.
+     */
+    SparseMatrix &operator *= (const TrilinosScalar factor);
+
+    /**
+     * Divide the entire matrix by
+     * a fixed factor.
+     */
+    SparseMatrix &operator /= (const TrilinosScalar factor);
+
+    /**
+     * Copy the given (Trilinos) matrix
+     * (sparsity pattern and entries).
+     */
+    void copy_from (const SparseMatrix &source);
+
+    /**
+     * Add <tt>matrix</tt> scaled by
+     * <tt>factor</tt> to this matrix,
+     * i.e. the matrix
+     * <tt>factor*matrix</tt> is added to
+     * <tt>this</tt>. If the sparsity
+     * pattern of the calling matrix does
+     * not contain all the elements in
+     * the sparsity pattern of the input
+     * matrix, this function will throw
+     * an exception.
+     */
+    void add (const TrilinosScalar  factor,
+              const SparseMatrix   &matrix);
+
+    /**
+     * Remove all elements from
+     * this <tt>row</tt> by setting
+     * them to zero. The function
+     * does not modify the number
+     * of allocated nonzero
+     * entries, it only sets some
+     * entries to zero. It may drop
+     * them from the sparsity
+     * pattern, though (but retains
+     * the allocated memory in case
+     * new entries are again added
+     * later). Note that this is a
+     * global operation, so this
+     * needs to be done on all MPI
+     * processes.
+     *
+     * This operation is used in
+     * eliminating constraints
+     * (e.g. due to hanging nodes)
+     * and makes sure that we can
+     * write this modification to
+     * the matrix without having to
+     * read entries (such as the
+     * locations of non-zero
+     * elements) from it &mdash;
+     * without this operation,
+     * removing constraints on
+     * %parallel matrices is a
+     * rather complicated
+     * procedure.
+     *
+     * The second parameter can be
+     * used to set the diagonal
+     * entry of this row to a value
+     * different from zero. The
+     * default is to set it to
+     * zero.
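+     *
+     * For example, to eliminate the row
+     * of a constrained degree of freedom
+     * while retaining a unit diagonal
+     * (the index @p constrained_dof is
+     * an assumption for illustration):
+     * @code
+     * matrix.clear_row (constrained_dof, 1.0);
+     * @endcode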
+     */
+    void clear_row (const unsigned int   row,
+                    const TrilinosScalar new_diag_value = 0);
+
+    /**
+     * Same as clear_row(), except
+     * that it works on a number of
+     * rows at once.
+     *
+     * The second parameter can be
+     * used to set the diagonal
+     * entries of all cleared rows
+     * to something different from
+     * zero. Note that all of these
+     * diagonal entries get the
+     * same value -- if you want
+     * different values for the
+     * diagonal entries, you have
+     * to set them by hand.
+     */
+    void clear_rows (const std::vector<unsigned int> &rows,
+                     const TrilinosScalar             new_diag_value = 0);
+
+    /**
+     * Make an in-place transpose
+     * of a matrix.
+     */
+    void transpose ();
+
+//@}
+    /**
+     * @name Entry Access
+     */
+//@{
+
+    /**
+     * Return the value of the
+     * entry (<i>i,j</i>).  This
+     * may be an expensive
+     * operation and you should
+     * always take care where to
+     * call this function. As in
+     * the deal.II sparse matrix
+     * class, we throw an exception
+     * if the respective entry
+     * doesn't exist in the
+     * sparsity pattern of this
+     * class, which is requested
+     * from Trilinos. Moreover, an
+     * exception will be thrown
+     * when the requested element
+     * is not stored on the calling
+     * process.
+     */
+    TrilinosScalar operator () (const unsigned int i,
+                                const unsigned int j) const;
+
+    /**
+     * Return the value of the
+     * matrix entry
+     * (<i>i,j</i>). If this entry
+     * does not exist in the
+     * sparsity pattern, then zero
+     * is returned. While this may
+     * be convenient in some cases,
+     * note that it is simple to
+     * write algorithms that are
+     * slow compared to an optimal
+     * solution, since the sparsity
+     * of the matrix is not used.
+     * On the other hand, if you
+     * want to be sure the entry
+     * exists, you should use
+     * operator() instead.
+     *
+     * The lack of error checking
+     * in this function can also
+     * yield surprising results if
+     * you have a parallel
+     * matrix. In that case, just
+     * because you get a zero
+     * result from this function
+     * does not mean that either
+     * the entry does not exist in
+     * the sparsity pattern or that
+     * it does but has a value of
+     * zero. Rather, it could also
+     * be that it simply isn't
+     * stored on the current
+     * processor; in that case, it
+     * may be stored on a different
+     * processor, and possibly so
+     * with a nonzero value.
+     */
+    TrilinosScalar el (const unsigned int i,
+                       const unsigned int j) const;
+
+    /**
+     * Return the main diagonal
+     * element in the <i>i</i>th
+     * row. This function throws an
+     * error if the matrix is not
+     * quadratic and it also throws
+     * an error if <i>(i,i)</i> is not an
+     * element of the local matrix.
+     * See also the comment in
+     * trilinos_sparse_matrix.cc.
+     */
+    TrilinosScalar diag_element (const unsigned int i) const;
+
+//@}
+    /**
+     * @name Multiplications
+     */
+//@{
+
+    /**
+     * Matrix-vector multiplication:
+     * let <i>dst = M*src</i> with
+     * <i>M</i> being this matrix.
+     *
+     * Source and destination must
+     * not be the same vector.
+     *
+     * Note that both vectors have to
+     * be distributed vectors
+     * generated using the same Map
+     * as was used for the matrix in
+     * case you work on a distributed
+     * memory architecture, using the
+     * interface in the
+     * TrilinosWrappers::VectorBase
+     * class (or one of the two
+     * derived classes Vector and
+     * MPI::Vector).
+     *
+     * In case of a localized Vector,
+     * this function will only work
+     * when running on one processor,
+     * since the matrix object is
+     * inherently
+     * distributed. Otherwise, an
+     * exception will be thrown.
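+     *
+     * A sketch that uses the matrix's
+     * own partitioners to create
+     * compatible vectors:
+     * @code
+     * TrilinosWrappers::MPI::Vector src (matrix.domain_partitioner());
+     * TrilinosWrappers::MPI::Vector dst (matrix.range_partitioner());
+     * src = 1.;
+     * matrix.vmult (dst, src);
+     * @endcode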
+     */
+    void vmult (VectorBase       &dst,
+                const VectorBase &src) const;
+
+    /**
+     * Same as before, but working with
+     * deal.II's own distributed vector
+     * class.
+     */
+    void vmult (parallel::distributed::Vector<TrilinosScalar>       &dst,
+                const parallel::distributed::Vector<TrilinosScalar> &src) const;
+
+    /**
+     * Matrix-vector multiplication:
+     * let <i>dst =
+     * M<sup>T</sup>*src</i> with
+     * <i>M</i> being this
+     * matrix. This function does the
+     * same as vmult() but takes the
+     * transposed matrix.
+     *
+     * Source and destination must
+     * not be the same vector.
+     *
+     * Note that both vectors have to
+     * be distributed vectors
+     * generated using the same Map
+     * as was used for the matrix in
+     * case you work on a distributed
+     * memory architecture, using the
+     * interface in the
+     * TrilinosWrappers::VectorBase
+     * class (or one of the two
+     * derived classes Vector and
+     * MPI::Vector).
+     *
+     * In case of a localized Vector,
+     * this function will only work
+     * when running on one processor,
+     * since the matrix object is
+     * inherently
+     * distributed. Otherwise, an
+     * exception will be thrown.
+     */
+    void Tvmult (VectorBase       &dst,
+                 const VectorBase &src) const;
+
+    /**
+     * Same as before, but working with
+     * deal.II's own distributed vector
+     * class.
+     */
+    void Tvmult (parallel::distributed::Vector<TrilinosScalar>       &dst,
+                 const parallel::distributed::Vector<TrilinosScalar> &src) const;
+
+    /**
+     * Adding Matrix-vector
+     * multiplication. Add
+     * <i>M*src</i> to <i>dst</i>
+     * with <i>M</i> being this
+     * matrix.
+     *
+     * Source and destination must
+     * not be the same vector.
+     *
+     * Note that both vectors have to
+     * be distributed vectors
+     * generated using the same Map
+     * as was used for the matrix in
+     * case you work on a distributed
+     * memory architecture, using the
+     * interface in the
+     * TrilinosWrappers::VectorBase
+     * class (or one of the two
+     * derived classes Vector and
+     * MPI::Vector).
+     *
+     * In case of a localized Vector,
+     * this function will only work
+     * when running on one processor,
+     * since the matrix object is
+     * inherently
+     * distributed. Otherwise, an
+     * exception will be thrown.
+     */
+    void vmult_add (VectorBase       &dst,
+                    const VectorBase &src) const;
+
+    /**
+     * Adding Matrix-vector
+     * multiplication. Add
+     * <i>M<sup>T</sup>*src</i> to
+     * <i>dst</i> with <i>M</i> being
+     * this matrix. This function
+     * does the same as vmult_add()
+     * but takes the transposed
+     * matrix.
+     *
+     * Source and destination must
+     * not be the same vector.
+     *
+     * Note that both vectors have to
+     * be distributed vectors
+     * generated using the same Map
+     * as was used for the matrix in
+     * case you work on a distributed
+     * memory architecture, using the
+     * interface in the
+     * TrilinosWrappers::VectorBase
+     * class (or one of the two
+     * derived classes Vector and
+     * MPI::Vector).
+     *
+     * In case of a localized Vector,
+     * this function will only work
+     * when running on one processor,
+     * since the matrix object is
+     * inherently
+     * distributed. Otherwise, an
+     * exception will be thrown.
+     */
+    void Tvmult_add (VectorBase       &dst,
+                     const VectorBase &src) const;
+
+    /**
+     * Return the square of the norm
+     * of the vector $v$ with respect
+     * to the norm induced by this
+     * matrix, i.e.,
+     * $\left(v,Mv\right)$. This is
+     * useful, e.g. in the finite
+     * element context, where the
+     * $L_2$ norm of a function
+     * equals the matrix norm with
+     * respect to the mass matrix of
+     * the vector representing the
+     * nodal values of the finite
+     * element function.
+     *
+     * Obviously, the matrix needs to
+     * be quadratic for this
+     * operation.
+     *
+     * The implementation of this
+     * function is not as efficient
+     * as the one in the @p
+     * SparseMatrix class used in
+     * deal.II (i.e. the original
+     * one, not the Trilinos wrapper
+     * class) since Trilinos doesn't
+     * support this operation and
+     * needs a temporary vector.
+     *
+     * Note that both vectors have to
+     * be distributed vectors
+     * generated using the same Map
+     * as was used for the matrix in
+     * case you work on a distributed
+     * memory architecture, using the
+     * interface in the
+     * TrilinosWrappers::VectorBase
+     * class (or one of the two
+     * derived classes Vector and
+     * MPI::Vector).
+     *
+     * In case of a localized Vector,
+     * this function will only work
+     * when running on one processor,
+     * since the matrix object is
+     * inherently
+     * distributed. Otherwise, an
+     * exception will be thrown.
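+     *
+     * A sketch of the finite element use
+     * case mentioned above (@p
+     * mass_matrix and @p nodal_values
+     * are assumptions for illustration):
+     * @code
+     * const TrilinosScalar l2_norm_squared
+     *   = mass_matrix.matrix_norm_square (nodal_values);
+     * @endcode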
+     */
+    TrilinosScalar matrix_norm_square (const VectorBase &v) const;
+
+    /**
+     * Compute the matrix scalar
+     * product $\left(u,Mv\right)$.
+     *
+     * The implementation of this
+     * function is not as efficient
+     * as the one in the @p
+     * SparseMatrix class used in
+     * deal.II (i.e. the original
+     * one, not the Trilinos
+     * wrapper class) since
+     * Trilinos doesn't support
+     * this operation and needs a
+     * temporary vector.
+     *
+     * Note that both vectors have to
+     * be distributed vectors
+     * generated using the same Map
+     * as was used for the matrix in
+     * case you work on a distributed
+     * memory architecture, using the
+     * interface in the
+     * TrilinosWrappers::VectorBase
+     * class (or one of the two
+     * derived classes Vector and
+     * MPI::Vector).
+     *
+     * In case of a localized Vector,
+     * this function will only work
+     * when running on one processor,
+     * since the matrix object is
+     * inherently
+     * distributed. Otherwise, an
+     * exception will be thrown.
+     */
+    TrilinosScalar matrix_scalar_product (const VectorBase &u,
+                                          const VectorBase &v) const;
+
+    /**
+     * Compute the residual of an
+     * equation <i>Mx=b</i>, where
+     * the residual is defined to
+     * be <i>r=b-Mx</i>. Write the
+     * residual into @p dst. The
+     * <i>l<sub>2</sub></i> norm of
+     * the residual vector is
+     * returned.
+     *
+     * Source <i>x</i> and
+     * destination <i>dst</i> must
+     * not be the same vector.
+     *
+     * Note that both vectors have to
+     * be distributed vectors
+     * generated using the same Map
+     * as was used for the matrix in
+     * case you work on a distributed
+     * memory architecture, using the
+     * interface in the
+     * TrilinosWrappers::VectorBase
+     * class (or one of the two
+     * derived classes Vector and
+     * MPI::Vector).
+     *
+     * In case of a localized Vector,
+     * this function will only work
+     * when running on one processor,
+     * since the matrix object is
+     * inherently
+     * distributed. Otherwise, an
+     * exception will be thrown.
+     */
+    TrilinosScalar residual (VectorBase       &dst,
+                             const VectorBase &x,
+                             const VectorBase &b) const;
+
+    /**
+     * Perform the matrix-matrix
+     * multiplication <tt>C = A * B</tt>,
+     * or, if an optional vector argument
+     * is given, <tt>C = A * diag(V) *
+     * B</tt>, where <tt>diag(V)</tt>
+     * defines a diagonal matrix with the
+     * vector entries.
+     *
+     * This function assumes that the
+     * calling matrix <tt>A</tt> and
+     * <tt>B</tt> have compatible
+     * sizes. The size of <tt>C</tt> will
+     * be set within this function.
+     *
+     * The content as well as the sparsity
+     * pattern of the matrix C will be
+     * changed by this function, so make
+     * sure that the sparsity pattern is
+     * not used somewhere else in your
+     * program. This is an expensive
+     * operation, so think twice before you
+     * use this function.
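+     *
+     * For two matrices @p A and @p B of
+     * compatible sizes (assumed for
+     * illustration):
+     * @code
+     * TrilinosWrappers::SparseMatrix C;
+     * A.mmult (C, B);   // C = A * B
+     * @endcode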
+     */
+    void mmult (SparseMatrix       &C,
+                const SparseMatrix &B,
+                const VectorBase   &V = VectorBase()) const;
+
+
+    /**
+     * Perform the matrix-matrix
+     * multiplication with the transpose of
+     * <tt>this</tt>, i.e., <tt>C =
+     * A<sup>T</sup> * B</tt>, or, if an
+     * optional vector argument is given,
+     * <tt>C = A<sup>T</sup> * diag(V) *
+     * B</tt>, where <tt>diag(V)</tt>
+     * defines a diagonal matrix with the
+     * vector entries.
+     *
+     * This function assumes that the
+     * calling matrix <tt>A</tt> and
+     * <tt>B</tt> have compatible
+     * sizes. The size of <tt>C</tt> will
+     * be set within this function.
+     *
+     * The content as well as the sparsity
+     * pattern of the matrix C will be
+     * changed by this function, so make
+     * sure that the sparsity pattern is
+     * not used somewhere else in your
+     * program. This is an expensive
+     * operation, so think twice before you
+     * use this function.
+     */
+    void Tmmult (SparseMatrix       &C,
+                 const SparseMatrix &B,
+                 const VectorBase   &V = VectorBase()) const;
+
+//@}
+    /**
+     * @name Matrix norms
+     */
+//@{
+
+    /**
+     * Return the
+     * <i>l</i><sub>1</sub>-norm of
+     * the matrix, that is
+     * $|M|_1=
+     * \max_{\mathrm{all\ columns\ } j}
+     * \sum_{\mathrm{all\ rows\ } i}
+     * |M_{ij}|$, (max. sum
+     * of columns).  This is the
+     * natural matrix norm that is
+     * compatible to the l1-norm for
+     * vectors, i.e.  $|Mv|_1 \leq
+     * |M|_1 |v|_1$.
+     * (cf. Haemmerlin-Hoffmann:
+     * Numerische Mathematik)
+     */
+    TrilinosScalar l1_norm () const;
+
+    /**
+     * Return the linfty-norm of the
+     * matrix, that is
+     * $|M|_\infty=\max_{\mathrm{all\
+     * rows\ } i}\sum_{\mathrm{all\
+     * columns\ } j} |M_{ij}|$,
+     * (max. sum of rows).  This is
+     * the natural matrix norm that
+     * is compatible to the
+     * linfty-norm of vectors, i.e.
+     * $|Mv|_\infty \leq |M|_\infty
+     * |v|_\infty$.
+     * (cf. Haemmerlin-Hoffmann:
+     * Numerische Mathematik)
+     */
+    TrilinosScalar linfty_norm () const;
+
+    /**
+     * Return the Frobenius norm of
+     * the matrix, i.e. the square
+     * root of the sum of squares
+     * of all entries in the
+     * matrix.
+     */
+    TrilinosScalar frobenius_norm () const;
+
+//@}
+    /**
+     * @name Access to underlying Trilinos data
+     */
+//@{
+
+    /**
+     * Return a const reference to the
+     * underlying Trilinos
+     * Epetra_CrsMatrix data.
+     */
+    const Epetra_CrsMatrix &trilinos_matrix () const;
+
+    /**
+     * Return a const reference to the
+     * underlying Trilinos
+     * Epetra_CrsGraph data that stores
+     * the sparsity pattern of the
+     * matrix.
+     */
+    const Epetra_CrsGraph &trilinos_sparsity_pattern () const;
+
+    /**
+     * Return a const reference to the
+     * underlying Trilinos Epetra_Map
+     * that sets the partitioning of the
+     * domain space of this matrix, i.e.,
+     * the partitioning of the vectors
+     * this matrix has to be multiplied
+     * with.
+     */
+    const Epetra_Map &domain_partitioner () const;
+
+    /**
+     * Return a const reference to the
+     * underlying Trilinos Epetra_Map
+     * that sets the partitioning of the
+     * range space of this matrix, i.e.,
+     * the partitioning of the vectors
+     * that result from matrix-vector
+     * products.
+     */
+    const Epetra_Map &range_partitioner () const;
+
+    /**
+     * Return a const reference to the
+     * underlying Trilinos Epetra_Map
+     * that sets the partitioning of the
+     * matrix rows. Equal to the
+     * partitioning of the range.
+     */
+    const Epetra_Map &row_partitioner () const;
+
+    /**
+     * Return a const reference to the
+     * underlying Trilinos Epetra_Map
+     * that sets the partitioning of the
+     * matrix columns. This is in general
+     * not equal to the partitioner
+     * Epetra_Map for the domain because
+     * of overlap in the matrix.
+     */
+    const Epetra_Map &col_partitioner () const;
+//@}
+    /**
+     * @name Iterators
+     */
+//@{
+
+    /**
+     * STL-like iterator pointing to the
+     * first entry.
+     */
+    const_iterator begin () const;
+
+    /**
+     * Final iterator.
+     */
+    const_iterator end () const;
+
+    /**
+     * STL-like iterator pointing to the
+     * first entry of row @p r.
+     *
+     * Note that if the given row
+     * is empty, i.e. does not
+     * contain any nonzero entries,
+     * then the iterator returned
+     * by this function equals
+     * <tt>end(r)</tt>. Note also
+     * that the iterator may not be
+     * dereferenceable in that case.
+     */
+    const_iterator begin (const unsigned int r) const;
+
+    /**
+     * Final iterator of row
+     * <tt>r</tt>. It points to the
+     * first element past the end
+     * of line @p r, or past the
+     * end of the entire sparsity
+     * pattern.
+     *
+     * Note that the end iterator
+     * is not necessarily
+     * dereferenceable. This is in
+     * particular the case if it is
+     * the end iterator for the
+     * last row of a matrix.
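+     *
+     * A sketch of looping over the
+     * entries of one locally stored row
+     * @p r:
+     * @code
+     * for (SparseMatrix::const_iterator p = matrix.begin(r);
+     *      p != matrix.end(r); ++p)
+     *   std::cout << p->row() << ' ' << p->column()
+     *             << ' ' << p->value() << std::endl;
+     * @endcode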
+     */
+    const_iterator end (const unsigned int r) const;
+
+//@}
+    /**
+     * @name Input/Output
+     */
+//@{
+
+    /**
+     * Use the facilities Trilinos
+     * provides for viewing objects in
+     * ASCII form to print this
+     * matrix. TODO: currently not
+     * implemented.
+     */
+    void write_ascii () const;		// shuqiangwang
+
+    /**
+     * Print the matrix to the given
+     * stream, using the format
+     * <tt>(line,col) value</tt>, i.e. one
+     * nonzero entry of the matrix per
+     * line. The optional flag outputs the
+     * sparsity pattern in Trilinos style,
+     * where the data is sorted according
+     * to the processor number when printed
+     * to the stream, as well as a summary
+     * of the matrix like the global size.
+     */
+    void print (std::ostream &out,
+                const bool    write_extended_trilinos_info = false) const;
+
+//@}
+    /** @addtogroup Exceptions
+     *
+     */
+//@{
+    /**
+     * Exception
+     */
+    DeclException1 (ExcTrilinosError,
+                    int,
+                    << "An error with error number " << arg1
+                    << " occurred while calling a Trilinos function");
+
+    /**
+     * Exception
+     */
+    DeclException2 (ExcInvalidIndex,
+                    int, int,
+                    << "The entry with index <" << arg1 << ',' << arg2
+                    << "> does not exist.");
+
+    /**
+     * Exception
+     */
+    DeclException0 (ExcSourceEqualsDestination);
+
+    /**
+     * Exception
+     */
+    DeclException0 (ExcMatrixNotCompressed);
+
+    /**
+     * Exception
+     */
+    DeclException4 (ExcAccessToNonLocalElement,
+                    int, int, int, int,
+                    << "You tried to access element (" << arg1
+                    << "/" << arg2 << ")"
+                    << " of a distributed matrix, but only rows "
+                    << arg3 << " through " << arg4
+                    << " are stored locally and can be accessed.");
+
+    /**
+     * Exception
+     */
+    DeclException2 (ExcAccessToNonPresentElement,
+                    int, int,
+                    << "You tried to access element (" << arg1
+                    << "/" << arg2 << ")"
+                    << " of a sparse matrix, but it appears to not"
+                    << " exist in the Trilinos sparsity pattern.");
+//@}
+
+
+
+  protected:
+
+    /**
+    * For some matrix storage
+    * formats, in particular for the
+    * PETSc distributed block matrices,
+    * set and add operations on
+    * individual elements cannot be
+    * freely mixed. Rather, one has
+    * to synchronize operations when
+    * one wants to switch from
+    * setting elements to adding to
+    * elements.
+    * BlockMatrixBase automatically
+    * synchronizes the access by
+    * calling this helper function
+    * for each block.
+    * This function ensures that the
+    * matrix is in a state that
+    * allows adding elements; if it
+    * previously already was in this
+    * state, the function does
+    * nothing.
+    */
+    void prepare_add();
+
+    /**
+    * Same as prepare_add() but
+    * prepare the matrix for setting
+    * elements if the representation
+    * of elements in this class
+    * requires such an operation.
+    */
+    void prepare_set();
+
+
+
+  private:
+
+    /**
+     * Pointer to the user-supplied
+     * Epetra Trilinos mapping of
+     * the matrix columns that
+     * assigns parts of the matrix
+     * to the individual processes.
+     */
+    std_cxx1x::shared_ptr<Epetra_Map> column_space_map;
+
+    /**
+     * A sparse matrix object in
+     * Trilinos to be used for
+     * finite element based
+     * problems which allows for
+     * assembling into non-local
+     * elements.  The actual type,
+     * a sparse matrix, is set in
+     * the constructor.
+     */
+    std_cxx1x::shared_ptr<Epetra_FECrsMatrix> matrix;
+
+    /**
+     * Trilinos doesn't allow mixing
+     * additions to matrix entries with
+     * overwriting them (to make
+     * synchronization of %parallel
+     * computations simpler). The way we
+     * do it is, for each access
+     * operation, store whether it is an
+     * insertion or an addition. If the
+     * previous one was of different
+     * type, then we first have to flush
+     * the Trilinos buffers; otherwise,
+     * we can simply go on. Luckily,
+     * Trilinos has an object for this
+     * which already does all the
+     * %parallel communication in such a
+     * case, so we simply use their
+     * model, which stores whether the
+     * last operation was an addition or
+     * an insertion.
+     */
+    Epetra_CombineMode last_action;
+
+    /**
+     * A boolean variable to hold
+     * information on whether the
+     * matrix is compressed or not.
+     */
+    bool compressed;
+
+    /**
+     * An internal Trilinos vector that
+     * is used for accelerating vmult_add
+     * functions (in order not to need to
+     * recreate temporary vectors every
+     * time that function is called).
+     */
+    mutable VectorBase temp_vector;
+
+    /**
+     * An internal array of integer
+     * values that is used to store the
+     * column indices when
+     * adding/inserting local data into
+     * the (large) sparse matrix.
+     */
+    std::vector<unsigned int> column_indices;
+
+    /**
+     * An internal array of double values
+     * that is used to store the entry
+     * values when adding/inserting
+     * local data into the (large) sparse
+     * matrix.
+     */
+    std::vector<TrilinosScalar> column_values;
+
+    /**
+     *  To allow calling protected
+     *  prepare_add() and
+     *  prepare_set().
+     */
+    friend class BlockMatrixBase<SparseMatrix>;
+  };
+
+
+
+// -------------------------- inline and template functions ----------------------
+
+
+#ifndef DOXYGEN
+
+  namespace MatrixIterators
+  {
+
+    inline
+    const_iterator::Accessor::
+    Accessor (const SparseMatrix *matrix,
+              const unsigned int  row,
+              const unsigned int  index)
+      :
+      matrix(const_cast<SparseMatrix *>(matrix)),
+      a_row(row),
+      a_index(index)
+    {
+      visit_present_row ();
+    }
+
+
+    inline
+    unsigned int
+    const_iterator::Accessor::row() const
+    {
+      Assert (a_row < matrix->m(), ExcBeyondEndOfMatrix());
+      return a_row;
+    }
+
+
+
+    inline
+    unsigned int
+    const_iterator::Accessor::column() const
+    {
+      Assert (a_row < matrix->m(), ExcBeyondEndOfMatrix());
+      return (*colnum_cache)[a_index];
+    }
+
+
+
+    inline
+    unsigned int
+    const_iterator::Accessor::index() const
+    {
+      Assert (a_row < matrix->m(), ExcBeyondEndOfMatrix());
+      return a_index;
+    }
+
+
+
+    inline
+    TrilinosScalar
+    const_iterator::Accessor::value() const
+    {
+      Assert (a_row < matrix->m(), ExcBeyondEndOfMatrix());
+      return (*value_cache)[a_index];
+    }
+
+
+
+    inline
+    const_iterator::
+    const_iterator(const SparseMatrix *matrix,
+                   const unsigned int  row,
+                   const unsigned int  index)
+      :
+      accessor(matrix, row, index)
+    {}
+
+
+
+    inline
+    const_iterator &
+    const_iterator::operator++ ()
+    {
+      Assert (accessor.a_row < accessor.matrix->m(), ExcIteratorPastEnd());
+
+      ++accessor.a_index;
+
+      // If at end of line: do one
+      // step, then cycle until we
+      // find a row with a nonzero
+      // number of entries.
+      if (accessor.a_index >= accessor.colnum_cache->size())
+        {
+          accessor.a_index = 0;
+          ++accessor.a_row;
+
+          while ((accessor.a_row < accessor.matrix->m())
+                 &&
+                 (accessor.matrix->row_length(accessor.a_row) == 0))
+            ++accessor.a_row;
+
+          accessor.visit_present_row();
+        }
+      return *this;
+    }
+
+
+
+    inline
+    const_iterator
+    const_iterator::operator++ (int)
+    {
+      const const_iterator old_state = *this;
+      ++(*this);
+      return old_state;
+    }
+
+
+
+    inline
+    const const_iterator::Accessor &
+    const_iterator::operator* () const
+    {
+      return accessor;
+    }
+
+
+
+    inline
+    const const_iterator::Accessor *
+    const_iterator::operator-> () const
+    {
+      return &accessor;
+    }
+
+
+
+    inline
+    bool
+    const_iterator::
+    operator == (const const_iterator &other) const
+    {
+      return (accessor.a_row == other.accessor.a_row &&
+              accessor.a_index == other.accessor.a_index);
+    }
+
+
+
+    inline
+    bool
+    const_iterator::
+    operator != (const const_iterator &other) const
+    {
+      return ! (*this == other);
+    }
+
+
+
+    inline
+    bool
+    const_iterator::
+    operator < (const const_iterator &other) const
+    {
+      return (accessor.row() < other.accessor.row() ||
+              (accessor.row() == other.accessor.row() &&
+               accessor.index() < other.accessor.index()));
+    }
+
+  }
+
+
+
+  inline
+  SparseMatrix::const_iterator
+  SparseMatrix::begin() const
+  {
+    return const_iterator(this, 0, 0);
+  }
+
+
+
+  inline
+  SparseMatrix::const_iterator
+  SparseMatrix::end() const
+  {
+    return const_iterator(this, m(), 0);
+  }
+
+
+
+  inline
+  SparseMatrix::const_iterator
+  SparseMatrix::begin(const unsigned int r) const
+  {
+    Assert (r < m(), ExcIndexRange(r, 0, m()));
+    if (row_length(r) > 0)
+      return const_iterator(this, r, 0);
+    else
+      return end (r);
+  }
+
+
+
+  inline
+  SparseMatrix::const_iterator
+  SparseMatrix::end(const unsigned int r) const
+  {
+    Assert (r < m(), ExcIndexRange(r, 0, m()));
+
+    // place the iterator on the first entry
+    // past this line, or at the end of the
+    // matrix
+    for (unsigned int i=r+1; i<m(); ++i)
+      if (row_length(i) > 0)
+        return const_iterator(this, i, 0);
+
+    // if there is no such line, then take the
+    // end iterator of the matrix
+    return end();
+  }
+
+
+
+  inline
+  bool
+  SparseMatrix::in_local_range (const unsigned int index) const
+  {
+    int begin, end;
+    begin = matrix->RowMap().MinMyGID();
+    end = matrix->RowMap().MaxMyGID()+1;
+
+    return ((index >= static_cast<unsigned int>(begin)) &&
+            (index < static_cast<unsigned int>(end)));
+  }
+
+
+
+  inline
+  bool
+  SparseMatrix::is_compressed () const
+  {
+    return compressed;
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::compress (::dealii::VectorOperation::values /*operation*/)
+  {
+    // flush buffers
+    int ierr;
+    ierr = matrix->GlobalAssemble (*column_space_map, matrix->RowMap(),
+                                   true);
+
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    ierr = matrix->OptimizeStorage ();
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    last_action = Zero;
+
+    compressed = true;
+  }
+
+
+
+  inline
+  SparseMatrix &
+  SparseMatrix::operator = (const double d)
+  {
+    Assert (d==0, ExcScalarAssignmentOnlyForZeroValue());
+    compress ();
+
+    const int ierr = matrix->PutScalar(d);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return *this;
+  }
+
+
+
+  // Inline the set() and add()
+  // functions, since they will be
+  // called frequently, and the
+  // compiler can optimize away
+  // some unnecessary loops when
+  // the sizes are given at
+  // compile time.
+  inline
+  void
+  SparseMatrix::set (const unsigned int   i,
+                     const unsigned int   j,
+                     const TrilinosScalar value)
+  {
+
+    Assert (numbers::is_finite(value), ExcNumberNotFinite());
+
+    set (i, 1, &j, &value, false);
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::set (const std::vector<unsigned int> &indices,
+                     const FullMatrix<TrilinosScalar> &values,
+                     const bool                        elide_zero_values)
+  {
+    Assert (indices.size() == values.m(),
+            ExcDimensionMismatch(indices.size(), values.m()));
+    Assert (values.m() == values.n(), ExcNotQuadratic());
+
+    for (unsigned int i=0; i<indices.size(); ++i)
+      set (indices[i], indices.size(), &indices[0], &values(i,0),
+           elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::set (const std::vector<unsigned int> &row_indices,
+                     const std::vector<unsigned int> &col_indices,
+                     const FullMatrix<TrilinosScalar> &values,
+                     const bool                        elide_zero_values)
+  {
+    Assert (row_indices.size() == values.m(),
+            ExcDimensionMismatch(row_indices.size(), values.m()));
+    Assert (col_indices.size() == values.n(),
+            ExcDimensionMismatch(col_indices.size(), values.n()));
+
+    for (unsigned int i=0; i<row_indices.size(); ++i)
+      set (row_indices[i], col_indices.size(), &col_indices[0], &values(i,0),
+           elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::set (const unsigned int                 row,
+                     const std::vector<unsigned int>   &col_indices,
+                     const std::vector<TrilinosScalar> &values,
+                     const bool                         elide_zero_values)
+  {
+    Assert (col_indices.size() == values.size(),
+            ExcDimensionMismatch(col_indices.size(), values.size()));
+
+    set (row, col_indices.size(), &col_indices[0], &values[0],
+         elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::set (const unsigned int    row,
+                     const unsigned int    n_cols,
+                     const unsigned int   *col_indices,
+                     const TrilinosScalar *values,
+                     const bool            elide_zero_values)
+  {
+    int ierr;
+    if (last_action == Add)
+      {
+        ierr = matrix->GlobalAssemble (*column_space_map, matrix->RowMap(),
+                                       true);
+
+        Assert (ierr == 0, ExcTrilinosError(ierr));
+      }
+
+    last_action = Insert;
+
+    int *col_index_ptr;
+    TrilinosScalar const *col_value_ptr;
+    int n_columns;
+
+    // If we don't elide zeros, the pointers
+    // are already available...
+    if (elide_zero_values == false)
+      {
+        col_index_ptr = (int *)col_indices;
+        col_value_ptr = values;
+        n_columns = n_cols;
+      }
+    else
+      {
+        // Otherwise, extract nonzero values in
+        // each row and get the respective
+        // indices.
+        if (column_indices.size() < n_cols)
+          {
+            column_indices.resize(n_cols);
+            column_values.resize(n_cols);
+          }
+
+        n_columns = 0;
+        for (unsigned int j=0; j<n_cols; ++j)
+          {
+            const double value = values[j];
+            Assert (numbers::is_finite(value), ExcNumberNotFinite());
+            if (value != 0)
+              {
+                column_indices[n_columns] = col_indices[j];
+                column_values[n_columns] = value;
+                n_columns++;
+              }
+          }
+
+        Assert(n_columns <= (int)n_cols, ExcInternalError());
+
+        col_index_ptr = (int *)&column_indices[0];
+        col_value_ptr = &column_values[0];
+      }
+
+
+    // If the calling matrix owns the row to
+    // which we want to insert values, we
+    // can directly call the Epetra_CrsMatrix
+    // input function, which is much faster
+    // than the Epetra_FECrsMatrix
+    // function. We distinguish between two
+    // cases: the first one is when the matrix
+    // is not filled (i.e., it is possible to
+    // add new elements to the sparsity pattern),
+    // and the second one is when the pattern is
+    // already fixed. In the former case, we
+    // add the possibility to insert new values,
+    // and in the second we just replace
+    // data.
+    if (row_partitioner().MyGID(static_cast<int>(row)) == true)
+      {
+        if (matrix->Filled() == false)
+          {
+            ierr = matrix->Epetra_CrsMatrix::InsertGlobalValues(row, n_columns,
+                                                                const_cast<double *>(col_value_ptr),
+                                                                col_index_ptr);
+
+            // When inserting elements, we do
+            // not want to throw exceptions in
+            // the case when inserting non-local
+            // data (since that's what we want
+            // to do right now).
+            if (ierr > 0)
+              ierr = 0;
+          }
+        else
+          ierr = matrix->Epetra_CrsMatrix::ReplaceGlobalValues(row, n_columns,
+                                                               const_cast<double *>(col_value_ptr),
+                                                               col_index_ptr);
+      }
+    else
+      {
+        // When we're at off-processor data, we
+        // have to stick with the standard
+        // Insert/ReplaceGlobalValues
+        // function. Nevertheless, the way we
+        // call it is the fastest one (any other
+        // will lead to repeated allocation and
+        // deallocation of memory in order to
+        // call the function we already use,
+        // which is very inefficient if writing
+        // one element at a time).
+        compressed = false;
+
+        if (matrix->Filled() == false)
+          {
+            ierr = matrix->InsertGlobalValues (1, (int *)&row,
+                                               n_columns, col_index_ptr,
+                                               &col_value_ptr,
+                                               Epetra_FECrsMatrix::ROW_MAJOR);
+            if (ierr > 0)
+              ierr = 0;
+          }
+        else
+          ierr = matrix->ReplaceGlobalValues (1, (int *)&row,
+                                              n_columns, col_index_ptr,
+                                              &col_value_ptr,
+                                              Epetra_FECrsMatrix::ROW_MAJOR);
+      }
+
+    Assert (ierr <= 0, ExcAccessToNonPresentElement(row, col_index_ptr[0]));
+    AssertThrow (ierr >= 0, ExcTrilinosError(ierr));
+  }
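+
+  // A usage sketch for the set()/compress() protocol above (A, col and
+  // val are placeholders; the entry is assumed to be in the sparsity
+  // pattern):
+  //
+  //   TrilinosWrappers::SparseMatrix A;       // initialized elsewhere
+  //   std::vector<unsigned int>   col (1, 0);
+  //   std::vector<TrilinosScalar> val (1, 3.14);
+  //   A.set (/*row=*/0, col, val);            // last_action becomes Insert
+  //   A.compress ();                          // flush buffers before use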
+
+
+
+  inline
+  void
+  SparseMatrix::add (const unsigned int   i,
+                     const unsigned int   j,
+                     const TrilinosScalar value)
+  {
+    Assert (numbers::is_finite(value), ExcNumberNotFinite());
+
+    if (value == 0)
+      {
+        // we have to do the Insert/Add checks
+        // in any case
+        // to be consistent with the MPI
+        // communication model (see the comments
+        // in the documentation of
+        // TrilinosWrappers::Vector), but we can
+        // save some work if the addend is
+        // zero. These actions are carried out
+        // in any case when we pass on to the
+        // other function.
+        if (last_action == Insert)
+          {
+            int ierr;
+            ierr = matrix->GlobalAssemble(*column_space_map,
+                                          row_partitioner(), false);
+
+            Assert (ierr == 0, ExcTrilinosError(ierr));
+            (void)ierr; // removes -Wunused-but-set-variable in optimized mode
+          }
+
+        last_action = Add;
+
+        return;
+      }
+    else
+      add (i, 1, &j, &value, false);
+  }
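+
+  // A sketch of the resulting state machine (A, i and j are placeholders):
+  // even a zero addend switches last_action from Insert to Add, so that
+  // all processors agree on the pending communication mode:
+  //
+  //   A.set (i, j, 1.0);    // last_action == Insert
+  //   A.add (i, j, 0.0);    // flushes buffers, last_action == Add
+  //   A.add (i, j, 2.0);    // plain addition into an existing entry
+  //   A.compress ();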
+
+
+
+  inline
+  void
+  SparseMatrix::add (const std::vector<unsigned int> &indices,
+                     const FullMatrix<TrilinosScalar> &values,
+                     const bool                        elide_zero_values)
+  {
+    Assert (indices.size() == values.m(),
+            ExcDimensionMismatch(indices.size(), values.m()));
+    Assert (values.m() == values.n(), ExcNotQuadratic());
+
+    for (unsigned int i=0; i<indices.size(); ++i)
+      add (indices[i], indices.size(), &indices[0], &values(i,0),
+           elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::add (const std::vector<unsigned int> &row_indices,
+                     const std::vector<unsigned int> &col_indices,
+                     const FullMatrix<TrilinosScalar> &values,
+                     const bool                        elide_zero_values)
+  {
+    Assert (row_indices.size() == values.m(),
+            ExcDimensionMismatch(row_indices.size(), values.m()));
+    Assert (col_indices.size() == values.n(),
+            ExcDimensionMismatch(col_indices.size(), values.n()));
+
+    for (unsigned int i=0; i<row_indices.size(); ++i)
+      add (row_indices[i], col_indices.size(), &col_indices[0],
+           &values(i,0), elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::add (const unsigned int                 row,
+                     const std::vector<unsigned int>   &col_indices,
+                     const std::vector<TrilinosScalar> &values,
+                     const bool                         elide_zero_values)
+  {
+    Assert (col_indices.size() == values.size(),
+            ExcDimensionMismatch(col_indices.size(), values.size()));
+
+    add (row, col_indices.size(), &col_indices[0], &values[0],
+         elide_zero_values);
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::add (const unsigned int    row,
+                     const unsigned int    n_cols,
+                     const unsigned int   *col_indices,
+                     const TrilinosScalar *values,
+                     const bool            elide_zero_values,
+                     const bool            /*col_indices_are_sorted*/)
+  {
+    int ierr;
+    if (last_action == Insert)
+      {
+        // TODO: this could lead to a deadlock when only one processor
+        // calls GlobalAssemble.
+        ierr = matrix->GlobalAssemble(*column_space_map,
+                                      row_partitioner(), false);
+
+        AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+      }
+
+    last_action = Add;
+
+    int *col_index_ptr;
+    TrilinosScalar const *col_value_ptr;
+    int n_columns;
+
+    // If we don't elide zeros, the pointers
+    // are already available...
+    if (elide_zero_values == false)
+      {
+        col_index_ptr = (int *)col_indices;
+        col_value_ptr = values;
+        n_columns = n_cols;
+#ifdef DEBUG
+        for (unsigned int j=0; j<n_cols; ++j)
+          Assert (numbers::is_finite(values[j]), ExcNumberNotFinite());
+#endif
+      }
+    else
+      {
+        // Otherwise, extract nonzero values in
+        // each row and the corresponding index.
+        if (column_indices.size() < n_cols)
+          {
+            column_indices.resize(n_cols);
+            column_values.resize(n_cols);
+          }
+
+        n_columns = 0;
+        for (unsigned int j=0; j<n_cols; ++j)
+          {
+            const double value = values[j];
+            Assert (numbers::is_finite(value), ExcNumberNotFinite());
+            if (value != 0)
+              {
+                column_indices[n_columns] = col_indices[j];
+                column_values[n_columns] = value;
+                n_columns++;
+              }
+          }
+
+        Assert(n_columns <= (int)n_cols, ExcInternalError());
+
+        col_index_ptr = (int *)&column_indices[0];
+        col_value_ptr = &column_values[0];
+      }
+
+    // If the calling matrix owns the row to
+    // which we want to add values, we
+    // can directly call the Epetra_CrsMatrix
+    // input function, which is much faster
+    // than the Epetra_FECrsMatrix function.
+    if (row_partitioner().MyGID(static_cast<int>(row)) == true)
+      {
+        ierr = matrix->Epetra_CrsMatrix::SumIntoGlobalValues(row, n_columns,
+                                                             const_cast<double *>(col_value_ptr),
+                                                             col_index_ptr);
+      }
+    else
+      {
+        // When we're at off-processor data, we
+        // have to stick with the standard
+        // SumIntoGlobalValues
+        // function. Nevertheless, the way we
+        // call it is the fastest one (any other
+        // will lead to repeated allocation and
+        // deallocation of memory in order to
+        // call the function we already use,
+        // which is very inefficient if writing
+        // one element at a time).
+        compressed = false;
+
+        ierr = matrix->SumIntoGlobalValues (1, (int *)&row, n_columns,
+                                            col_index_ptr,
+                                            &col_value_ptr,
+                                            Epetra_FECrsMatrix::ROW_MAJOR);
+      }
+
+#ifdef DEBUG
+    if (ierr > 0)
+      {
+        std::cout << "------------------------------------------"
+                  << std::endl;
+        std::cout << "Got error " << ierr << " in row " << row
+                  << " of proc " << row_partitioner().Comm().MyPID()
+                  << " when trying to add the columns:" << std::endl;
+        for (int i=0; i<n_columns; ++i)
+          std::cout << col_index_ptr[i] << " ";
+        std::cout << std::endl << std::endl;
+        std::cout << "Matrix row has the following indices:" << std::endl;
+        int n_indices, *indices;
+        trilinos_sparsity_pattern().ExtractMyRowView(row_partitioner().LID(static_cast<int>(row)),
+                                                     n_indices,
+                                                     indices);
+        for (int i=0; i<n_indices; ++i)
+          std::cout << indices[i] << " ";
+        std::cout << std::endl << std::endl;
+        Assert (ierr <= 0,
+                ExcAccessToNonPresentElement(row, col_index_ptr[0]));
+      }
+#endif
+    Assert (ierr >= 0, ExcTrilinosError(ierr));
+  }
+
+
+
+  // inline "simple" functions that are
+  // called frequently and do only involve
+  // a call to some Trilinos function.
+  inline
+  unsigned int
+  SparseMatrix::m () const
+  {
+    return matrix->NumGlobalRows();
+  }
+
+
+
+  inline
+  unsigned int
+  SparseMatrix::n () const
+  {
+    return matrix->NumGlobalCols();
+  }
+
+
+
+  inline
+  unsigned int
+  SparseMatrix::local_size () const
+  {
+    return matrix->NumMyRows();
+  }
+
+
+
+  inline
+  std::pair<unsigned int, unsigned int>
+  SparseMatrix::local_range () const
+  {
+    unsigned int begin, end;
+    begin = matrix->RowMap().MinMyGID();
+    end = matrix->RowMap().MaxMyGID()+1;
+
+    return std::make_pair (begin, end);
+  }
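+
+  // For example (a sketch; A is a placeholder for an already distributed
+  // matrix), the locally owned rows can be traversed as
+  //
+  //   const std::pair<unsigned int, unsigned int> range = A.local_range();
+  //   for (unsigned int row = range.first; row < range.second; ++row)
+  //     { /* ...work on the locally owned row... */ }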
+
+
+
+  inline
+  unsigned int
+  SparseMatrix::n_nonzero_elements () const
+  {
+    return matrix->NumGlobalNonzeros();
+  }
+
+
+
+  template <typename SparsityType>
+  inline
+  void SparseMatrix::reinit (const IndexSet      &parallel_partitioning,
+                             const SparsityType &sparsity_pattern,
+                             const MPI_Comm      &communicator,
+                             const bool           exchange_data)
+  {
+    Epetra_Map map = parallel_partitioning.make_trilinos_map (communicator, false);
+    reinit (map, map, sparsity_pattern, exchange_data);
+  }
+
+
+
+  template <typename SparsityType>
+  inline
+  void SparseMatrix::reinit (const IndexSet      &row_parallel_partitioning,
+                             const IndexSet      &col_parallel_partitioning,
+                             const SparsityType &sparsity_pattern,
+                             const MPI_Comm      &communicator,
+                             const bool           exchange_data)
+  {
+    Epetra_Map row_map =
+      row_parallel_partitioning.make_trilinos_map (communicator, false);
+    Epetra_Map col_map =
+      col_parallel_partitioning.make_trilinos_map (communicator, false);
+    reinit (row_map, col_map, sparsity_pattern, exchange_data);
+  }
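+
+  // A typical call sequence might look as follows (a sketch; the IndexSet
+  // and the sparsity pattern csp are assumed to come from the
+  // application, e.g. from a distributed DoFHandler):
+  //
+  //   IndexSet locally_owned = ...;      // rows owned by this process
+  //   TrilinosWrappers::SparseMatrix A;
+  //   A.reinit (locally_owned, csp, MPI_COMM_WORLD, false);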
+
+
+
+  template <typename number>
+  inline
+  void SparseMatrix::reinit (const IndexSet      &parallel_partitioning,
+                             const ::dealii::SparseMatrix<number> &sparse_matrix,
+                             const MPI_Comm      &communicator,
+                             const double         drop_tolerance,
+                             const bool           copy_values,
+                             const ::dealii::SparsityPattern *use_this_sparsity)
+  {
+    Epetra_Map map = parallel_partitioning.make_trilinos_map (communicator, false);
+    reinit (map, map, sparse_matrix, drop_tolerance, copy_values,
+            use_this_sparsity);
+  }
+
+
+
+  template <typename number>
+  inline
+  void SparseMatrix::reinit (const IndexSet      &row_parallel_partitioning,
+                             const IndexSet      &col_parallel_partitioning,
+                             const ::dealii::SparseMatrix<number> &sparse_matrix,
+                             const MPI_Comm      &communicator,
+                             const double         drop_tolerance,
+                             const bool           copy_values,
+                             const ::dealii::SparsityPattern *use_this_sparsity)
+  {
+    Epetra_Map row_map =
+      row_parallel_partitioning.make_trilinos_map (communicator, false);
+    Epetra_Map col_map =
+      col_parallel_partitioning.make_trilinos_map (communicator, false);
+    reinit (row_map, col_map, sparse_matrix, drop_tolerance, copy_values,
+            use_this_sparsity);
+  }
+
+
+
+  inline
+  TrilinosScalar
+  SparseMatrix::l1_norm () const
+  {
+    Assert (matrix->Filled(), ExcMatrixNotCompressed());
+    return matrix->NormOne();
+  }
+
+
+
+  inline
+  TrilinosScalar
+  SparseMatrix::linfty_norm () const
+  {
+    Assert (matrix->Filled(), ExcMatrixNotCompressed());
+    return matrix->NormInf();
+  }
+
+
+
+  inline
+  TrilinosScalar
+  SparseMatrix::frobenius_norm () const
+  {
+    Assert (matrix->Filled(), ExcMatrixNotCompressed());
+    return matrix->NormFrobenius();
+  }
+
+
+
+  inline
+  SparseMatrix &
+  SparseMatrix::operator *= (const TrilinosScalar a)
+  {
+    const int ierr = matrix->Scale (a);
+    Assert (ierr == 0, ExcTrilinosError(ierr));
+    (void)ierr; // removes -Wunused-variable in optimized mode
+
+    return *this;
+  }
+
+
+
+  inline
+  SparseMatrix &
+  SparseMatrix::operator /= (const TrilinosScalar a)
+  {
+    Assert (a != 0, ExcDivideByZero());
+
+    const TrilinosScalar factor = 1./a;
+
+    const int ierr = matrix->Scale (factor);
+    Assert (ierr == 0, ExcTrilinosError(ierr));
+    (void)ierr; // removes -Wunused-variable in optimized mode
+
+    return *this;
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::vmult (VectorBase       &dst,
+                       const VectorBase &src) const
+  {
+    Assert (&src != &dst, ExcSourceEqualsDestination());
+    Assert (matrix->Filled(), ExcMatrixNotCompressed());
+
+    Assert (src.vector_partitioner().SameAs(matrix->DomainMap()) == true,
+            ExcMessage ("Column map of matrix does not fit with vector map!"));
+    Assert (dst.vector_partitioner().SameAs(matrix->RangeMap()) == true,
+            ExcMessage ("Row map of matrix does not fit with vector map!"));
+
+    const int ierr = matrix->Multiply (false, src.trilinos_vector(),
+                                       dst.trilinos_vector());
+    Assert (ierr == 0, ExcTrilinosError(ierr));
+    (void)ierr; // removes -Wunused-variable in optimized mode
+  }
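+
+  // Usage sketch (x and y are placeholder vectors whose maps match the
+  // domain and range maps of the matrix, respectively):
+  //
+  //   A.vmult (y, x);    // y = A*x; the matrix must be compressed first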
+
+
+
+  inline
+  void
+  SparseMatrix::vmult (parallel::distributed::Vector<TrilinosScalar>       &dst,
+                       const parallel::distributed::Vector<TrilinosScalar> &src) const
+  {
+    Assert (&src != &dst, ExcSourceEqualsDestination());
+    Assert (matrix->Filled(), ExcMatrixNotCompressed());
+
+    AssertDimension (dst.local_size(), static_cast<unsigned int>(matrix->RangeMap().NumMyElements()));
+    AssertDimension (src.local_size(), static_cast<unsigned int>(matrix->DomainMap().NumMyElements()));
+
+    Epetra_Vector tril_dst (View, matrix->RangeMap(), dst.begin());
+    Epetra_Vector tril_src (View, matrix->DomainMap(),
+                            const_cast<double *>(src.begin()));
+
+    const int ierr = matrix->Multiply (false, tril_src, tril_dst);
+    Assert (ierr == 0, ExcTrilinosError(ierr));
+    (void)ierr; // removes -Wunused-variable in optimized mode
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::Tvmult (VectorBase       &dst,
+                        const VectorBase &src) const
+  {
+    Assert (&src != &dst, ExcSourceEqualsDestination());
+    Assert (matrix->Filled(), ExcMatrixNotCompressed());
+
+    Assert (src.vector_partitioner().SameAs(matrix->RangeMap()) == true,
+            ExcMessage ("Column map of matrix does not fit with vector map!"));
+    Assert (dst.vector_partitioner().SameAs(matrix->DomainMap()) == true,
+            ExcMessage ("Row map of matrix does not fit with vector map!"));
+
+    const int ierr = matrix->Multiply (true, src.trilinos_vector(),
+                                       dst.trilinos_vector());
+    Assert (ierr == 0, ExcTrilinosError(ierr));
+    (void)ierr; // removes -Wunused-variable in optimized mode
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::Tvmult (parallel::distributed::Vector<TrilinosScalar>      &dst,
+                        const parallel::distributed::Vector<TrilinosScalar> &src) const
+  {
+    Assert (&src != &dst, ExcSourceEqualsDestination());
+    Assert (matrix->Filled(), ExcMatrixNotCompressed());
+
+    AssertDimension (dst.local_size(), static_cast<unsigned int>(matrix->DomainMap().NumMyElements()));
+    AssertDimension (src.local_size(), static_cast<unsigned int>(matrix->RangeMap().NumMyElements()));
+
+    Epetra_Vector tril_dst (View, matrix->DomainMap(), dst.begin());
+    Epetra_Vector tril_src (View, matrix->RangeMap(),
+                            const_cast<double *>(src.begin()));
+
+    const int ierr = matrix->Multiply (true, tril_src, tril_dst);
+    Assert (ierr == 0, ExcTrilinosError(ierr));
+    (void)ierr; // removes -Wunused-variable in optimized mode
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::vmult_add (VectorBase       &dst,
+                           const VectorBase &src) const
+  {
+    Assert (&src != &dst, ExcSourceEqualsDestination());
+
+    // Choose to reinit the vector with fast
+    // argument set, which does not overwrite
+    // the content -- this is what we need
+    // since we're going to overwrite that
+    // anyway in the vmult operation.
+    temp_vector.reinit(dst, true);
+
+    vmult (temp_vector, src);
+    dst += temp_vector;
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::Tvmult_add (VectorBase       &dst,
+                            const VectorBase &src) const
+  {
+    Assert (&src != &dst, ExcSourceEqualsDestination());
+
+    temp_vector.reinit(dst, true);
+
+    Tvmult (temp_vector, src);
+    dst += temp_vector;
+  }
+
+
+
+  inline
+  TrilinosScalar
+  SparseMatrix::matrix_norm_square (const VectorBase &v) const
+  {
+    Assert (row_partitioner().SameAs(domain_partitioner()),
+            ExcNotQuadratic());
+
+    temp_vector.reinit(v);
+
+    vmult (temp_vector, v);
+    return temp_vector*v;
+  }
+
+
+
+  inline
+  TrilinosScalar
+  SparseMatrix::matrix_scalar_product (const VectorBase &u,
+                                       const VectorBase &v) const
+  {
+    Assert (row_partitioner().SameAs(domain_partitioner()),
+            ExcNotQuadratic());
+
+    temp_vector.reinit(v);
+
+    vmult (temp_vector, v);
+    return u*temp_vector;
+  }
+
+
+
+  inline
+  TrilinosScalar
+  SparseMatrix::residual (VectorBase       &dst,
+                          const VectorBase &x,
+                          const VectorBase &b) const
+  {
+    vmult (dst, x);
+    dst -= b;
+    dst *= -1.;
+
+    return dst.l2_norm();
+  }
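+
+  // In other words (a sketch; r, x and b are placeholder vectors with
+  // suitable maps): the residual r = b - A*x and its l2-norm are obtained
+  // in a single call,
+  //
+  //   const TrilinosScalar res = A.residual (r, x, b);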
+
+
+  inline
+  const Epetra_CrsMatrix &
+  SparseMatrix::trilinos_matrix () const
+  {
+    return static_cast<const Epetra_CrsMatrix &>(*matrix);
+  }
+
+
+
+  inline
+  const Epetra_CrsGraph &
+  SparseMatrix::trilinos_sparsity_pattern () const
+  {
+    return matrix->Graph();
+  }
+
+
+
+  inline
+  const Epetra_Map &
+  SparseMatrix::domain_partitioner () const
+  {
+    return matrix->DomainMap();
+  }
+
+
+
+  inline
+  const Epetra_Map &
+  SparseMatrix::range_partitioner () const
+  {
+    return matrix->RangeMap();
+  }
+
+
+
+  inline
+  const Epetra_Map &
+  SparseMatrix::row_partitioner () const
+  {
+    return matrix->RowMap();
+  }
+
+
+
+  inline
+  const Epetra_Map &
+  SparseMatrix::col_partitioner () const
+  {
+    return matrix->ColMap();
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::prepare_add()
+  {
+    //nothing to do here
+  }
+
+
+
+  inline
+  void
+  SparseMatrix::prepare_set()
+  {
+    //nothing to do here
+  }
+
+
+
+#endif // DOXYGEN
+
+}
+
+
+DEAL_II_NAMESPACE_CLOSE
+
+
+#endif // DEAL_II_USE_TRILINOS
+
+
+/*-----------------------   trilinos_sparse_matrix.h     --------------------*/
+
+#endif
+/*-----------------------   trilinos_sparse_matrix.h     --------------------*/

Added: branches/s-wang2/for_deal.II/include/deal.II/lac/trilinos_vector_base.h
===================================================================
--- branches/s-wang2/for_deal.II/include/deal.II/lac/trilinos_vector_base.h	                        (rev 0)
+++ branches/s-wang2/for_deal.II/include/deal.II/lac/trilinos_vector_base.h	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,1997 @@
+//---------------------------------------------------------------------------
+//    $Id: trilinos_vector_base.h 27628 2012-11-20 22:49:26Z heister $
+//
+//    Copyright (C) 2008, 2009, 2010, 2011, 2012 by the deal.II authors
+//
+//    This file is subject to QPL and may not be  distributed
+//    without copyright and license information. Please refer
+//    to the file deal.II/doc/license.html for the  text  and
+//    further information on this license.
+//
+//---------------------------------------------------------------------------
+#ifndef __deal2__trilinos_vector_base_h
+#define __deal2__trilinos_vector_base_h
+
+
+#include <deal.II/base/config.h>
+
+#ifdef DEAL_II_USE_TRILINOS
+
+#  include <deal.II/base/utilities.h>
+#  include <deal.II/base/std_cxx1x/shared_ptr.h>
+#  include <deal.II/base/subscriptor.h>
+#  include <deal.II/lac/exceptions.h>
+#  include <deal.II/lac/vector.h>
+
+#  include <vector>
+#  include <utility>
+#  include <memory>
+
+#  define TrilinosScalar double
+#  include "Epetra_ConfigDefs.h"
+#  ifdef DEAL_II_COMPILER_SUPPORTS_MPI // only if MPI is installed
+#    include "mpi.h"
+#    include "Epetra_MpiComm.h"
+#  else
+#    include "Epetra_SerialComm.h"
+#  endif
+#  include "Epetra_FEVector.h"
+
+DEAL_II_NAMESPACE_OPEN
+
+// forward declaration
+template <typename number> class Vector;
+
+
+/**
+ * @addtogroup TrilinosWrappers
+ *@{
+ */
+namespace TrilinosWrappers
+{
+  // forward declaration
+  class VectorBase;
+
+
+  /**
+   * @cond internal
+   */
+
+  /**
+   * A namespace for internal implementation details of the
+   * TrilinosWrapper members.
+   *
+   * @ingroup TrilinosWrappers
+   */
+  namespace internal
+  {
+    /**
+     * This class implements a
+     * wrapper for accessing the
+     * Trilinos vector in the same
+     * way as we access deal.II
+     * objects: it is initialized
+     * with a vector and an element
+     * within it, and has a
+     * conversion operator to
+     * extract the scalar value of
+     * this element. It also has a
+     * variety of assignment
+     * operators for writing to this
+     * one element.
+     *
+     * @ingroup TrilinosWrappers
+     */
+    class VectorReference
+    {
+    private:
+      /**
+       * Constructor. It is made
+       * private so as to only allow
+       * the actual vector class to
+       * create it.
+       */
+      VectorReference (VectorBase        &vector,
+                       const unsigned int index);
+
+    public:
+      /**
+       * This looks like a copy
+       * operator, but does something
+       * different than usual. In
+       * particular, it does not copy
+       * the member variables of this
+       * reference. Rather, it
+       * handles the situation where
+       * we have two vectors @p v and
+       * @p w, and assign elements
+       * like in
+       * <tt>v(i)=w(i)</tt>. Here,
+       * both left and right hand
+       * side of the assignment have
+       * data type VectorReference,
+       * but what we really mean is
+       * to assign the vector
+       * elements represented by the
+       * two references. This
+       * operator implements this
+       * operation. Note also that
+       * this allows us to make the
+       * assignment operator const.
+       */
+      const VectorReference &
+      operator = (const VectorReference &r) const;
+
+      /**
+       * Same as above but for non-const
+       * reference objects.
+       */
+      const VectorReference &
+      operator = (const VectorReference &r);
+
+      /**
+       * Set the referenced element of the
+       * vector to <tt>s</tt>.
+       */
+      const VectorReference &
+      operator = (const TrilinosScalar &s) const;
+
+      /**
+       * Add <tt>s</tt> to the
+       * referenced element of the
+       * vector.
+       */
+      const VectorReference &
+      operator += (const TrilinosScalar &s) const;
+
+      /**
+       * Subtract <tt>s</tt> from the
+       * referenced element of the
+       * vector.
+       */
+      const VectorReference &
+      operator -= (const TrilinosScalar &s) const;
+
+      /**
+       * Multiply the referenced
+       * element of the vector by
+       * <tt>s</tt>.
+       */
+      const VectorReference &
+      operator *= (const TrilinosScalar &s) const;
+
+      /**
+       * Divide the referenced
+       * element of the vector by
+       * <tt>s</tt>.
+       */
+      const VectorReference &
+      operator /= (const TrilinosScalar &s) const;
+
+      /**
+       * Convert the reference to an
+       * actual value, i.e. return
+       * the value of the referenced
+       * element of the vector.
+       */
+      operator TrilinosScalar () const;
+
+      /**
+       * Exception
+       */
+      DeclException1 (ExcTrilinosError,
+                      int,
+                      << "An error with error number " << arg1
+                      << " occurred while calling a Trilinos function");
+
+      /**
+       * Exception
+       */
+      DeclException3 (ExcAccessToNonLocalElement,
+                      int, int, int,
+                      << "You tried to access element " << arg1
+                      << " of a distributed vector, but only elements "
+                      << arg2 << " through " << arg3
+                      << " are stored locally and can be accessed.");
+
+    private:
+      /**
+       * Point to the vector we are
+       * referencing.
+       */
+      VectorBase   &vector;
+
+      /**
+       * Index of the referenced element
+       * of the vector.
+       */
+      const unsigned int  index;
+
+      /**
+       * Make the vector class a
+       * friend, so that it can
+       * create objects of the
+       * present type.
+       */
+      friend class ::dealii::TrilinosWrappers::VectorBase;
+    };
+  }
+  /**
+   * @endcond
+   */
+
+
+  /**
+   * Base class for the two types of Trilinos vectors, the distributed
+   * memory vector MPI::Vector and a localized vector Vector. The latter
+   * is designed for use in either serial implementations or as a
+   * localized copy on each processor.  The implementation of this class
+   * is based on the Trilinos vector class Epetra_FEVector, the (parallel)
+   * partitioning of which is governed by an Epetra_Map. This means that
+   * the generic vector functionality can be implemented in this base
+   * class, while the definition of the partition map (and hence, the
+   * constructor and reinit function) has to be done in the derived
+   * classes. The Epetra_FEVector is precisely the kind of vector we deal
+   * with all the time - we typically get it from some assembly process,
+   * where entries not locally owned might need to be written and hence
+   * need to be forwarded to the owner. The only requirement for this
+   * class to work
+   * is that Trilinos is installed with the same compiler as is used for
+   * compilation of deal.II.
+   *
+   * The interface of this class is modeled after the existing Vector
+   * class in deal.II. It has almost the same member functions, and is
+   * often exchangeable. However, since Trilinos only supports a single
+   * scalar type (double), it is not templated, and only works with that
+   * type.
+   *
+   * Note that Trilinos only guarantees that operations do what you expect
+   * if the function @p GlobalAssemble has been called after vector
+   * assembly in order to distribute the data. Therefore, you need to call
+   * Vector::compress() before you actually use the vectors.
+   *
+   * @ingroup TrilinosWrappers
+   * @ingroup Vectors
+   * @author Martin Kronbichler, 2008
+   */
+  class VectorBase : public Subscriptor
+  {
+  public:
+    /**
+     * Declare some of the standard
+     * types used in all
+     * containers. These types
+     * parallel those in the
+     * <tt>C</tt> standard libraries
+     * <tt>vector<...></tt> class.
+     */
+    typedef TrilinosScalar            value_type;
+    typedef TrilinosScalar            real_type;
+    typedef std::size_t               size_type;
+    typedef internal::VectorReference reference;
+    typedef const internal::VectorReference const_reference;
+
+    /**
+     * @name 1: Basic Object-handling
+     */
+    //@{
+
+    /**
+     * Default constructor that
+     * generates an empty (zero size)
+     * vector. The function
+     * <tt>reinit()</tt> will have to
+     * give the vector the correct
+     * size and distribution among
+     * processes in case of an MPI
+     * run.
+     */
+    VectorBase ();
+
+    /**
+     * Copy constructor. Sets the
+     * dimension to that of the given
+     * vector, and copies all the
+     * elements.
+     */
+    VectorBase (const VectorBase &v);
+
+    /**
+     * Destructor
+     */
+    virtual ~VectorBase ();
+
+    /**
+     * Release all memory and return
+     * to a state just like after
+     * having called the default
+     * constructor.
+     */
+    void clear ();
+
+    /**
+     * Reinit functionality, sets the
+     * dimension and possibly the
+     * parallel partitioning (Epetra_Map)
+     * of the calling vector to the
+     * settings of the input vector.
+     */
+    void reinit (const VectorBase &v,
+                 const bool        fast = false);
+
+    /**
+     * Compress the underlying
+     * representation of the Trilinos
+     * object, i.e. flush the buffers
+     * of the vector object if it has
+     * any. This function is
+     * necessary after writing into a
+     * vector element-by-element and
+     * before anything else can be
+     * done on it.
+     *
+     * The (defaulted) argument can
+     * be used to specify the
+     * compress mode
+     * (<code>Add</code> or
+     * <code>Insert</code>) in case
+     * the vector has not been
+     * written to since the last
+     * time this function was
+     * called. The argument is
+     * ignored if the vector has
+     * been added or written to
+     * since the last time
+     * compress() was called.
+     *
+     * See @ref GlossCompress "Compressing distributed objects"
+     * for more information.
+     */
+    void compress (::dealii::VectorOperation::values operation
+                   =::dealii::VectorOperation::unknown);
+
+    /**
+    * @deprecated
+    */
+    void compress (const Epetra_CombineMode last_action);
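+
+    // A usage sketch for the two compress() variants above (v, indices
+    // and values are placeholders): after element-wise writes, compress()
+    // must be called collectively on all processors before the vector is
+    // used in any global operation, e.g.
+    //
+    //   v.add (indices, values);                        // assembly
+    //   v.compress (::dealii::VectorOperation::add);    // collective
+    //   const TrilinosScalar nrm = v.l2_norm ();        // now safe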
+
+    /**
+     * Returns the state of the
+     * vector, i.e., whether
+     * compress() has already been
+     * called after an operation
+     * requiring data exchange.
+     */
+    bool is_compressed () const;
+
+    /**
+     * Set all components of the
+     * vector to the given number @p
+     * s. Simply pass this down to
+     * the Trilinos Epetra object,
+     * but we still need to declare
+     * this function to make the
+     * example given in the
+     * discussion about making the
+     * constructor explicit work.
+     *
+     * Since the semantics of
+     * assigning a scalar to a vector
+     * are not immediately clear,
+     * this operator should really
+     * only be used if you want to
+     * set the entire vector to
+     * zero. This allows the
+     * intuitive notation
+     * <tt>v=0</tt>. Assigning other
+     * values is deprecated and may
+     * be disallowed in the future.
+     */
+    VectorBase &
+    operator = (const TrilinosScalar s);
+
+    /**
+     * Copy function. This function takes
+     * a VectorBase vector and copies all
+     * the elements. The target vector
+     * will have the same parallel
+     * distribution as the calling
+     * vector.
+     */
+    VectorBase &
+    operator = (const VectorBase &v);
+
+    /**
+     * Another copy function. This
+     * one takes a deal.II vector and
+     * copies it into a
+     * TrilinosWrapper vector. Note
+     * that since we do not provide
+     * any Epetra_map that tells
+     * about the partitioning of the
+     * vector among the MPI
+     * processes, the size of the
+     * TrilinosWrapper vector has to
+     * be the same as the size of the
+     * input vector. In order to
+     * change the map, use the
+     * reinit(const Epetra_Map
+     * &input_map) function.
+     */
+    template <typename Number>
+    VectorBase &
+    operator = (const ::dealii::Vector<Number> &v);
+
+    /**
+     * Test for equality. This
+     * function assumes that the
+     * present vector and the one to
+     * compare with have the same
+     * size already, since comparing
+     * vectors of different sizes
+     * does not make much sense anyway.
+     */
+    bool operator == (const VectorBase &v) const;
+
+    /**
+     * Test for inequality. This
+     * function assumes that the
+     * present vector and the one to
+     * compare with have the same
+     * size already, since comparing
+     * vectors of different sizes
+     * does not make much sense anyway.
+     */
+    bool operator != (const VectorBase &v) const;
+
+    /**
+     * Return the global dimension of
+     * the vector.
+     */
+    unsigned int size () const;
+
+    /**
+     * Return the local dimension of
+     * the vector, i.e. the number of
+     * elements stored on the present
+     * MPI process. For sequential
+     * vectors, this number is the
+     * same as size(), but for
+     * parallel vectors it may be
+     * smaller.
+     *
+     * To figure out which elements
+     * exactly are stored locally,
+     * use local_range().
+     *
+     * If the vector contains ghost
+     * elements, they are included in
+     * this number.
+     */
+    unsigned int local_size () const;
+
+    /**
+     * Return a pair of indices
+     * indicating which elements of
+     * this vector are stored
+     * locally. The first number is
+     * the index of the first element
+     * stored, the second the index
+     * of the one past the last one
+     * that is stored locally. If
+     * this is a sequential vector,
+     * then the result will be the
+     * pair (0,N), otherwise it will
+     * be a pair (i,i+n), where
+     * <tt>n=local_size()</tt>.
+     */
+    std::pair<unsigned int, unsigned int> local_range () const;
+
+    /**
+     * Return whether @p index is in
+     * the local range or not, see
+     * also local_range().
+     */
+    bool in_local_range (const unsigned int index) const;
+
+    /**
+     * Return whether the vector contains ghost
+     * elements. The answer is true if there
+     * are ghost elements on at least one
+     * process.
+     */
+    bool has_ghost_elements() const;
+
+    /**
+     * Return the scalar (inner)
+     * product of two vectors. The
+     * vectors must have the same
+     * size.
+     */
+    TrilinosScalar operator * (const VectorBase &vec) const;
+
+    /**
+     * Return square of the
+     * $l_2$-norm.
+     */
+    real_type norm_sqr () const;
+
+    /**
+     * Mean value of the elements of
+     * this vector.
+     */
+    TrilinosScalar mean_value () const;
+
+    /**
+     * Compute the minimal value of
+     * the elements of this vector.
+     */
+    TrilinosScalar minimal_value () const;
+
+    /**
+     * $l_1$-norm of the vector.  The
+     * sum of the absolute values.
+     */
+    real_type l1_norm () const;
+
+    /**
+     * $l_2$-norm of the vector.  The
+     * square root of the sum of the
+     * squares of the elements.
+     */
+    real_type l2_norm () const;
+
+    /**
+     * $l_p$-norm of the vector. The
+     * <i>p</i>th root of the sum of
+     * the <i>p</i>th powers of the
+     * absolute values of the
+     * elements.
+     */
+    real_type lp_norm (const TrilinosScalar p) const;
+
+    /**
+     * Maximum absolute value of the
+     * elements.
+     */
+    real_type linfty_norm () const;
+
+    /**
+     * Return the vector component
+     * with the minimal magnitude.
+     */
+    real_type min () const;    // shuqiangwang
+
+    /**
+     * Return the vector component
+     * with the maximal magnitude.
+     */
+    real_type max () const;
+
+    /**
+     * Return whether the vector
+     * contains only elements with
+     * value zero. This function is
+     * mainly for internal
+     * consistency checks and should
+     * seldom be used when not in
+     * debug mode since it takes quite
+     * some time.
+     */
+    bool all_zero () const;
+
+    /**
+     * Return @p true if the vector
+     * has no negative entries,
+     * i.e. all entries are zero or
+     * positive. This function is
+     * used, for example, to check
+     * whether refinement indicators
+     * are really all positive (or
+     * zero).
+     */
+    bool is_non_negative () const;
+    //@}
+
+
+    /**
+     * @name 2: Data-Access
+     */
+    //@{
+
+    /**
+     * Provide access to a given
+     * element, both read and write.
+     */
+    reference
+    operator () (const unsigned int index);
+
+    /**
+     * Provide read-only access to an
+     * element. This is equivalent to
+     * the <code>el()</code> command.
+     */
+    TrilinosScalar
+    operator () (const unsigned int index) const;
+
+    /**
+     * Provide access to a given
+     * element, both read and write.
+     *
+     * Exactly the same as operator().
+     */
+    reference
+    operator [] (const unsigned int index);
+
+    /**
+     * Provide read-only access to an
+     * element. This is equivalent to
+     * the <code>el()</code> command.
+     *
+     * Exactly the same as operator().
+     */
+    TrilinosScalar
+    operator [] (const unsigned int index) const;
+
+    /**
+     * Return the value of the vector
+     * entry <i>i</i>. Note that this
+     * function only works
+     * properly when we request
+     * data stored on the local
+     * processor. The function will
+     * throw an exception in case the
+     * element sits on another
+     * process.
+     */
+    TrilinosScalar el (const unsigned int index) const;
+
+    /**
+     * A collective set operation:
+     * instead of setting individual
+     * elements of a vector, this
+     * function allows to set a whole
+     * set of elements at once. The
+     * indices of the elements to be
+     * set are stated in the first
+     * argument, the corresponding
+     * values in the second.
+     */
+    void set (const std::vector<unsigned int>    &indices,
+              const std::vector<TrilinosScalar> &values);
+
+    /**
+     * This is a second collective
+     * set operation. As a
+     * difference, this function
+     * takes a deal.II vector of
+     * values.
+     */
+    void set (const std::vector<unsigned int>        &indices,
+              const ::dealii::Vector<TrilinosScalar> &values);
+    //@}
+
+
+    /**
+     * @name 3: Modification of vectors
+     */
+    //@{
+
+    /**
+     * This collective set operation
+     * is of lower level and can
+     * handle anything else &mdash;
+     * the only thing you have to
+     * provide is an address where
+     * all the indices are stored and
+     * the number of elements to be
+     * set.
+     */
+    void set (const unsigned int    n_elements,
+              const unsigned int   *indices,
+              const TrilinosScalar *values);
+
+    /**
+     * A collective add operation:
+     * This function adds a whole
+     * set of values stored in @p
+     * values to the vector
+     * components specified by @p
+     * indices.
+     */
+    void add (const std::vector<unsigned int>   &indices,
+              const std::vector<TrilinosScalar> &values);
+
+    /**
+     * This is a second collective
+     * add operation. As a
+     * difference, this function
+     * takes a deal.II vector of
+     * values.
+     */
+    void add (const std::vector<unsigned int>        &indices,
+              const ::dealii::Vector<TrilinosScalar> &values);
+
+    /**
+     * Take an address where
+     * <tt>n_elements</tt> are stored
+     * contiguously and add them into
+     * the vector. Handles all cases
+     * which are not covered by the
+     * other two <tt>add()</tt>
+     * functions above.
+     */
+    void add (const unsigned int    n_elements,
+              const unsigned int   *indices,
+              const TrilinosScalar *values);
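+
+    // A sketch of the intended use of the add() family above (v, idx and
+    // val are placeholders; v must not store ghost elements):
+    //
+    //   std::vector<unsigned int>   idx (2);
+    //   idx[0] = 0;  idx[1] = 5;
+    //   std::vector<TrilinosScalar> val (2, 1.0);
+    //   v.add (idx, val);                               // possibly remote
+    //   v.compress (::dealii::VectorOperation::add);    // ship to owners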
+
+    /**
+     * Multiply the entire vector by
+     * a fixed factor.
+     */
+    VectorBase &operator *= (const TrilinosScalar factor);
+
+    /**
+     * Divide the entire vector by a
+     * fixed factor.
+     */
+    VectorBase &operator /= (const TrilinosScalar factor);
+
+    /**
+     * Add the given vector to the
+     * present one.
+     */
+    VectorBase &operator += (const VectorBase &V);
+
+    /**
+     * Subtract the given vector from
+     * the present one.
+     */
+    VectorBase &operator -= (const VectorBase &V);
+
+    /**
+     * Addition of @p s to all
+     * components. Note that @p s is
+     * a scalar and not a vector.
+     */
+    void add (const TrilinosScalar s);
+
+    /**
+     * Simple vector addition, equal
+     * to the <tt>operator
+     * +=</tt>.
+     *
+     * Though, if the second argument
+     * <tt>allow_different_maps</tt>
+     * is set, then it is possible to
+     * add data from a different map.
+     */
+    void add (const VectorBase &V,
+              const bool        allow_different_maps = false);
+
+    /**
+     * Simple addition of a multiple
+     * of a vector, i.e. <tt>*this +=
+     * a*V</tt>.
+     */
+    void add (const TrilinosScalar  a,
+              const VectorBase     &V);
+
+    /**
+     * Multiple addition of scaled
+     * vectors, i.e. <tt>*this += a*V +
+     * b*W</tt>.
+     */
+    void add (const TrilinosScalar  a,
+              const VectorBase     &V,
+              const TrilinosScalar  b,
+              const VectorBase     &W);
+
+    /**
+     * Scaling and simple vector
+     * addition, i.e.  <tt>*this =
+     * s*(*this) + V</tt>.
+     */
+    void sadd (const TrilinosScalar  s,
+               const VectorBase     &V);
+
+    /**
+     * Scaling and simple addition,
+     * i.e.  <tt>*this = s*(*this) +
+     * a*V</tt>.
+     */
+    void sadd (const TrilinosScalar  s,
+               const TrilinosScalar  a,
+               const VectorBase     &V);
+
+    /**
+     * Scaling and multiple addition.
+     */
+    void sadd (const TrilinosScalar  s,
+               const TrilinosScalar  a,
+               const VectorBase     &V,
+               const TrilinosScalar  b,
+               const VectorBase     &W);
+
+    /**
+     * Scaling and multiple addition.
+     * <tt>*this = s*(*this) + a*V +
+     * b*W + c*X</tt>.
+     */
+    void sadd (const TrilinosScalar  s,
+               const TrilinosScalar  a,
+               const VectorBase     &V,
+               const TrilinosScalar  b,
+               const VectorBase     &W,
+               const TrilinosScalar  c,
+               const VectorBase     &X);
+
+    /**
+     * Scale each element of this
+     * vector by the corresponding
+     * element in the argument. This
+     * function is mostly meant to
+     * simulate multiplication (and
+     * immediate re-assignment) by a
+     * diagonal scaling matrix.
+     */
+    void scale (const VectorBase &scaling_factors);
+
+    /**
+     * Assignment <tt>*this =
+     * a*V</tt>.
+     */
+    void equ (const TrilinosScalar  a,
+              const VectorBase     &V);
+
+    /**
+     * Assignment <tt>*this = a*V +
+     * b*W</tt>.
+     */
+    void equ (const TrilinosScalar  a,
+              const VectorBase     &V,
+              const TrilinosScalar  b,
+              const VectorBase     &W);
+
+    /**
+     * Compute the elementwise ratio
+     * of the two given vectors, that
+     * is let <tt>this[i] =
+     * a[i]/b[i]</tt>. This is useful
+     * for example if you want to
+     * compute the cellwise ratio of
+     * true to estimated error.
+     *
+     * This vector is appropriately
+     * scaled to hold the result.
+     *
+     * If any of the <tt>b[i]</tt> is
+     * zero, the result is
+     * undefined. No attempt is made
+     * to catch such situations.
+     */
+    void ratio (const VectorBase &a,
+                const VectorBase &b);
+    //@}
+
+
+    /**
+     * @name 4: Mixed stuff
+     */
+    //@{
+
+    /**
+     * Return a const reference to the
+     * underlying Trilinos
+     * Epetra_MultiVector class.
+     */
+    const Epetra_MultiVector &trilinos_vector () const;
+
+    /**
+     * Return a (modifiable) reference to
+     * the underlying Trilinos
+     * Epetra_FEVector class.
+     */
+    Epetra_FEVector &trilinos_vector ();
+
+    /**
+     * Return a const reference to the
+     * underlying Trilinos Epetra_Map
+     * that sets the parallel
+     * partitioning of the vector.
+     */
+    const Epetra_Map &vector_partitioner () const;
+
+    /**
+     *  Output of vector in
+     *  user-defined format in analogy
+     *  to the dealii::Vector<number>
+     *  class.
+     */
+    void print (const char *format = 0) const;
+
+    /**
+     * Print to a stream. @p
+     * precision denotes the desired
+     * precision with which values
+     * shall be printed, @p
+     * scientific whether scientific
+     * notation shall be used. If @p
+     * across is @p true then the
+     * vector is printed in a line,
+     * while if @p false then the
+     * elements are printed on a
+     * separate line each.
+     */
+    void print (std::ostream       &out,
+                const unsigned int  precision  = 3,
+                const bool          scientific = true,
+                const bool          across     = true) const;
+
+    /**
+     * Swap the contents of this
+     * vector and the other vector @p
+     * v. One could do this operation
+     * with a temporary variable and
+     * copying over the data
+     * elements, but this function is
+     * significantly more efficient
+     * since it only swaps the
+     * pointers to the data of the
+     * two vectors and therefore does
+     * not need to allocate temporary
+     * storage and move data
+     * around. Note that the vectors
+     * need to be of the same size
+     * and be based on the same map.
+     *
+     * This function is analogous to
+     * the @p swap function of all C++
+     * standard containers. Also,
+     * there is a global function
+     * <tt>swap(u,v)</tt> that simply
+     * calls <tt>u.swap(v)</tt>,
+     * again in analogy to standard
+     * functions.
+     */
+    void swap (VectorBase &v);
+
+    /**
+     * Estimate for the memory
+     * consumption in bytes.
+     */
+    std::size_t memory_consumption () const;
+    //@}
+
+    /**
+     * Exception
+     */
+    DeclException0 (ExcGhostsPresent);
+
+    /**
+     * Exception
+     */
+    DeclException0 (ExcDifferentParallelPartitioning);
+
+    /**
+     * Exception
+     */
+    DeclException1 (ExcTrilinosError,
+                    int,
+                    << "An error with error number " << arg1
+                    << " occurred while calling a Trilinos function");
+
+    /**
+     * Exception
+     */
+    DeclException3 (ExcAccessToNonlocalElement,
+                    int, int, int,
+                    << "You tried to access element " << arg1
+                    << " of a distributed vector, but only entries "
+                    << arg2 << " through " << arg3
+                    << " are stored locally and can be accessed.");
+
+
+  private:
+    /**
+     * Trilinos doesn't allow to
+     * mix additions to matrix
+     * entries and overwriting them
+     * (to make synchronisation of
+     * parallel computations
+     * simpler). The way we do it
+     * is to, for each access
+     * operation, store whether it
+     * is an insertion or an
+     * addition. If the previous
+     * one was of different type,
+     * then we first have to flush
+     * the Trilinos buffers;
+     * otherwise, we can simply go
+     * on.  Luckily, Trilinos has
+     * an object for this which
+     * does already all the
+     * parallel communications in
+     * such a case, so we simply
+     * use their model, which
+     * stores whether the last
+     * operation was an addition or
+     * an insertion.
+     */
+    Epetra_CombineMode last_action;
+
+    /**
+     * A boolean variable to hold
+     * information on whether the
+     * vector is compressed or not.
+     */
+    bool compressed;
+
+    /**
+     * Whether this vector has ghost elements. This is true
+     * on all processors even if only one of them has any
+     * ghost elements.
+     */
+    bool has_ghosts;
+
+    /**
+     * An Epetra distributed vector
+     * type. Requires an existing
+     * Epetra_Map for storing data.
+     */
+    std_cxx1x::shared_ptr<Epetra_FEVector> vector;
+
+
+    /**
+     * Make the reference class a
+     * friend.
+     */
+    friend class internal::VectorReference;
+    friend class Vector;
+    friend class MPI::Vector;
+  };
+
+
+
+
+// ------------------- inline and template functions --------------
+
+  /**
+   * Global function swap which overloads the default implementation of
+   * the C standard library which uses a temporary object. The function
+   * simply exchanges the data of the two vectors.
+   *
+   * @relates TrilinosWrappers::VectorBase
+   * @author Martin Kronbichler, Wolfgang Bangerth, 2008
+   */
+  inline
+  void swap (VectorBase &u, VectorBase &v)
+  {
+    u.swap (v);
+  }
+
+
+#ifndef DOXYGEN
+
+  namespace internal
+  {
+    inline
+    VectorReference::VectorReference (VectorBase        &vector,
+                                      const unsigned int index)
+      :
+      vector (vector),
+      index (index)
+    {}
+
+
+    inline
+    const VectorReference &
+    VectorReference::operator = (const VectorReference &r) const
+    {
+      // as explained in the class
+      // documentation, this is not the copy
+      // operator. so simply pass on to the
+      // "correct" assignment operator
+      *this = static_cast<TrilinosScalar> (r);
+
+      return *this;
+    }
+
+
+
+    inline
+    const VectorReference &
+    VectorReference::operator = (const VectorReference &r)
+    {
+      // as above
+      *this = static_cast<TrilinosScalar> (r);
+
+      return *this;
+    }
+
+
+    inline
+    const VectorReference &
+    VectorReference::operator = (const TrilinosScalar &value) const
+    {
+      vector.set (1, &index, &value);
+      return *this;
+    }
+
+
+
+    inline
+    const VectorReference &
+    VectorReference::operator += (const TrilinosScalar &value) const
+    {
+      vector.add (1, &index, &value);
+      return *this;
+    }
+
+
+
+    inline
+    const VectorReference &
+    VectorReference::operator -= (const TrilinosScalar &value) const
+    {
+      TrilinosScalar new_value = -value;
+      vector.add (1, &index, &new_value);
+      return *this;
+    }
+
+
+
+    inline
+    const VectorReference &
+    VectorReference::operator *= (const TrilinosScalar &value) const
+    {
+      TrilinosScalar new_value = static_cast<TrilinosScalar>(*this) * value;
+      vector.set (1, &index, &new_value);
+      return *this;
+    }
+
+
+
+    inline
+    const VectorReference &
+    VectorReference::operator /= (const TrilinosScalar &value) const
+    {
+      TrilinosScalar new_value = static_cast<TrilinosScalar>(*this) / value;
+      vector.set (1, &index, &new_value);
+      return *this;
+    }
+  }
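+
+  // The net effect of the reference class above, as a sketch (v and i are
+  // placeholders): writing through operator() forwards to set() and add()
+  // on the owning vector,
+  //
+  //   v(i)  = 1.0;    // calls vector.set (1, &i, &value)
+  //   v(i) += 2.0;    // calls vector.add (1, &i, &value)
+  //
+  // so a subsequent compress() is needed to make the result globally
+  // consistent.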
+
+
+
+  inline
+  bool
+  VectorBase::is_compressed () const
+  {
+    return compressed;
+  }
+
+
+
+  inline
+  bool
+  VectorBase::in_local_range (const unsigned int index) const
+  {
+    std::pair<unsigned int, unsigned int> range = local_range();
+
+    return ((index >= range.first) && (index <  range.second));
+  }
+
+
+
+  inline
+  bool
+  VectorBase::has_ghost_elements() const
+  {
+    return has_ghosts;
+  }
+
+
+
+  inline
+  internal::VectorReference
+  VectorBase::operator () (const unsigned int index)
+  {
+    return internal::VectorReference (*this, index);
+  }
+
+
+
+  inline
+  internal::VectorReference
+  VectorBase::operator [] (const unsigned int index)
+  {
+    return operator() (index);
+  }
+
+
+  inline
+  TrilinosScalar
+  VectorBase::operator [] (const unsigned int index) const
+  {
+    return operator() (index);
+  }
+
+
+
+  inline
+  void
+  VectorBase::reinit (const VectorBase &v,
+                      const bool        fast)
+  {
+    Assert (vector.get() != 0,
+            ExcMessage("Vector has not been constructed properly."));
+
+    if (fast == false ||
+        vector_partitioner().SameAs(v.vector_partitioner())==false)
+      vector.reset (new Epetra_FEVector(*v.vector));
+  }
+
+
+
+  inline
+  void
+  VectorBase::compress (const Epetra_CombineMode last_action)
+  {
+    ::dealii::VectorOperation::values last_action_ =
+      ::dealii::VectorOperation::unknown;
+    if (last_action == Add)
+      last_action_ = ::dealii::VectorOperation::add;
+    else if (last_action == Insert)
+      last_action_ = ::dealii::VectorOperation::insert;
+    else
+      AssertThrow(false, ExcNotImplemented());
+
+    compress(last_action_);
+  }
+
+
+
+  inline
+  void
+  VectorBase::compress (::dealii::VectorOperation::values given_last_action)
+  {
+    // Select which mode to send to
+    // Trilinos. Note that we use last_action
+    // if available, and ignore what the user
+    // tells us, in order to detect wrongly
+    // mixed operations. Typically, given_last_action
+    // is only used on machines that did not
+    // execute an operation (because they have
+    // no cells of their own, for example).
+    Epetra_CombineMode mode = last_action;
+    if (last_action == Zero)
+      {
+        if (given_last_action==::dealii::VectorOperation::add)
+          mode = Add;
+        else if (given_last_action==::dealii::VectorOperation::insert)
+          mode = Insert;
+      }
+
+#ifdef DEBUG
+#  ifdef DEAL_II_COMPILER_SUPPORTS_MPI
+    // check that every process has decided
+    // to use the same mode. This will
+    // otherwise result in undefined
+    // behaviour in the call to
+    // GlobalAssemble().
+    double double_mode = mode;
+    Utilities::MPI::MinMaxAvg result
+      = Utilities::MPI::min_max_avg (double_mode,
+                                     dynamic_cast<const Epetra_MpiComm *>
+                                     (&vector_partitioner().Comm())->GetMpiComm());
+    Assert(result.max-result.min<1e-5,
+           ExcMessage ("Not all processors agree whether the last operation on "
+                       "this vector was an addition or a set operation. This will "
+                       "prevent the compress() operation from succeeding."));
+
+#  endif
+#endif
+
+    // Now pass over the information about
+    // what we did last to the vector.
+    const int ierr = vector->GlobalAssemble(mode);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+    last_action = Zero;
+
+    compressed = true;
+  }
+
+
+
+  inline
+  VectorBase &
+  VectorBase::operator = (const TrilinosScalar s)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    Assert (numbers::is_finite(s), ExcNumberNotFinite());
+
+    const int ierr = vector->PutScalar(s);
+
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return *this;
+  }
+
+
+
+  inline
+  void
+  VectorBase::set (const std::vector<unsigned int>    &indices,
+                   const std::vector<TrilinosScalar> &values)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    Assert (indices.size() == values.size(),
+            ExcDimensionMismatch(indices.size(),values.size()));
+
+    set (indices.size(), &indices[0], &values[0]);
+  }
+
+
+
+  inline
+  void
+  VectorBase::set (const std::vector<unsigned int>        &indices,
+                   const ::dealii::Vector<TrilinosScalar> &values)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    Assert (indices.size() == values.size(),
+            ExcDimensionMismatch(indices.size(),values.size()));
+
+    set (indices.size(), &indices[0], values.begin());
+  }
+
+
+
+  inline
+  void
+  VectorBase::set (const unsigned int    n_elements,
+                   const unsigned int   *indices,
+                   const TrilinosScalar *values)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    if (last_action == Add)
+      vector->GlobalAssemble(Add);
+
+    if (last_action != Insert)
+      last_action = Insert;
+
+    for (unsigned int i=0; i<n_elements; ++i)
+      {
+        const unsigned int row = indices[i];
+        const int local_row = vector->Map().LID(static_cast<int>(row));
+        if (local_row == -1)
+          {
+            const int ierr = vector->ReplaceGlobalValues (1,
+                                                          (const int *)(&row),
+                                                          &values[i]);
+            AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+            compressed = false;
+          }
+        else
+          (*vector)[0][local_row] = values[i];
+      }
+  }
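+
+  // A usage sketch for the setters above (the vector `v` and the chosen
+  // indices are placeholders for illustration):
+  //
+  //   std::vector<unsigned int>   rows (2);  rows[0] = 0;  rows[1] = 5;
+  //   std::vector<TrilinosScalar> vals (2, 1.0);
+  //   v.set (rows, vals);
+  //   v.compress (::dealii::VectorOperation::insert);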
+
+
+
+  inline
+  void
+  VectorBase::add (const std::vector<unsigned int>    &indices,
+                   const std::vector<TrilinosScalar> &values)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (indices.size() == values.size(),
+            ExcDimensionMismatch(indices.size(),values.size()));
+
+    add (indices.size(), &indices[0], &values[0]);
+  }
+
+
+
+  inline
+  void
+  VectorBase::add (const std::vector<unsigned int>        &indices,
+                   const ::dealii::Vector<TrilinosScalar> &values)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (indices.size() == values.size(),
+            ExcDimensionMismatch(indices.size(),values.size()));
+
+    add (indices.size(), &indices[0], values.begin());
+  }
+
+
+
+  inline
+  void
+  VectorBase::add (const unsigned int    n_elements,
+                   const unsigned int   *indices,
+                   const TrilinosScalar *values)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    if (last_action != Add)
+      {
+        if (last_action == Insert)
+          vector->GlobalAssemble(Insert);
+        last_action = Add;
+      }
+
+    for (unsigned int i=0; i<n_elements; ++i)
+      {
+        const unsigned int row = indices[i];
+        const int local_row = vector->Map().LID(static_cast<int>(row));
+        if (local_row == -1)
+          {
+            const int ierr = vector->SumIntoGlobalValues (1,
+                                                          (const int *)(&row),
+                                                          &values[i]);
+            AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+            compressed = false;
+          }
+        else
+          (*vector)[0][local_row] += values[i];
+      }
+  }
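+
+  // The accumulating counterpart is used the same way; note that set()
+  // and add() phases must be separated by a compress() call, as enforced
+  // by the checks above. A sketch, reusing `rows` and `vals` from the
+  // previous example:
+  //
+  //   v.add (rows, vals);
+  //   v.compress (::dealii::VectorOperation::add);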
+
+
+
+  inline
+  unsigned int
+  VectorBase::size () const
+  {
+    return (unsigned int) (vector->Map().MaxAllGID() + 1 -
+                           vector->Map().MinAllGID());
+  }
+
+
+
+  inline
+  unsigned int
+  VectorBase::local_size () const
+  {
+    return (unsigned int) vector->Map().NumMyElements();
+  }
+
+
+
+  inline
+  std::pair<unsigned int, unsigned int>
+  VectorBase::local_range () const
+  {
+    int begin, end;
+    begin = vector->Map().MinMyGID();
+    end = vector->Map().MaxMyGID()+1;
+    return std::make_pair (begin, end);
+  }
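+
+  // A sketch of looping over the locally owned range (assuming the vector
+  // stores a contiguous range of elements; `sum` is a placeholder
+  // accumulator):
+  //
+  //   const std::pair<unsigned int, unsigned int> range = v.local_range();
+  //   for (unsigned int i=range.first; i<range.second; ++i)
+  //     sum += v(i);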
+
+
+
+  inline
+  TrilinosScalar
+  VectorBase::operator * (const VectorBase &vec) const
+  {
+    Assert (vector->Map().SameAs(vec.vector->Map()),
+            ExcDifferentParallelPartitioning());
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    TrilinosScalar result;
+
+    const int ierr = vector->Dot(*(vec.vector), &result);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return result;
+  }
+
+
+
+  inline
+  VectorBase::real_type
+  VectorBase::norm_sqr () const
+  {
+    const TrilinosScalar d = l2_norm();
+    return d*d;
+  }
+
+
+
+  inline
+  TrilinosScalar
+  VectorBase::mean_value () const
+  {
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    TrilinosScalar mean;
+    const int ierr = vector->MeanValue (&mean);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return mean;
+  }
+
+
+
+  inline
+  TrilinosScalar
+  VectorBase::minimal_value () const
+  {
+    TrilinosScalar min_value;
+    const int ierr = vector->MinValue (&min_value);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return min_value;
+  }
+
+
+
+  inline
+  VectorBase::real_type
+  VectorBase::l1_norm () const
+  {
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    TrilinosScalar d;
+    const int ierr = vector->Norm1 (&d);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return d;
+  }
+
+
+
+  inline
+  VectorBase::real_type
+  VectorBase::l2_norm () const
+  {
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    TrilinosScalar d;
+    const int ierr = vector->Norm2 (&d);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return d;
+  }
+
+
+
+  inline
+  VectorBase::real_type
+  VectorBase::lp_norm (const TrilinosScalar p) const
+  {
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    TrilinosScalar norm = 0;
+    TrilinosScalar sum=0;
+    const unsigned int n_local = local_size();
+
+    // loop over all locally stored elements,
+    // because Trilinos does not support lp
+    // norms directly
+    for (unsigned int i=0; i<n_local; i++)
+      sum += std::pow(std::fabs((*vector)[0][i]), p);
+
+    norm = std::pow(sum, static_cast<TrilinosScalar>(1./p));
+
+    return norm;
+  }
+
+
+
+  inline
+  VectorBase::real_type
+  VectorBase::linfty_norm () const
+  {
+    // while we disallow the other
+    // norm operations on ghosted
+    // vectors, this particular norm
+    // is safe to run even in the
+    // presence of ghost elements
+    TrilinosScalar d;
+    const int ierr = vector->NormInf (&d);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return d;
+  }
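+
+  // As a worked illustration of the norms above: for a vector with global
+  // entries (3, -4), l1_norm() returns 7, l2_norm() returns 5,
+  // linfty_norm() returns 4, and lp_norm(3.) returns (27+64)^(1/3).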
+
+
+
+  // We also inline scalar products, vector
+  // additions, etc., since they are all
+  // representable by a single Trilinos
+  // call. This reduces the overhead of the
+  // wrapper class.
+  inline
+  VectorBase &
+  VectorBase::operator *= (const TrilinosScalar a)
+  {
+    Assert (numbers::is_finite(a), ExcNumberNotFinite());
+
+    const int ierr = vector->Scale(a);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return *this;
+  }
+
+
+
+  inline
+  VectorBase &
+  VectorBase::operator /= (const TrilinosScalar a)
+  {
+    Assert (numbers::is_finite(a), ExcNumberNotFinite());
+
+    const TrilinosScalar factor = 1./a;
+
+    Assert (numbers::is_finite(factor), ExcNumberNotFinite());
+
+    const int ierr = vector->Scale(factor);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return *this;
+  }
+
+
+
+  inline
+  VectorBase &
+  VectorBase::operator += (const VectorBase &v)
+  {
+    Assert (size() == v.size(),
+            ExcDimensionMismatch(size(), v.size()));
+    Assert (vector->Map().SameAs(v.vector->Map()),
+            ExcDifferentParallelPartitioning());
+
+    const int ierr = vector->Update (1.0, *(v.vector), 1.0);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return *this;
+  }
+
+
+
+  inline
+  VectorBase &
+  VectorBase::operator -= (const VectorBase &v)
+  {
+    Assert (size() == v.size(),
+            ExcDimensionMismatch(size(), v.size()));
+    Assert (vector->Map().SameAs(v.vector->Map()),
+            ExcDifferentParallelPartitioning());
+
+    const int ierr = vector->Update (-1.0, *(v.vector), 1.0);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return *this;
+  }
+
+
+
+  inline
+  void
+  VectorBase::add (const TrilinosScalar s)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (numbers::is_finite(s), ExcNumberNotFinite());
+
+    unsigned int n_local = local_size();
+    for (unsigned int i=0; i<n_local; i++)
+      (*vector)[0][i] += s;
+  }
+
+
+
+  inline
+  void
+  VectorBase::add (const TrilinosScalar  a,
+                   const VectorBase     &v)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (local_size() == v.local_size(),
+            ExcDimensionMismatch(local_size(), v.local_size()));
+
+    Assert (numbers::is_finite(a), ExcNumberNotFinite());
+
+    const int ierr = vector->Update(a, *(v.vector), 1.);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+  }
+
+
+
+  inline
+  void
+  VectorBase::add (const TrilinosScalar  a,
+                   const VectorBase     &v,
+                   const TrilinosScalar  b,
+                   const VectorBase     &w)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (local_size() == v.local_size(),
+            ExcDimensionMismatch(local_size(), v.local_size()));
+    Assert (local_size() == w.local_size(),
+            ExcDimensionMismatch(local_size(), w.local_size()));
+
+    Assert (numbers::is_finite(a), ExcNumberNotFinite());
+    Assert (numbers::is_finite(b), ExcNumberNotFinite());
+
+    const int ierr = vector->Update(a, *(v.vector), b, *(w.vector), 1.);
+
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+  }
+
+
+
+  inline
+  void
+  VectorBase::sadd (const TrilinosScalar  s,
+                    const VectorBase     &v)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (local_size() == v.local_size(),
+            ExcDimensionMismatch(local_size(), v.local_size()));
+
+    Assert (numbers::is_finite(s), ExcNumberNotFinite());
+
+    const int ierr = vector->Update(1., *(v.vector), s);
+
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+  }
+
+
+
+  inline
+  void
+  VectorBase::sadd (const TrilinosScalar  s,
+                    const TrilinosScalar  a,
+                    const VectorBase     &v)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (local_size() == v.local_size(),
+            ExcDimensionMismatch(local_size(), v.local_size()));
+
+    Assert (numbers::is_finite(s), ExcNumberNotFinite());
+    Assert (numbers::is_finite(a), ExcNumberNotFinite());
+
+    const int ierr = vector->Update(a, *(v.vector), s);
+
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+  }
+
+
+
+  inline
+  void
+  VectorBase::sadd (const TrilinosScalar  s,
+                    const TrilinosScalar  a,
+                    const VectorBase     &v,
+                    const TrilinosScalar  b,
+                    const VectorBase     &w)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (local_size() == v.local_size(),
+            ExcDimensionMismatch(local_size(), v.local_size()));
+    Assert (local_size() == w.local_size(),
+            ExcDimensionMismatch(local_size(), w.local_size()));
+
+    Assert (numbers::is_finite(s), ExcNumberNotFinite());
+    Assert (numbers::is_finite(a), ExcNumberNotFinite());
+    Assert (numbers::is_finite(b), ExcNumberNotFinite());
+
+    const int ierr = vector->Update(a, *(v.vector), b, *(w.vector), s);
+
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+  }
+
+
+
+  inline
+  void
+  VectorBase::sadd (const TrilinosScalar  s,
+                    const TrilinosScalar  a,
+                    const VectorBase     &v,
+                    const TrilinosScalar  b,
+                    const VectorBase     &w,
+                    const TrilinosScalar  c,
+                    const VectorBase     &x)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (local_size() == v.local_size(),
+            ExcDimensionMismatch(local_size(), v.local_size()));
+    Assert (local_size() == w.local_size(),
+            ExcDimensionMismatch(local_size(), w.local_size()));
+    Assert (local_size() == x.local_size(),
+            ExcDimensionMismatch(local_size(), x.local_size()));
+
+    Assert (numbers::is_finite(s), ExcNumberNotFinite());
+    Assert (numbers::is_finite(a), ExcNumberNotFinite());
+    Assert (numbers::is_finite(b), ExcNumberNotFinite());
+    Assert (numbers::is_finite(c), ExcNumberNotFinite());
+
+    // the Update member function can take
+    // at most two source vectors, so do
+    // the update in two steps
+    const int ierr = vector->Update(a, *(v.vector), b, *(w.vector), s);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    const int jerr = vector->Update(c, *(x.vector), 1.);
+    Assert (jerr == 0, ExcTrilinosError(jerr));
+    (void)jerr; // removes -Wunused-variable warning in optimized mode
+  }
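+
+  // In formula form, the function above computes
+  //   *this = s*(*this) + a*v + b*w + c*x,
+  // split across two Update() calls because each call accepts at most two
+  // source vectors.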
+
+
+
+  inline
+  void
+  VectorBase::scale (const VectorBase &factors)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (local_size() == factors.local_size(),
+            ExcDimensionMismatch(local_size(), factors.local_size()));
+
+    const int ierr = vector->Multiply (1.0, *(factors.vector), *vector, 0.0);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+  }
+
+
+
+  inline
+  void
+  VectorBase::equ (const TrilinosScalar  a,
+                   const VectorBase     &v)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (numbers::is_finite(a), ExcNumberNotFinite());
+
+    // If we don't have the same map, copy.
+    if (vector->Map().SameAs(v.vector->Map())==false)
+      {
+        *vector = *v.vector;
+        *this *= a;
+      }
+    else
+      {
+        // Otherwise, just update
+        int ierr = vector->Update(a, *v.vector, 0.0);
+        AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+        last_action = Zero;
+      }
+
+  }
+
+
+
+  inline
+  void
+  VectorBase::equ (const TrilinosScalar  a,
+                   const VectorBase     &v,
+                   const TrilinosScalar  b,
+                   const VectorBase     &w)
+  {
+    // if we have ghost values, do not allow
+    // writing to this vector at all.
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+    Assert (v.local_size() == w.local_size(),
+            ExcDimensionMismatch (v.local_size(), w.local_size()));
+
+    Assert (numbers::is_finite(a), ExcNumberNotFinite());
+    Assert (numbers::is_finite(b), ExcNumberNotFinite());
+
+    // If we don't have the same map, copy.
+    if (vector->Map().SameAs(v.vector->Map())==false)
+      {
+        *vector = *v.vector;
+        sadd(a, b, w);
+      }
+    else
+      {
+        // Otherwise, just update. Verify
+        // that *this has the same map not
+        // only as v (checked in the
+        // if-condition above) but also as
+        // w
+        Assert (vector->Map().SameAs(w.vector->Map()),
+                ExcDifferentParallelPartitioning());
+        int ierr = vector->Update(a, *v.vector, b, *w.vector, 0.0);
+        AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+        last_action = Zero;
+      }
+  }
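+
+  // In formula form, the function above computes *this = a*v + b*w,
+  // adopting the parallel layout of v first if the maps differ.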
+
+
+
+  inline
+  void
+  VectorBase::ratio (const VectorBase &v,
+                     const VectorBase &w)
+  {
+    Assert (v.local_size() == w.local_size(),
+            ExcDimensionMismatch (v.local_size(), w.local_size()));
+    Assert (local_size() == w.local_size(),
+            ExcDimensionMismatch (local_size(), w.local_size()));
+
+    const int ierr = vector->ReciprocalMultiply(1.0, *(w.vector),
+                                                *(v.vector), 0.0);
+
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+  }
+
+
+
+  inline
+  const Epetra_MultiVector &
+  VectorBase::trilinos_vector () const
+  {
+    return static_cast<const Epetra_MultiVector &>(*vector);
+  }
+
+
+
+  inline
+  Epetra_FEVector &
+  VectorBase::trilinos_vector ()
+  {
+    return *vector;
+  }
+
+
+
+  inline
+  const Epetra_Map &
+  VectorBase::vector_partitioner () const
+  {
+    return static_cast<const Epetra_Map &>(vector->Map());
+  }
+
+
+#endif // DOXYGEN
+
+}
+
+/*@}*/
+
+DEAL_II_NAMESPACE_CLOSE
+
+#endif // DEAL_II_USE_TRILINOS
+
+/*----------------------------   trilinos_vector_base.h     ---------------------------*/
+
+#endif
+/*----------------------------   trilinos_vector_base.h     ---------------------------*/

Added: branches/s-wang2/for_deal.II/source/lac/constraint_matrix.cc
===================================================================
--- branches/s-wang2/for_deal.II/source/lac/constraint_matrix.cc	                        (rev 0)
+++ branches/s-wang2/for_deal.II/source/lac/constraint_matrix.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,2578 @@
+//---------------------------------------------------------------------------
+//    $Id: constraint_matrix.cc 27628 2012-11-20 22:49:26Z heister $
+//
+//    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 by the deal.II authors
+//
+//    This file is subject to QPL and may not be  distributed
+//    without copyright and license information. Please refer
+//    to the file deal.II/doc/license.html for the  text  and
+//    further information on this license.
+//
+//---------------------------------------------------------------------------
+
+
+#include <deal.II/lac/constraint_matrix.h>
+#include <deal.II/lac/constraint_matrix.templates.h>
+
+#include <deal.II/base/memory_consumption.h>
+#include <deal.II/lac/compressed_sparsity_pattern.h>
+#include <deal.II/lac/compressed_set_sparsity_pattern.h>
+#include <deal.II/lac/compressed_simple_sparsity_pattern.h>
+#include <deal.II/lac/block_vector.h>
+#include <deal.II/lac/block_sparse_matrix.h>
+#include <deal.II/lac/sparse_matrix_ez.h>
+#include <deal.II/lac/block_sparse_matrix_ez.h>
+#include <deal.II/lac/parallel_vector.h>
+#include <deal.II/lac/parallel_block_vector.h>
+#include <deal.II/lac/petsc_vector.h>
+#include <deal.II/lac/petsc_block_vector.h>
+#include <deal.II/lac/petsc_sparse_matrix.h>
+#include <deal.II/lac/petsc_block_sparse_matrix.h>
+#include <deal.II/lac/petsc_parallel_vector.h>
+#include <deal.II/lac/petsc_parallel_block_vector.h>
+#include <deal.II/lac/petsc_parallel_sparse_matrix.h>
+#include <deal.II/lac/petsc_parallel_block_sparse_matrix.h>
+#include <deal.II/lac/trilinos_vector.h>
+#include <deal.II/lac/trilinos_block_vector.h>
+#include <deal.II/lac/trilinos_sparse_matrix.h>
+#include <deal.II/lac/trilinos_block_sparse_matrix.h>
+#include <deal.II/lac/matrix_block.h>
+
+#include <algorithm>
+#include <numeric>
+#include <set>
+#include <ostream>
+
+DEAL_II_NAMESPACE_OPEN
+
+
+
+// Static member variable
+const Table<2,bool> ConstraintMatrix::default_empty_table = Table<2,bool>();
+
+
+
+bool
+ConstraintMatrix::check_zero_weight (const std::pair<unsigned int, double> &p)
+{
+  return (p.second == 0);
+}
+
+
+
+bool
+ConstraintMatrix::ConstraintLine::operator < (const ConstraintLine &a) const
+{
+  return line < a.line;
+}
+
+
+
+bool
+ConstraintMatrix::ConstraintLine::operator == (const ConstraintLine &a) const
+{
+  return line == a.line;
+}
+
+
+
+std::size_t
+ConstraintMatrix::ConstraintLine::memory_consumption () const
+{
+  return (MemoryConsumption::memory_consumption (line) +
+          MemoryConsumption::memory_consumption (entries) +
+          MemoryConsumption::memory_consumption (inhomogeneity));
+}
+
+
+
+void
+ConstraintMatrix::add_lines (const std::set<unsigned int> &lines)
+{
+  for (std::set<unsigned int>::const_iterator
+       i = lines.begin(); i != lines.end(); ++i)
+    add_line (*i);
+}
+
+
+
+void
+ConstraintMatrix::add_lines (const std::vector<bool> &lines)
+{
+  for (unsigned int i=0; i<lines.size(); ++i)
+    if (lines[i] == true)
+      add_line (i);
+}
+
+
+
+void
+ConstraintMatrix::add_lines (const IndexSet &lines)
+{
+  for (unsigned int i=0; i<lines.n_elements(); ++i)
+    add_line (lines.nth_index_in_set(i));
+}
+
+
+
+void
+ConstraintMatrix::add_entries
+(const unsigned int                                  line,
+ const std::vector<std::pair<unsigned int,double> > &col_val_pairs)
+{
+  Assert (sorted==false, ExcMatrixIsClosed());
+  Assert (is_constrained(line), ExcLineInexistant(line));
+
+  ConstraintLine *line_ptr = &lines[lines_cache[calculate_line_index(line)]];
+  Assert (line_ptr->line == line, ExcInternalError());
+
+  // if in debug mode, check whether an
+  // entry for this column already
+  // exists and if it is the same as
+  // the one entered at present
+  //
+  // in any case: skip this entry if
+  // an entry for this column already
+  // exists, since we don't want to
+  // enter it twice
+  for (std::vector<std::pair<unsigned int,double> >::const_iterator
+       col_val_pair = col_val_pairs.begin();
+       col_val_pair!=col_val_pairs.end(); ++col_val_pair)
+    {
+      Assert (line != col_val_pair->first,
+              ExcMessage ("Can't constrain a degree of freedom to itself"));
+
+      for (ConstraintLine::Entries::const_iterator
+           p=line_ptr->entries.begin();
+           p != line_ptr->entries.end(); ++p)
+        if (p->first == col_val_pair->first)
+          {
+            // entry exists, break
+            // innermost loop
+            Assert (p->second == col_val_pair->second,
+                    ExcEntryAlreadyExists(line, col_val_pair->first,
+                                          p->second, col_val_pair->second));
+            break;
+          }
+
+      line_ptr->entries.push_back (*col_val_pair);
+    }
+}
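+
+// A minimal usage sketch for the functions above (the dof indices are made
+// up for illustration; this constrains x_3 = 0.5*x_1 + 0.5*x_2):
+//
+//   ConstraintMatrix constraints;
+//   constraints.add_line (3);
+//   std::vector<std::pair<unsigned int,double> > col_val_pairs;
+//   col_val_pairs.push_back (std::make_pair (1U, 0.5));
+//   col_val_pairs.push_back (std::make_pair (2U, 0.5));
+//   constraints.add_entries (3, col_val_pairs);
+//   constraints.close ();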
+
+
+
+void ConstraintMatrix::add_selected_constraints
+(const ConstraintMatrix &constraints,
+ const IndexSet         &filter)
+{
+  if (constraints.n_constraints() == 0)
+    return;
+
+  Assert (filter.size() > constraints.lines.back().line,
+          ExcMessage ("Filter needs to be larger than constraint matrix size."));
+  for (std::vector<ConstraintLine>::const_iterator line=constraints.lines.begin();
+       line!=constraints.lines.end(); ++line)
+    if (filter.is_element(line->line))
+      {
+        const unsigned int row = filter.index_within_set (line->line);
+        add_line (row);
+        set_inhomogeneity (row, line->inhomogeneity);
+        for (unsigned int i=0; i<line->entries.size(); ++i)
+          if (filter.is_element(line->entries[i].first))
+            add_entry (row, filter.index_within_set (line->entries[i].first),
+                       line->entries[i].second);
+      }
+}
+
+
+
+void ConstraintMatrix::close ()
+{
+  if (sorted == true)
+    return;
+
+  // sort the lines
+  std::sort (lines.begin(), lines.end());
+
+  // update list of pointers and give the
+  // vector a sharp size since we won't
+  // modify the size any more after this
+  // point.
+  {
+    std::vector<unsigned int> new_lines (lines_cache.size(),
+                                         numbers::invalid_unsigned_int);
+    unsigned int counter = 0;
+    for (std::vector<ConstraintLine>::const_iterator line=lines.begin();
+         line!=lines.end(); ++line, ++counter)
+      new_lines[calculate_line_index(line->line)] = counter;
+    std::swap (lines_cache, new_lines);
+  }
+
+  // in debug mode: check whether we really
+  // set the pointers correctly.
+  for (unsigned int i=0; i<lines_cache.size(); ++i)
+    if (lines_cache[i] != numbers::invalid_unsigned_int)
+      Assert (i == calculate_line_index(lines[lines_cache[i]].line),
+              ExcInternalError());
+
+  // first, strip zero entries, as we
+  // have to do that only once
+  for (std::vector<ConstraintLine>::iterator line = lines.begin();
+       line!=lines.end(); ++line)
+    // first remove zero entries. a zero
+    // entry means that in the linear
+    // constraint for a node,
+    // x_i = a*x_1 + b*x_2 + ...,
+    // another node appears with weight 0.
+    // obviously, 0*something can be
+    // omitted
+    line->entries.erase (std::remove_if (line->entries.begin(),
+                                         line->entries.end(),
+                                         &check_zero_weight),
+                         line->entries.end());
+
+
+
+#ifdef DEBUG
+  // In debug mode we are computing an estimate for the maximum number
+  // of constraints so that we can bail out if there is a cycle in the
+  // constraints (which is easier than searching for cycles in the graph).
+  //
+  // Let us figure out the largest dof index. This is an upper bound for the
+  // number of constraints because it is an approximation of the number of
+  // dofs in our system.
+  unsigned int largest_idx = 0;
+  for (std::vector<ConstraintLine>::iterator line = lines.begin();
+       line!=lines.end(); ++line)
+    {
+      for (ConstraintLine::Entries::iterator it = line->entries.begin(); it!=line->entries.end(); ++it)
+        {
+          largest_idx=std::max(largest_idx, it->first);
+        }
+    }
+#endif
+
+  // replace references to dofs that
+  // are themselves constrained. note
+  // that because we may replace
+  // references to other dofs that
+  // may themselves be constrained to
+  // third ones, we have to iterate
+  // over all this until we replace
+  // no chains of constraints any
+  // more
+  //
+  // the iteration replaces
+  // references to constrained
+  // degrees of freedom by
+  // second-order references. for
+  // example if x3=x0/2+x2/2 and
+  // x2=x0/2+x1/2, then the new list
+  // will be x3=x0/2+x0/4+x1/4. note
+  // that x0 appears twice. we will
+  // throw this duplicate out in the
+  // following step, where we sort
+  // the list so that throwing out
+  // duplicates becomes much more
+  // efficient. also, we have to do
+  // it only once, rather than in
+  // each iteration
+  unsigned int iteration = 0;
+  while (true)
+    {
+      bool chained_constraint_replaced = false;
+
+      for (std::vector<ConstraintLine>::iterator line = lines.begin();
+           line!=lines.end(); ++line)
+        {
+#ifdef DEBUG
+          // we need to keep track of how many replacements we do in this line, because we can
+          // end up in a cycle A->B->C->A without the number of entries growing.
+          unsigned int n_replacements = 0;
+#endif
+
+
+
+          // loop over all entries of
+          // this line (including
+          // ones that we have
+          // appended in this go
+          // around) and see whether
+          // they are further
+          // constrained. ignore
+          // elements that we don't
+          // store on the current
+          // processor
+          unsigned int entry = 0;
+          while (entry < line->entries.size())
+            if (((local_lines.size() == 0)
+                 ||
+                 (local_lines.is_element(line->entries[entry].first)))
+                &&
+                is_constrained (line->entries[entry].first))
+              {
+                // ok, this entry is
+                // further
+                // constrained:
+                chained_constraint_replaced = true;
+
+                // look up the chain
+                // of constraints for
+                // this entry
+                const unsigned int dof_index = line->entries[entry].first;
+                const double       weight = line->entries[entry].second;
+
+                Assert (dof_index != line->line,
+                        ExcMessage ("Cycle in constraints detected!"));
+
+                const ConstraintLine *constrained_line =
+                  &lines[lines_cache[calculate_line_index(dof_index)]];
+                Assert (constrained_line->line == dof_index,
+                        ExcInternalError());
+
+                // now we have to
+                // replace an entry
+                // by its
+                // expansion. we do
+                // that by
+                // overwriting the
+                // entry by the first
+                // entry of the
+                // expansion and
+                // adding the
+                // remaining ones to
+                // the end, where we
+                // will later process
+                // them once more
+                //
+                // we can of course
+                // only do that if
+                // the DoF that we
+                // are currently
+                // handling is
+                // constrained by a
+                // linear combination
+                // of other dofs:
+                if (constrained_line->entries.size() > 0)
+                  {
+                    for (unsigned int i=0; i<constrained_line->entries.size(); ++i)
+                      Assert (dof_index != constrained_line->entries[i].first,
+                              ExcMessage ("Cycle in constraints detected!"));
+
+                    // replace first
+                    // entry, then tack
+                    // the rest to the
+                    // end of the list
+                    line->entries[entry] =
+                      std::make_pair (constrained_line->entries[0].first,
+                                      constrained_line->entries[0].second *
+                                      weight);
+
+                    for (unsigned int i=1; i<constrained_line->entries.size(); ++i)
+                      line->entries
+                      .push_back (std::make_pair (constrained_line->entries[i].first,
+                                                  constrained_line->entries[i].second *
+                                                  weight));
+
+#ifdef DEBUG
+                    // keep track of how many entries we replace in this line. If we do more than
+                    // there are constraints or dofs in our system, we must have a cycle.
+                    ++n_replacements;
+                    Assert(n_replacements/2<largest_idx, ExcMessage("Cycle in constraints detected!"));
+                    if (n_replacements/2>=largest_idx)
+                      return; // this enables us to test for this Exception.
+#endif
+                  }
+                else
+                  // the DoF that we
+                  // encountered is not
+                  // constrained by a linear
+                  // combination of other
+                  // dofs but is equal to
+                  // just the inhomogeneity
+                  // (i.e. its chain of
+                  // entries is empty). in
+                  // that case, we can't just
+                  // overwrite the current
+                  // entry, but we have to
+                  // actually eliminate it
+                  {
+                    line->entries.erase (line->entries.begin()+entry);
+                  }
+
+                line->inhomogeneity += constrained_line->inhomogeneity *
+                                       weight;
+
+                // now that we're here, do
+                // not increase index by
+                // one but rather make
+                // another pass for the
+                // present entry because we
+                // have replaced the
+                // present entry by another
+                // one, or because we have
+                // deleted it and shifted
+                // all following ones one
+                // forward
+              }
+            else
+              // entry not further
+              // constrained. just move
+              // ahead by one
+              ++entry;
+        }
+
+      // if we didn't do anything in
+      // this round, then quit the
+      // loop
+      if (chained_constraint_replaced == false)
+        break;
+
+      // increase iteration count. note
+      // that we should not iterate more
+      // times than there are constraints,
+      // since this puts a natural upper
+      // bound on the length of constraint
+      // chains
+      ++iteration;
+      Assert (iteration <= lines.size(), ExcInternalError());
+    }
+
+  // finally sort the entries and re-scale
+  // them if necessary. in this step, we also
+  // throw out duplicates as mentioned
+  // above. moreover, as some entries might
+  // have had zero weights, we replace them
+  // by a vector with sharp sizes.
+  for (std::vector<ConstraintLine>::iterator line = lines.begin();
+       line!=lines.end(); ++line)
+    {
+      std::sort (line->entries.begin(), line->entries.end());
+
+      // loop over the now sorted list and
+      // see whether any of the entries
+      // references the same dofs more than
+      // once in order to find how many
+      // non-duplicate entries we have. This
+      // lets us allocate the correct amount
+      // of memory for the constraint
+      // entries.
+      unsigned int duplicates = 0;
+      for (unsigned int i=1; i<line->entries.size(); ++i)
+        if (line->entries[i].first == line->entries[i-1].first)
+          duplicates++;
+
+      if (duplicates > 0 || line->entries.size() < line->entries.capacity())
+        {
+          ConstraintLine::Entries new_entries;
+
+          // if we have no duplicates, copy
+          // the entries verbatim. this
+          // way, the final size of the
+          // vector is correct.
+          if (duplicates == 0)
+            new_entries = line->entries;
+          else
+            {
+              // otherwise, we need to go
+              // through the list by hand and
+              // resolve the duplicates
+              new_entries.reserve (line->entries.size() - duplicates);
+              new_entries.push_back(line->entries[0]);
+              for (unsigned int j=1; j<line->entries.size(); ++j)
+                if (line->entries[j].first == line->entries[j-1].first)
+                  {
+                    Assert (new_entries.back().first == line->entries[j].first,
+                            ExcInternalError());
+                    new_entries.back().second += line->entries[j].second;
+                  }
+                else
+                  new_entries.push_back (line->entries[j]);
+
+              Assert (new_entries.size() == line->entries.size() - duplicates,
+                      ExcInternalError());
+
+              // make sure there are
+              // really no duplicates
+              // left and that the list
+              // is still sorted
+              for (unsigned int j=1; j<new_entries.size(); ++j)
+                {
+                  Assert (new_entries[j].first != new_entries[j-1].first,
+                          ExcInternalError());
+                  Assert (new_entries[j].first > new_entries[j-1].first,
+                          ExcInternalError());
+                }
+            }
+
+          // replace old list of
+          // constraints for this dof by
+          // the new one
+          line->entries.swap (new_entries);
+        }
+
+      // finally do the following
+      // check: if the sum of
+      // weights for the
+      // constraints is close to
+      // one, but not exactly
+      // one, then rescale all
+      // the weights so that they
+      // sum up to 1. this adds a
+      // little numerical
+      // stability and avoids all
+      // sorts of problems where
+      // the actual value is
+      // close to, but not quite
+      // what we expected
+      //
+      // the case where the
+      // weights don't quite sum
+      // up happens when we
+      // compute the
+      // interpolation weights
+      // "on the fly", i.e. not
+      // from precomputed
+      // tables. in this case,
+      // the interpolation
+      // weights are also subject
+      // to round-off
+      double sum = 0;
+      for (unsigned int i=0; i<line->entries.size(); ++i)
+        sum += line->entries[i].second;
+      if ((sum != 1.0) && (std::fabs (sum-1.) < 1.e-13))
+        {
+          for (unsigned int i=0; i<line->entries.size(); ++i)
+            line->entries[i].second /= sum;
+          line->inhomogeneity /= sum;
+        }
+    } // end of loop over all constraint lines
+
+#ifdef DEBUG
+  // if in debug mode: check that no dof is
+  // constrained to another dof that is also
+  // constrained. exclude dofs from this
+  // check whose constraint lines are not
+  // stored on the local processor
+  for (std::vector<ConstraintLine>::const_iterator line=lines.begin();
+       line!=lines.end(); ++line)
+    for (ConstraintLine::Entries::const_iterator
+         entry=line->entries.begin();
+         entry!=line->entries.end(); ++entry)
+      if ((local_lines.size() == 0)
+          ||
+          (local_lines.is_element(entry->first)))
+        {
+          // make sure that entry->first is
+          // not the index of a line itself
+          const bool is_circle = is_constrained(entry->first);
+          Assert (is_circle == false,
+                  ExcDoFConstrainedToConstrainedDoF(line->line, entry->first));
+        }
+#endif
+
+  sorted = true;
+}
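+
+// As a concrete sketch of the chain resolution performed in close(): given
+// x_2 = 0.5*x_0 + 0.5*x_1 and x_3 = 0.5*x_0 + 0.5*x_2, the loop above
+// first expands line 3 to x_3 = 0.5*x_0 + 0.25*x_0 + 0.25*x_1, and the
+// subsequent sort-and-merge step combines the duplicate x_0 entries to
+// yield x_3 = 0.75*x_0 + 0.25*x_1.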
+
+
+
+void
+ConstraintMatrix::merge (const ConstraintMatrix &other_constraints,
+                         const MergeConflictBehavior merge_conflict_behavior)
+{
+  AssertThrow(local_lines == other_constraints.local_lines,
+              ExcNotImplemented());
+
+  // store the previous state with
+  // respect to sorting
+  const bool object_was_sorted = sorted;
+  sorted = false;
+
+  if (other_constraints.lines_cache.size() > lines_cache.size())
+    lines_cache.resize(other_constraints.lines_cache.size(),
+                       numbers::invalid_unsigned_int);
+
+  // the first action is to fold into the
+  // present object any constraints from the
+  // second object. we don't strictly need to
+  // do this any more since the
+  // ConstraintMatrix has learned to deal
+  // with chains of constraints in the
+  // close() function, but we have
+  // traditionally done this and it's not
+  // overly hard to do.
+  //
+  // for this, loop over all
+  // constraints and replace the
+  // constraint lines with a new one
+  // where constraints are replaced
+  // if necessary.
+  ConstraintLine::Entries tmp;
+  for (std::vector<ConstraintLine>::iterator line=lines.begin();
+       line!=lines.end(); ++line)
+    {
+      tmp.clear ();
+      for (unsigned int i=0; i<line->entries.size(); ++i)
+        {
+          // if the present dof is not
+          // constrained, or if we won't take
+          // the constraint from the other
+          // object, then simply copy it over
+          if (other_constraints.is_constrained(line->entries[i].first) == false
+              ||
+              ((merge_conflict_behavior != right_object_wins)
+               &&
+               other_constraints.is_constrained(line->entries[i].first)
+               &&
+               this->is_constrained(line->entries[i].first)))
+            tmp.push_back(line->entries[i]);
+          else
+            // otherwise resolve
+            // further constraints by
+            // replacing the old
+            // entry by a sequence of
+            // new entries taken from
+            // the other object, but
+            // with multiplied
+            // weights
+            {
+              const ConstraintLine::Entries *other_line
+                = other_constraints.get_constraint_entries (line->entries[i].first);
+              Assert (other_line != 0,
+                      ExcInternalError());
+
+              const double weight = line->entries[i].second;
+
+              for (ConstraintLine::Entries::const_iterator j=other_line->begin();
+                   j!=other_line->end(); ++j)
+                tmp.push_back (std::pair<unsigned int,double>(j->first,
+                                                              j->second*weight));
+
+              line->inhomogeneity += other_constraints.get_inhomogeneity(line->entries[i].first) *
+                                     weight;
+            }
+        }
+      // finally exchange old and
+      // newly resolved line
+      line->entries.swap (tmp);
+    }
+
+
+
+  // next action: append those lines at the
+  // end that we want to add
+  for (std::vector<ConstraintLine>::const_iterator
+       line=other_constraints.lines.begin();
+       line!=other_constraints.lines.end(); ++line)
+    if (is_constrained(line->line) == false)
+      lines.push_back (*line);
+    else
+      {
+        // the constrained dof we want to
+        // copy from the other object is also
+        // constrained here. let's see what
+        // we should do with that
+        switch (merge_conflict_behavior)
+          {
+          case no_conflicts_allowed:
+            AssertThrow (false,
+                         ExcDoFIsConstrainedFromBothObjects (line->line));
+            break;
+
+          case left_object_wins:
+            // ignore this constraint
+            break;
+
+          case right_object_wins:
+            // we need to replace the
+            // existing constraint by
+            // the one from the other
+            // object
+            lines[lines_cache[calculate_line_index(line->line)]].entries
+              = line->entries;
+            lines[lines_cache[calculate_line_index(line->line)]].inhomogeneity
+              = line->inhomogeneity;
+            break;
+
+          default:
+            Assert (false, ExcNotImplemented());
+          }
+      }
+
+  // update the lines cache
+  unsigned int counter = 0;
+  for (std::vector<ConstraintLine>::const_iterator line=lines.begin();
+       line!=lines.end(); ++line, ++counter)
+    lines_cache[calculate_line_index(line->line)] = counter;
+
+  // if the object was sorted before,
+  // then make sure it is so
+  // afterward as well. otherwise
+  // leave everything in the unsorted
+  // state
+  if (object_was_sorted == true)
+    close ();
+}
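+
+// A usage sketch (the two objects are placeholders for illustration):
+//
+//   ConstraintMatrix cm_a, cm_b;
+//   // ... fill and close both objects ...
+//   cm_a.merge (cm_b, ConstraintMatrix::right_object_wins);
+//
+// so that for any dof constrained in both objects, the constraint stored
+// in cm_b replaces the one in cm_a.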
+
+
+
+void ConstraintMatrix::shift (const unsigned int offset)
+{
+  //TODO: this doesn't work with IndexSets yet. [TH]
+  AssertThrow(local_lines.size()==0, ExcNotImplemented());
+
+  lines_cache.insert (lines_cache.begin(), offset,
+                      numbers::invalid_unsigned_int);
+
+  for (std::vector<ConstraintLine>::iterator i = lines.begin();
+       i != lines.end(); ++i)
+    {
+      i->line += offset;
+      for (ConstraintLine::Entries::iterator
+           j = i->entries.begin();
+           j != i->entries.end(); ++j)
+        j->first += offset;
+    }
+}
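+
+// A sketch of the effect: after
+//
+//   constraints.shift (10);
+//
+// a previously stored constraint x_3 = 0.5*x_1 reads x_13 = 0.5*x_11.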
+
+
+
+void ConstraintMatrix::clear ()
+{
+  {
+    std::vector<ConstraintLine> tmp;
+    lines.swap (tmp);
+  }
+
+  {
+    std::vector<unsigned int> tmp;
+    lines_cache.swap (tmp);
+  }
+
+  sorted = false;
+}
+
+
+
+void ConstraintMatrix::reinit (const IndexSet &local_constraints)
+{
+  local_lines = local_constraints;
+  clear();
+}
+
+
+
+void ConstraintMatrix::condense (const SparsityPattern &uncondensed,
+                                 SparsityPattern       &condensed) const
+{
+  Assert (sorted == true, ExcMatrixNotClosed());
+  Assert (uncondensed.is_compressed() == true, ExcMatrixNotClosed());
+  Assert (uncondensed.n_rows() == uncondensed.n_cols(),
+          ExcNotQuadratic());
+
+
+  // store for each line of the matrix
+  // its new line number
+  // after compression. If the entry is
+  // -1, this line will be condensed away
+  std::vector<int> new_line;
+
+  new_line.reserve (uncondensed.n_rows());
+
+  std::vector<ConstraintLine>::const_iterator next_constraint = lines.begin();
+  unsigned int                                shift           = 0;
+  unsigned int n_rows = uncondensed.n_rows();
+
+  if (next_constraint == lines.end())
+    // if no constraint is to be handled
+    for (unsigned int row=0; row!=n_rows; ++row)
+      new_line.push_back (row);
+  else
+    for (unsigned int row=0; row!=n_rows; ++row)
+      if (row == next_constraint->line)
+        {
+          // this line is constrained
+          new_line.push_back (-1);
+          // note that @p{lines} is ordered
+          ++shift;
+          ++next_constraint;
+          if (next_constraint == lines.end())
+            // nothing more to do; finish rest
+            // of loop
+            {
+              for (unsigned int i=row+1; i<n_rows; ++i)
+                new_line.push_back (i-shift);
+              break;
+            };
+        }
+      else
+        new_line.push_back (row-shift);
+
+
+  next_constraint = lines.begin();
+  // note: in this loop we need not check
+  // whether @p{next_constraint} is a valid
+  // iterator, since @p{next_constraint} is
+  // only evaluated as often as there are
+  // entries in new_line[*] that tell us
+  // which constraints exist
+  for (unsigned int row=0; row<uncondensed.n_rows(); ++row)
+    if (new_line[row] != -1)
+      // line not constrained
+      // copy entries if column will not
+      // be condensed away, distribute
+      // otherwise
+      for (unsigned int j=uncondensed.get_rowstart_indices()[row];
+           j<uncondensed.get_rowstart_indices()[row+1]; ++j)
+        if (new_line[uncondensed.get_column_numbers()[j]] != -1)
+          condensed.add (new_line[row], new_line[uncondensed.get_column_numbers()[j]]);
+        else
+          {
+            // let c point to the constraint
+            // of this column
+            std::vector<ConstraintLine>::const_iterator c = lines.begin();
+            while (c->line != uncondensed.get_column_numbers()[j])
+              ++c;
+
+            for (unsigned int q=0; q!=c->entries.size(); ++q)
+              condensed.add (new_line[row], new_line[c->entries[q].first]);
+          }
+    else
+      // line must be distributed
+      {
+        for (unsigned int j=uncondensed.get_rowstart_indices()[row];
+             j<uncondensed.get_rowstart_indices()[row+1]; ++j)
+          // for each entry: distribute
+          if (new_line[uncondensed.get_column_numbers()[j]] != -1)
+            // column is not constrained
+            for (unsigned int q=0; q!=next_constraint->entries.size(); ++q)
+              condensed.add (new_line[next_constraint->entries[q].first],
+                             new_line[uncondensed.get_column_numbers()[j]]);
+
+          else
+            // not only this line but
+            // also this col is constrained
+            {
+              // let c point to the constraint
+              // of this column
+              std::vector<ConstraintLine>::const_iterator c = lines.begin();
+              while (c->line != uncondensed.get_column_numbers()[j]) ++c;
+
+              for (unsigned int p=0; p!=c->entries.size(); ++p)
+                for (unsigned int q=0; q!=next_constraint->entries.size(); ++q)
+                  condensed.add (new_line[next_constraint->entries[q].first],
+                                 new_line[c->entries[p].first]);
+            };
+
+        ++next_constraint;
+      };
+
+  condensed.compress();
+}
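+
+// The in-place variant below is the one typically used. A sketch of the
+// usual sequence (`dof_handler` and `constraints` are placeholders for
+// illustration):
+//
+//   SparsityPattern sparsity (dof_handler.n_dofs(),
+//                             dof_handler.n_dofs(),
+//                             dof_handler.max_couplings_between_dofs());
+//   DoFTools::make_sparsity_pattern (dof_handler, sparsity);
+//   constraints.condense (sparsity);   // must precede sparsity.compress()
+//   sparsity.compress ();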
+
+
+
+void ConstraintMatrix::condense (SparsityPattern &sparsity) const
+{
+  Assert (sorted == true, ExcMatrixNotClosed());
+  Assert (sparsity.is_compressed() == false, ExcMatrixIsClosed());
+  Assert (sparsity.n_rows() == sparsity.n_cols(),
+          ExcNotQuadratic());
+
+  // store for each index whether it must be
+  // distributed or not. If entry is
+  // numbers::invalid_unsigned_int,
+  // no distribution is necessary.
+  // otherwise, the number states which line
+  // in the constraint matrix handles this
+  // index
+  std::vector<unsigned int> distribute(sparsity.n_rows(),
+                                       numbers::invalid_unsigned_int);
+
+  for (unsigned int c=0; c<lines.size(); ++c)
+    distribute[lines[c].line] = c;
+
+  const unsigned int n_rows = sparsity.n_rows();
+  for (unsigned int row=0; row<n_rows; ++row)
+    {
+      if (distribute[row] == numbers::invalid_unsigned_int)
+        {
+          // regular line. loop over all
+          // valid cols. note that this
+          // changes the line we are
+          // presently working on: we add
+          // additional entries. these are
+          // put to the end of the
+          // row. however, as constrained
+          // nodes cannot be constrained to
+          // other constrained nodes, nothing
+          // will happen if we run into these
+          // added nodes, as they can't be
+          // distributed further. we might
+          // store the position of the last
+          // old entry and stop work there,
+          // but since operating on the newly
+          // added ones only takes two
+          // comparisons (column index valid,
+          // distribute[column] necessarily
+          // ==numbers::invalid_unsigned_int),
+          // it is cheaper to not do so and
+          // run right until the end of the
+          // line
+          for (SparsityPattern::iterator entry = sparsity.begin(row);
+               ((entry != sparsity.end(row)) &&
+                entry->is_valid_entry());
+               ++entry)
+            {
+              const unsigned int column = entry->column();
+
+              if (distribute[column] != numbers::invalid_unsigned_int)
+                {
+                  // distribute entry
+                  // at regular row
+                  // @p{row} and
+                  // irregular column
+                  // sparsity.colnums[j]
+                  for (unsigned int q=0;
+                       q!=lines[distribute[column]].entries.size();
+                       ++q)
+                    sparsity.add (row,
+                                  lines[distribute[column]].entries[q].first);
+                }
+            }
+        }
+      else
+        // row must be
+        // distributed. note that
+        // here the present row is
+        // not touched (unlike above)
+        {
+          for (SparsityPattern::iterator entry = sparsity.begin(row);
+               (entry != sparsity.end(row)) && entry->is_valid_entry(); ++entry)
+            {
+              const unsigned int column = entry->column();
+              if (distribute[column] == numbers::invalid_unsigned_int)
+                // distribute entry at irregular
+                // row @p{row} and regular column
+                // sparsity.colnums[j]
+                for (unsigned int q=0;
+                     q!=lines[distribute[row]].entries.size(); ++q)
+                  sparsity.add (lines[distribute[row]].entries[q].first,
+                                column);
+              else
+                // distribute entry at irregular
+                // row @p{row} and irregular column
+                // sparsity.get_column_numbers()[j]
+                for (unsigned int p=0; p!=lines[distribute[row]].entries.size(); ++p)
+                  for (unsigned int q=0;
+                       q!=lines[distribute[column]].entries.size(); ++q)
+                    sparsity.add (lines[distribute[row]].entries[p].first,
+                                  lines[distribute[column]].entries[q].first);
+            }
+        }
+    }
+
+  sparsity.compress();
+}
+
+
+
+void ConstraintMatrix::condense (CompressedSparsityPattern &sparsity) const
+{
+  Assert (sorted == true, ExcMatrixNotClosed());
+  Assert (sparsity.n_rows() == sparsity.n_cols(),
+          ExcNotQuadratic());
+
+  // store for each index whether it must be
+  // distributed or not. If entry is
+  // numbers::invalid_unsigned_int,
+  // no distribution is necessary.
+  // otherwise, the number states which line
+  // in the constraint matrix handles this
+  // index
+  std::vector<unsigned int> distribute(sparsity.n_rows(),
+                                       numbers::invalid_unsigned_int);
+
+  for (unsigned int c=0; c<lines.size(); ++c)
+    distribute[lines[c].line] = c;
+
+  const unsigned int n_rows = sparsity.n_rows();
+  for (unsigned int row=0; row<n_rows; ++row)
+    {
+      if (distribute[row] == numbers::invalid_unsigned_int)
+        // regular line. loop over
+        // cols. note that as we
+        // proceed to distribute
+        // cols, the loop may get
+        // longer
+        for (unsigned int j=0; j<sparsity.row_length(row); ++j)
+          {
+            const unsigned int column = sparsity.column_number(row,j);
+
+            if (distribute[column] != numbers::invalid_unsigned_int)
+              {
+                // distribute entry at regular row @p{row}
+                // and irregular column `column`. note that
+                // this changes the line we are presently
+                // working on: we add additional entries.
+                // if we add another entry at a column
+                // behind the present one, we will
+                // encounter it later on (but since it
+                // can't be further constrained, we won't
+                // have to do anything about it). if we add
+                // it in front of the present column, we
+                // will find the present column later on
+                // again as it was shifted back (again
+                // nothing happens, in particular no
+                // endless loop, as when we encounter it
+                // the second time we won't be able to add
+                // more entries since they all already
+                // exist, but we do the same work more
+                // often than necessary, and the loop gets
+                // longer), so move the cursor one to the
+                // right in the case that we add an entry
+                // in front of the present column that did
+                // not exist before. check whether it
+                // existed before by tracking the length of
+                // this row
+                unsigned int old_rowlength = sparsity.row_length(row);
+                for (unsigned int q=0;
+                     q!=lines[distribute[column]].entries.size();
+                     ++q)
+                  {
+                    const unsigned int
+                    new_col = lines[distribute[column]].entries[q].first;
+
+                    sparsity.add (row, new_col);
+
+                    const unsigned int new_rowlength = sparsity.row_length(row);
+                    if ((new_col < column) && (old_rowlength != new_rowlength))
+                      ++j;
+                    old_rowlength = new_rowlength;
+                  };
+              };
+          }
+      else
+        // row must be distributed
+        for (unsigned int j=0; j<sparsity.row_length(row); ++j)
+          {
+            const unsigned int column = sparsity.column_number(row,j);
+
+            if (distribute[column] == numbers::invalid_unsigned_int)
+              // distribute entry at irregular
+              // row @p{row} and regular column
+              // sparsity.colnums[j]
+              for (unsigned int q=0;
+                   q!=lines[distribute[row]].entries.size(); ++q)
+                sparsity.add (lines[distribute[row]].entries[q].first,
+                              column);
+            else
+              // distribute entry at irregular
+              // row @p{row} and irregular column
+              // sparsity.get_column_numbers()[j]
+              for (unsigned int p=0; p!=lines[distribute[row]].entries.size(); ++p)
+                for (unsigned int q=0;
+                     q!=lines[distribute[sparsity.column_number(row,j)]]
+                     .entries.size(); ++q)
+                  sparsity.add (lines[distribute[row]].entries[p].first,
+                                lines[distribute[sparsity.column_number(row,j)]]
+                                .entries[q].first);
+          };
+    };
+}
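+
+// The cursor adjustment above is easy to get wrong; the following
+// stand-alone sketch (a hypothetical helper, not part of this patch)
+// shows the same rule on a plain sorted vector of column indices:
+#if 0
+#include <algorithm>
+#include <vector>
+
+// Insert new_col into the sorted row. If a genuinely new entry lands in
+// front of the cursor position j, the element the cursor pointed to has
+// shifted one slot to the right, so advance j -- exactly the ++j above,
+// detected there via the change in row_length().
+void add_and_fix_cursor (std::vector<unsigned int> &row,
+                         const unsigned int         new_col,
+                         unsigned int              &j)
+{
+  const unsigned int column     = row[j];   // value under the cursor
+  const std::size_t  old_length = row.size();
+  std::vector<unsigned int>::iterator p
+    = std::lower_bound (row.begin(), row.end(), new_col);
+  if (p == row.end() || *p != new_col)
+    row.insert (p, new_col);
+  if ((row.size() != old_length) && (new_col < column))
+    ++j;
+}
+#endif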
+
+
+
+void ConstraintMatrix::condense (CompressedSetSparsityPattern &sparsity) const
+{
+  Assert (sorted == true, ExcMatrixNotClosed());
+  Assert (sparsity.n_rows() == sparsity.n_cols(),
+          ExcNotQuadratic());
+
+  // store for each index whether it must be
+  // distributed or not. If entry is
+  // numbers::invalid_unsigned_int,
+  // no distribution is necessary.
+  // otherwise, the number states which line
+  // in the constraint matrix handles this
+  // index
+  std::vector<unsigned int> distribute(sparsity.n_rows(),
+                                       numbers::invalid_unsigned_int);
+
+  for (unsigned int c=0; c<lines.size(); ++c)
+    distribute[lines[c].line] = c;
+
+  const unsigned int n_rows = sparsity.n_rows();
+  for (unsigned int row=0; row<n_rows; ++row)
+    {
+      if (distribute[row] == numbers::invalid_unsigned_int)
+        {
+          // regular line. loop over
+          // cols. note that as we proceed to
+          // distribute cols, the loop may
+          // get longer
+          CompressedSetSparsityPattern::row_iterator col_num = sparsity.row_begin (row);
+
+          for (; col_num != sparsity.row_end (row); ++col_num)
+            {
+              const unsigned int column = *col_num;
+
+              if (distribute[column] != numbers::invalid_unsigned_int)
+                {
+                  // distribute entry at regular row @p{row}
+                  // and irregular column 'column'
+                  for (unsigned int q=0;
+                       q!=lines[distribute[column]].entries.size();
+                       ++q)
+                    {
+                      const unsigned int
+                      new_col = lines[distribute[column]].entries[q].first;
+
+                      sparsity.add (row, new_col);
+                    }
+                }
+            }
+        }
+      else
+        // row must be distributed
+        {
+          CompressedSetSparsityPattern::row_iterator col_num = sparsity.row_begin (row);
+
+          for (; col_num != sparsity.row_end (row); ++col_num)
+            {
+              const unsigned int column = *col_num;
+
+              if (distribute[column] == numbers::invalid_unsigned_int)
+                // distribute entry at irregular
+                // row @p{row} and regular column
+                // 'column'
+                for (unsigned int q=0;
+                     q!=lines[distribute[row]].entries.size(); ++q)
+                  sparsity.add (lines[distribute[row]].entries[q].first,
+                                column);
+              else
+                // distribute entry at irregular
+                // row @p{row} and irregular column
+                // 'column'
+                for (unsigned int p=0; p!=lines[distribute[row]].entries.size(); ++p)
+                  for (unsigned int q=0;
+                       q!=lines[distribute[column]]
+                       .entries.size(); ++q)
+                    sparsity.add (lines[distribute[row]].entries[p].first,
+                                  lines[distribute[column]]
+                                  .entries[q].first);
+            };
+        }
+    };
+}
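+
+// For the set-based pattern no cursor bookkeeping is needed: inserting
+// into a std::set never invalidates iterators. A tiny sketch (purely
+// illustrative, not part of this patch) of why the loops above may
+// safely grow the row they iterate over:
+#if 0
+#include <set>
+
+void grow_while_iterating ()
+{
+  std::set<unsigned int> row;
+  row.insert (1);
+  row.insert (5);
+  for (std::set<unsigned int>::iterator it = row.begin();
+       it != row.end(); ++it)
+    if (*it == 1)
+      row.insert (3);   // iterators stay valid; 3 is visited later
+}
+#endif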
+
+
+
+void ConstraintMatrix::condense (CompressedSimpleSparsityPattern &sparsity) const
+{
+  Assert (sorted == true, ExcMatrixNotClosed());
+  Assert (sparsity.n_rows() == sparsity.n_cols(),
+          ExcNotQuadratic());
+
+  // store for each index whether it must be
+  // distributed or not. If entry is
+  // numbers::invalid_unsigned_int,
+  // no distribution is necessary.
+  // otherwise, the number states which line
+  // in the constraint matrix handles this
+  // index
+  std::vector<unsigned int> distribute(sparsity.n_rows(),
+                                       numbers::invalid_unsigned_int);
+
+  for (unsigned int c=0; c<lines.size(); ++c)
+    distribute[lines[c].line] = c;
+
+  const unsigned int n_rows = sparsity.n_rows();
+  for (unsigned int row=0; row<n_rows; ++row)
+    {
+      if (distribute[row] == numbers::invalid_unsigned_int)
+        // regular line. loop over cols. note that as we proceed to
+        // distribute cols, the loop may get longer
+        for (unsigned int j=0; j<sparsity.row_length(row); ++j)
+          {
+            const unsigned int column = sparsity.column_number(row,j);
+
+            if (distribute[column] != numbers::invalid_unsigned_int)
+              {
+                // distribute entry at regular row @p{row} and
+                // irregular column 'column'. note that this changes
+                // the line we are presently working on: we add
+                // additional entries. if we add another entry at a
+                // column behind the present one, we will encounter it
+                // later on (but since it can't be further constrained,
+                // won't have to do anything about it). if we add it up
+                // front of the present column, we will find the
+                // present column later on again as it was shifted back
+                // (again nothing happens, in particular no endless
+                // loop, as when we encounter it the second time we
+                // won't be able to add more entries as they all
+                // already exist, but we do the same work more often
+                // than necessary, and the loop gets longer), so move
+                // the cursor one to the right in the case that we add
+                // an entry up front that did not exist before. check
+                // whether it existed before by tracking the length of
+                // this row
+                unsigned int old_rowlength = sparsity.row_length(row);
+                for (unsigned int q=0;
+                     q!=lines[distribute[column]].entries.size();
+                     ++q)
+                  {
+                    const unsigned int
+                    new_col = lines[distribute[column]].entries[q].first;
+
+                    sparsity.add (row, new_col);
+
+                    const unsigned int new_rowlength = sparsity.row_length(row);
+                    if ((new_col < column) && (old_rowlength != new_rowlength))
+                      ++j;
+                    old_rowlength = new_rowlength;
+                  };
+              };
+          }
+      else
+        // row must be distributed
+        for (unsigned int j=0; j<sparsity.row_length(row); ++j)
+          {
+            const unsigned int column = sparsity.column_number(row,j);
+
+            if (distribute[column] == numbers::invalid_unsigned_int)
+              // distribute entry at irregular
+              // row @p{row} and regular column
+              // sparsity.colnums[j]
+              for (unsigned int q=0;
+                   q!=lines[distribute[row]].entries.size(); ++q)
+                sparsity.add (lines[distribute[row]].entries[q].first,
+                              column);
+            else
+              // distribute entry at irregular
+              // row @p{row} and irregular column
+              // sparsity.get_column_numbers()[j]
+              for (unsigned int p=0; p!=lines[distribute[row]].entries.size(); ++p)
+                for (unsigned int q=0;
+                     q!=lines[distribute[sparsity.column_number(row,j)]]
+                     .entries.size(); ++q)
+                  sparsity.add (lines[distribute[row]].entries[p].first,
+                                lines[distribute[sparsity.column_number(row,j)]]
+                                .entries[q].first);
+          };
+    };
+}
+
+
+
+void ConstraintMatrix::condense (BlockSparsityPattern &sparsity) const
+{
+  Assert (sorted == true, ExcMatrixNotClosed());
+  Assert (sparsity.is_compressed() == false, ExcMatrixIsClosed());
+  Assert (sparsity.n_rows() == sparsity.n_cols(),
+          ExcNotQuadratic());
+  Assert (sparsity.n_block_rows() == sparsity.n_block_cols(),
+          ExcNotQuadratic());
+  Assert (sparsity.get_column_indices() == sparsity.get_row_indices(),
+          ExcNotQuadratic());
+
+  const BlockIndices &
+  index_mapping = sparsity.get_column_indices();
+
+  const unsigned int n_blocks = sparsity.n_block_rows();
+
+  // store for each index whether it must be
+  // distributed or not. If entry is
+  // numbers::invalid_unsigned_int,
+  // no distribution is necessary.
+  // otherwise, the number states which line
+  // in the constraint matrix handles this
+  // index
+  std::vector<unsigned int> distribute (sparsity.n_rows(),
+                                        numbers::invalid_unsigned_int);
+
+  for (unsigned int c=0; c<lines.size(); ++c)
+    distribute[lines[c].line] = c;
+
+  const unsigned int n_rows = sparsity.n_rows();
+  for (unsigned int row=0; row<n_rows; ++row)
+    {
+      // get index of this row
+      // within the blocks
+      const std::pair<unsigned int,unsigned int>
+      block_index = index_mapping.global_to_local(row);
+      const unsigned int block_row = block_index.first;
+
+      if (distribute[row] == numbers::invalid_unsigned_int)
+        // regular line. loop over
+        // all columns and see
+        // whether this column must
+        // be distributed
+        {
+
+          // to loop over all entries
+          // in this row, we have to
+          // loop over all blocks in
+          // this blockrow and the
+          // corresponding row
+          // therein
+          for (unsigned int block_col=0; block_col<n_blocks; ++block_col)
+            {
+              const SparsityPattern &
+              block_sparsity = sparsity.block(block_row, block_col);
+
+              for (SparsityPattern::const_iterator
+                   entry = block_sparsity.begin(block_index.second);
+                   (entry != block_sparsity.end(block_index.second)) &&
+                   entry->is_valid_entry();
+                   ++entry)
+                {
+                  const unsigned int global_col
+                    = index_mapping.local_to_global(block_col, entry->column());
+
+                  if (distribute[global_col] != numbers::invalid_unsigned_int)
+                    // distribute entry at regular
+                    // row @p{row} and irregular column
+                    // global_col
+                    {
+                      for (unsigned int q=0;
+                           q!=lines[distribute[global_col]].entries.size(); ++q)
+                        sparsity.add (row,
+                                      lines[distribute[global_col]].entries[q].first);
+                    }
+                }
+            }
+        }
+      else
+        {
+          // row must be
+          // distributed. split the
+          // whole row into the
+          // chunks defined by the
+          // blocks
+          for (unsigned int block_col=0; block_col<n_blocks; ++block_col)
+            {
+              const SparsityPattern &
+              block_sparsity = sparsity.block(block_row,block_col);
+
+              for (SparsityPattern::const_iterator
+                   entry = block_sparsity.begin(block_index.second);
+                   (entry != block_sparsity.end(block_index.second)) &&
+                   entry->is_valid_entry();
+                   ++entry)
+                {
+                  const unsigned int global_col
+                    = index_mapping.local_to_global (block_col, entry->column());
+
+                  if (distribute[global_col] == numbers::invalid_unsigned_int)
+                    // distribute entry at irregular
+                    // row @p{row} and regular column
+                    // global_col.
+                    {
+                      for (unsigned int q=0; q!=lines[distribute[row]].entries.size(); ++q)
+                        sparsity.add (lines[distribute[row]].entries[q].first, global_col);
+                    }
+                  else
+                    // distribute entry at irregular
+                    // row @p{row} and irregular column
+                    // @p{global_col}
+                    {
+                      for (unsigned int p=0; p!=lines[distribute[row]].entries.size(); ++p)
+                        for (unsigned int q=0; q!=lines[distribute[global_col]].entries.size(); ++q)
+                          sparsity.add (lines[distribute[row]].entries[p].first,
+                                        lines[distribute[global_col]].entries[q].first);
+                    }
+                }
+            }
+        }
+    }
+
+  sparsity.compress();
+}
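+
+// A minimal analogue (illustrative only, with hypothetical names) of
+// the BlockIndices mapping used above, assuming the blocks partition
+// [0,N) contiguously: 'start' holds the first global index of each
+// block plus a final sentinel equal to N.
+#if 0
+#include <utility>
+#include <vector>
+
+std::pair<unsigned int,unsigned int>
+global_to_local (const std::vector<unsigned int> &start,
+                 const unsigned int               global)
+{
+  unsigned int b = 0;
+  while (global >= start[b+1])
+    ++b;
+  return std::make_pair (b, global - start[b]);
+}
+
+unsigned int
+local_to_global (const std::vector<unsigned int> &start,
+                 const unsigned int               block,
+                 const unsigned int               local)
+{
+  return start[block] + local;
+}
+#endif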
+
+
+
+void ConstraintMatrix::condense (BlockCompressedSparsityPattern &sparsity) const
+{
+  Assert (sorted == true, ExcMatrixNotClosed());
+  Assert (sparsity.n_rows() == sparsity.n_cols(),
+          ExcNotQuadratic());
+  Assert (sparsity.n_block_rows() == sparsity.n_block_cols(),
+          ExcNotQuadratic());
+  Assert (sparsity.get_column_indices() == sparsity.get_row_indices(),
+          ExcNotQuadratic());
+
+  const BlockIndices &
+  index_mapping = sparsity.get_column_indices();
+
+  const unsigned int n_blocks = sparsity.n_block_rows();
+
+  // store for each index whether it must be
+  // distributed or not. If entry is
+  // numbers::invalid_unsigned_int,
+  // no distribution is necessary.
+  // otherwise, the number states which line
+  // in the constraint matrix handles this
+  // index
+  std::vector<unsigned int> distribute (sparsity.n_rows(),
+                                        numbers::invalid_unsigned_int);
+
+  for (unsigned int c=0; c<lines.size(); ++c)
+    distribute[lines[c].line] = c;
+
+  const unsigned int n_rows = sparsity.n_rows();
+  for (unsigned int row=0; row<n_rows; ++row)
+    {
+      // get index of this row
+      // within the blocks
+      const std::pair<unsigned int,unsigned int>
+      block_index = index_mapping.global_to_local(row);
+      const unsigned int block_row = block_index.first;
+      const unsigned int local_row = block_index.second;
+
+      if (distribute[row] == numbers::invalid_unsigned_int)
+        // regular line. loop over all columns and see whether this
+        // column must be distributed. note that as we proceed to
+        // distribute cols, the loop over cols may get longer.
+        //
+        // don't try to be clever here as in the algorithm for the
+        // CompressedSparsityPattern, as that would be much more
+        // complicated here. after all, we know that compressed
+        // patterns are inefficient...
+        {
+
+          // to loop over all entries
+          // in this row, we have to
+          // loop over all blocks in
+          // this blockrow and the
+          // corresponding row
+          // therein
+          for (unsigned int block_col=0; block_col<n_blocks; ++block_col)
+            {
+              const CompressedSparsityPattern &
+              block_sparsity = sparsity.block(block_row, block_col);
+
+              for (unsigned int j=0; j<block_sparsity.row_length(local_row); ++j)
+                {
+                  const unsigned int global_col
+                    = index_mapping.local_to_global(block_col,
+                                                    block_sparsity.column_number(local_row,j));
+
+                  if (distribute[global_col] != numbers::invalid_unsigned_int)
+                    // distribute entry at regular
+                    // row @p{row} and irregular column
+                    // global_col
+                    {
+                      for (unsigned int q=0;
+                           q!=lines[distribute[global_col]]
+                           .entries.size(); ++q)
+                        sparsity.add (row,
+                                      lines[distribute[global_col]].entries[q].first);
+                    };
+                };
+            };
+        }
+      else
+        {
+          // row must be
+          // distributed. split the
+          // whole row into the
+          // chunks defined by the
+          // blocks
+          for (unsigned int block_col=0; block_col<n_blocks; ++block_col)
+            {
+              const CompressedSparsityPattern &
+              block_sparsity = sparsity.block(block_row,block_col);
+
+              for (unsigned int j=0; j<block_sparsity.row_length(local_row); ++j)
+                {
+                  const unsigned int global_col
+                    = index_mapping.local_to_global (block_col,
+                                                     block_sparsity.column_number(local_row,j));
+
+                  if (distribute[global_col] == numbers::invalid_unsigned_int)
+                    // distribute entry at irregular
+                    // row @p{row} and regular column
+                    // global_col.
+                    {
+                      for (unsigned int q=0; q!=lines[distribute[row]].entries.size(); ++q)
+                        sparsity.add (lines[distribute[row]].entries[q].first,
+                                      global_col);
+                    }
+                  else
+                    // distribute entry at irregular
+                    // row @p{row} and irregular column
+                    // @p{global_col}
+                    {
+                      for (unsigned int p=0; p!=lines[distribute[row]].entries.size(); ++p)
+                        for (unsigned int q=0; q!=lines[distribute[global_col]].entries.size(); ++q)
+                          sparsity.add (lines[distribute[row]].entries[p].first,
+                                        lines[distribute[global_col]].entries[q].first);
+                    };
+                };
+            };
+        };
+    };
+}
+
+
+
+void ConstraintMatrix::condense (BlockCompressedSetSparsityPattern &sparsity) const
+{
+  Assert (sorted == true, ExcMatrixNotClosed());
+  Assert (sparsity.n_rows() == sparsity.n_cols(),
+          ExcNotQuadratic());
+  Assert (sparsity.n_block_rows() == sparsity.n_block_cols(),
+          ExcNotQuadratic());
+  Assert (sparsity.get_column_indices() == sparsity.get_row_indices(),
+          ExcNotQuadratic());
+
+  const BlockIndices &
+  index_mapping = sparsity.get_column_indices();
+
+  const unsigned int n_blocks = sparsity.n_block_rows();
+
+  // store for each index whether it must be
+  // distributed or not. If entry is
+  // numbers::invalid_unsigned_int,
+  // no distribution is necessary.
+  // otherwise, the number states which line
+  // in the constraint matrix handles this
+  // index
+  std::vector<unsigned int> distribute (sparsity.n_rows(),
+                                        numbers::invalid_unsigned_int);
+
+  for (unsigned int c=0; c<lines.size(); ++c)
+    distribute[lines[c].line] = c;
+
+  const unsigned int n_rows = sparsity.n_rows();
+  for (unsigned int row=0; row<n_rows; ++row)
+    {
+      // get index of this row
+      // within the blocks
+      const std::pair<unsigned int,unsigned int>
+      block_index = index_mapping.global_to_local(row);
+      const unsigned int block_row = block_index.first;
+      const unsigned int local_row = block_index.second;
+
+      if (distribute[row] == numbers::invalid_unsigned_int)
+        // regular line. loop over all columns and see whether this
+        // column must be distributed. note that as we proceed to
+        // distribute cols, the loop over cols may get longer.
+        //
+        // don't try to be clever here as in the algorithm for the
+        // CompressedSparsityPattern, as that would be much more
+        // complicated here. after all, we know that compressed
+        // patterns are inefficient...
+        {
+
+          // to loop over all entries
+          // in this row, we have to
+          // loop over all blocks in
+          // this blockrow and the
+          // corresponding row
+          // therein
+          for (unsigned int block_col=0; block_col<n_blocks; ++block_col)
+            {
+              const CompressedSetSparsityPattern &
+              block_sparsity = sparsity.block(block_row, block_col);
+
+              for (CompressedSetSparsityPattern::row_iterator
+                   j = block_sparsity.row_begin(local_row);
+                   j != block_sparsity.row_end(local_row); ++j)
+                {
+                  const unsigned int global_col
+                    = index_mapping.local_to_global(block_col, *j);
+
+                  if (distribute[global_col] != numbers::invalid_unsigned_int)
+                    // distribute entry at regular
+                    // row @p{row} and irregular column
+                    // global_col
+                    {
+                      for (unsigned int q=0;
+                           q!=lines[distribute[global_col]]
+                           .entries.size(); ++q)
+                        sparsity.add (row,
+                                      lines[distribute[global_col]].entries[q].first);
+                    };
+                };
+            };
+        }
+      else
+        {
+          // row must be
+          // distributed. split the
+          // whole row into the
+          // chunks defined by the
+          // blocks
+          for (unsigned int block_col=0; block_col<n_blocks; ++block_col)
+            {
+              const CompressedSetSparsityPattern &
+              block_sparsity = sparsity.block(block_row,block_col);
+
+              for (CompressedSetSparsityPattern::row_iterator
+                   j = block_sparsity.row_begin(local_row);
+                   j != block_sparsity.row_end(local_row); ++j)
+                {
+                  const unsigned int global_col
+                    = index_mapping.local_to_global (block_col, *j);
+
+                  if (distribute[global_col] == numbers::invalid_unsigned_int)
+                    // distribute entry at irregular
+                    // row @p{row} and regular column
+                    // global_col.
+                    {
+                      for (unsigned int q=0; q!=lines[distribute[row]].entries.size(); ++q)
+                        sparsity.add (lines[distribute[row]].entries[q].first,
+                                      global_col);
+                    }
+                  else
+                    // distribute entry at irregular
+                    // row @p{row} and irregular column
+                    // @p{global_col}
+                    {
+                      for (unsigned int p=0; p!=lines[distribute[row]].entries.size(); ++p)
+                        for (unsigned int q=0; q!=lines[distribute[global_col]].entries.size(); ++q)
+                          sparsity.add (lines[distribute[row]].entries[p].first,
+                                        lines[distribute[global_col]].entries[q].first);
+                    };
+                };
+            };
+        };
+    };
+}
+
+
+
+void ConstraintMatrix::condense (BlockCompressedSimpleSparsityPattern &sparsity) const
+{
+  Assert (sorted == true, ExcMatrixNotClosed());
+  Assert (sparsity.n_rows() == sparsity.n_cols(),
+          ExcNotQuadratic());
+  Assert (sparsity.n_block_rows() == sparsity.n_block_cols(),
+          ExcNotQuadratic());
+  Assert (sparsity.get_column_indices() == sparsity.get_row_indices(),
+          ExcNotQuadratic());
+
+  const BlockIndices &
+  index_mapping = sparsity.get_column_indices();
+
+  const unsigned int n_blocks = sparsity.n_block_rows();
+
+  // store for each index whether it must be
+  // distributed or not. If entry is
+  // numbers::invalid_unsigned_int,
+  // no distribution is necessary.
+  // otherwise, the number states which line
+  // in the constraint matrix handles this
+  // index
+  std::vector<unsigned int> distribute (sparsity.n_rows(),
+                                        numbers::invalid_unsigned_int);
+
+  for (unsigned int c=0; c<lines.size(); ++c)
+    distribute[lines[c].line] = c;
+
+  const unsigned int n_rows = sparsity.n_rows();
+  for (unsigned int row=0; row<n_rows; ++row)
+    {
+      // get index of this row
+      // within the blocks
+      const std::pair<unsigned int,unsigned int>
+      block_index = index_mapping.global_to_local(row);
+      const unsigned int block_row = block_index.first;
+      const unsigned int local_row = block_index.second;
+
+      if (distribute[row] == numbers::invalid_unsigned_int)
+        // regular line. loop over all columns and see whether this
+        // column must be distributed. note that as we proceed to
+        // distribute cols, the loop over cols may get longer.
+        //
+        // don't try to be clever here as in the algorithm for the
+        // CompressedSparsityPattern, as that would be much more
+        // complicated here. after all, we know that compressed
+        // patterns are inefficient...
+        {
+
+          // to loop over all entries
+          // in this row, we have to
+          // loop over all blocks in
+          // this blockrow and the
+          // corresponding row
+          // therein
+          for (unsigned int block_col=0; block_col<n_blocks; ++block_col)
+            {
+              const CompressedSimpleSparsityPattern &
+              block_sparsity = sparsity.block(block_row, block_col);
+
+              for (unsigned int j=0; j<block_sparsity.row_length(local_row); ++j)
+                {
+                  const unsigned int global_col
+                    = index_mapping.local_to_global(block_col,
+                                                    block_sparsity.column_number(local_row,j));
+
+                  if (distribute[global_col] != numbers::invalid_unsigned_int)
+                    // distribute entry at regular
+                    // row @p{row} and irregular column
+                    // global_col
+                    {
+                      for (unsigned int q=0;
+                           q!=lines[distribute[global_col]]
+                           .entries.size(); ++q)
+                        sparsity.add (row,
+                                      lines[distribute[global_col]].entries[q].first);
+                    };
+                };
+            };
+        }
+      else
+        {
+          // row must be
+          // distributed. split the
+          // whole row into the
+          // chunks defined by the
+          // blocks
+          for (unsigned int block_col=0; block_col<n_blocks; ++block_col)
+            {
+              const CompressedSimpleSparsityPattern &
+              block_sparsity = sparsity.block(block_row,block_col);
+
+              for (unsigned int j=0; j<block_sparsity.row_length(local_row); ++j)
+                {
+                  const unsigned int global_col
+                    = index_mapping.local_to_global (block_col,
+                                                     block_sparsity.column_number(local_row,j));
+
+                  if (distribute[global_col] == numbers::invalid_unsigned_int)
+                    // distribute entry at irregular
+                    // row @p{row} and regular column
+                    // global_col.
+                    {
+                      for (unsigned int q=0; q!=lines[distribute[row]].entries.size(); ++q)
+                        sparsity.add (lines[distribute[row]].entries[q].first,
+                                      global_col);
+                    }
+                  else
+                    // distribute entry at irregular
+                    // row @p{row} and irregular column
+                    // @p{global_col}
+                    {
+                      for (unsigned int p=0; p!=lines[distribute[row]].entries.size(); ++p)
+                        for (unsigned int q=0; q!=lines[distribute[global_col]].entries.size(); ++q)
+                          sparsity.add (lines[distribute[row]].entries[p].first,
+                                        lines[distribute[global_col]].entries[q].first);
+                    };
+                };
+            };
+        };
+    };
+}
+
+
+
+#ifdef DEAL_II_USE_TRILINOS
+
+// this is a specialization for a
+// parallel (non-block) Trilinos
+// vector. The basic idea is to just work
+// on the local range of the vector. But
+// we need access to values that the
+// local nodes are constrained to.
+
+template<>
+void
+ConstraintMatrix::distribute (TrilinosWrappers::MPI::Vector &vec) const
+{
+  Assert (sorted==true, ExcMatrixIsClosed());
+
+  //TODO: not implemented yet, we need to fix
+  //LocalRange() first to only include
+  //"owned" indices. For this we need to keep
+  //track of the owned indices, because
+  //Trilinos doesn't. Use same constructor
+  //interface as in PETSc with two IndexSets!
+  AssertThrow (vec.vector_partitioner().IsOneToOne(),
+               ExcMessage ("Distribute does not work on vectors with overlapping parallel partitioning."));
+
+  typedef std::vector<ConstraintLine>::const_iterator constraint_iterator;
+  ConstraintLine index_comparison;
+  index_comparison.line = vec.local_range().first;
+  const constraint_iterator begin_my_constraints =
+    Utilities::lower_bound (lines.begin(),lines.end(),index_comparison);
+
+  index_comparison.line = vec.local_range().second;
+  const constraint_iterator end_my_constraints
+    = Utilities::lower_bound(lines.begin(),lines.end(),index_comparison);
+
+  // Here we search all the indices that we
+  // need to have read-access to - the
+  // local nodes and all the nodes that the
+  // constraints indicate.
+  IndexSet my_indices (vec.size());
+  {
+    const std::pair<unsigned int, unsigned int>
+    local_range = vec.local_range();
+
+    my_indices.add_range (local_range.first, local_range.second);
+
+    std::set<unsigned int> individual_indices;
+    for (constraint_iterator it = begin_my_constraints;
+         it != end_my_constraints; ++it)
+      for (unsigned int i=0; i<it->entries.size(); ++i)
+        if ((it->entries[i].first < local_range.first)
+            ||
+            (it->entries[i].first >= local_range.second))
+          individual_indices.insert (it->entries[i].first);
+
+    my_indices.add_indices (individual_indices.begin(),
+                            individual_indices.end());
+  }
+
+#ifdef DEAL_II_COMPILER_SUPPORTS_MPI
+  const Epetra_MpiComm *mpi_comm
+    = dynamic_cast<const Epetra_MpiComm *>(&vec.trilinos_vector().Comm());
+
+  Assert (mpi_comm != 0, ExcInternalError());
+
+  TrilinosWrappers::MPI::Vector vec_distribute
+  (my_indices.make_trilinos_map (mpi_comm->Comm(), true));
+#else
+  TrilinosWrappers::MPI::Vector vec_distribute
+  (my_indices.make_trilinos_map (MPI_COMM_WORLD, true));
+#endif
+
+  // here we import the data
+  vec_distribute.reinit(vec,false,true);
+
+  for (constraint_iterator it = begin_my_constraints;
+       it != end_my_constraints; ++it)
+    {
+      // fill entry in line
+      // next_constraint.line by adding the
+      // different contributions
+      double new_value = it->inhomogeneity;
+      for (unsigned int i=0; i<it->entries.size(); ++i)
+        new_value += (vec_distribute(it->entries[i].first) *
+                      it->entries[i].second);
+      vec(it->line) = new_value;
+    }
+
+  // some processes might not apply
+  // constraints, so we need to explicitly
+  // state, that the others are doing an
+  // insert here:
+  vec.compress (::dealii::VectorOperation::insert);
+}
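+
+// A typical call site, sketched under the assumption that a constraint
+// object and a solved vector already exist (names are hypothetical):
+// distribute() imports the off-process values gathered into my_indices
+// above and then overwrites the constrained entries it locally owns.
+#if 0
+void apply_constraints (const ConstraintMatrix        &constraints,
+                        TrilinosWrappers::MPI::Vector &solution)
+{
+  // requires a one-to-one (non-overlapping) partitioning, as asserted
+  // at the top of distribute()
+  constraints.distribute (solution);
+}
+#endif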
+
+
+
+template<>
+void
+ConstraintMatrix::distribute (TrilinosWrappers::MPI::BlockVector &vec) const
+{
+  Assert (sorted==true, ExcMatrixIsClosed());
+
+  IndexSet my_indices (vec.size());
+  for (unsigned int block=0; block<vec.n_blocks(); ++block)
+    {
+      typedef std::vector<ConstraintLine>::const_iterator constraint_iterator;
+      ConstraintLine index_comparison;
+      index_comparison.line = vec.block(block).local_range().first
+                              +vec.get_block_indices().block_start(block);
+      const constraint_iterator begin_my_constraints =
+        Utilities::lower_bound (lines.begin(),lines.end(),index_comparison);
+
+      index_comparison.line = vec.block(block).local_range().second
+                              +vec.get_block_indices().block_start(block);
+
+      const constraint_iterator end_my_constraints
+        = Utilities::lower_bound(lines.begin(),lines.end(),index_comparison);
+
+      // Here we search all the indices that we
+      // need to have read-access to - the local
+      // nodes and all the nodes that the
+      // constraints indicate. No caching done
+      // yet. would need some more clever data
+      // structures for doing that.
+      const std::pair<unsigned int, unsigned int>
+      local_range = vec.block(block).local_range();
+
+      my_indices.add_range (local_range.first, local_range.second);
+
+      std::set<unsigned int> individual_indices;
+      for (constraint_iterator it = begin_my_constraints;
+           it != end_my_constraints; ++it)
+        for (unsigned int i=0; i<it->entries.size(); ++i)
+          if ((it->entries[i].first < local_range.first)
+              ||
+              (it->entries[i].first >= local_range.second))
+            individual_indices.insert (it->entries[i].first);
+
+      my_indices.add_indices (individual_indices.begin(),
+                              individual_indices.end());
+    }
+
+#ifdef DEAL_II_COMPILER_SUPPORTS_MPI
+  const Epetra_MpiComm *mpi_comm
+    = dynamic_cast<const Epetra_MpiComm *>(&vec.block(0).trilinos_vector().Comm());
+
+  Assert (mpi_comm != 0, ExcInternalError());
+
+  TrilinosWrappers::MPI::Vector vec_distribute
+  (my_indices.make_trilinos_map (mpi_comm->Comm(), true));
+#else
+  TrilinosWrappers::MPI::Vector vec_distribute
+  (my_indices.make_trilinos_map (MPI_COMM_WORLD, true));
+#endif
+
+  // here we import the data
+  vec_distribute.reinit(vec,true);
+
+  for (unsigned int block=0; block<vec.n_blocks(); ++block)
+    {
+      typedef std::vector<ConstraintLine>::const_iterator constraint_iterator;
+      ConstraintLine index_comparison;
+      index_comparison.line = vec.block(block).local_range().first
+                              +vec.get_block_indices().block_start(block);
+      const constraint_iterator begin_my_constraints =
+        Utilities::lower_bound (lines.begin(),lines.end(),index_comparison);
+
+      index_comparison.line = vec.block(block).local_range().second
+                              +vec.get_block_indices().block_start(block);
+
+      const constraint_iterator end_my_constraints
+        = Utilities::lower_bound(lines.begin(),lines.end(),index_comparison);
+
+      for (constraint_iterator it = begin_my_constraints;
+           it != end_my_constraints; ++it)
+        {
+          // fill entry in line
+          // next_constraint.line by adding the
+          // different contributions
+          double new_value = it->inhomogeneity;
+          for (unsigned int i=0; i<it->entries.size(); ++i)
+            new_value += (vec_distribute(it->entries[i].first) *
+                          it->entries[i].second);
+          vec(it->line) = new_value;
+        }
+      vec.block(block).compress(::dealii::VectorOperation::insert);
+    }
+}
+
+#endif
+
+#ifdef DEAL_II_USE_PETSC
+
+// this is a specialization for a
+// parallel (non-block) PETSc
+// vector. The basic idea is to just work
+// on the local range of the vector. But
+// we need access to values that the
+// local nodes are constrained to.
+
+template<>
+void
+ConstraintMatrix::distribute (PETScWrappers::MPI::Vector &vec) const
+{
+  Assert (sorted==true, ExcMatrixIsClosed());
+
+  typedef std::vector<ConstraintLine>::const_iterator constraint_iterator;
+  ConstraintLine index_comparison;
+  index_comparison.line = vec.local_range().first;
+  const constraint_iterator begin_my_constraints =
+    Utilities::lower_bound (lines.begin(),lines.end(),index_comparison);
+
+  index_comparison.line = vec.local_range().second;
+  const constraint_iterator end_my_constraints
+    = Utilities::lower_bound(lines.begin(),lines.end(),index_comparison);
+
+  // all indices we need to read from
+  IndexSet my_indices (vec.size());
+
+  const std::pair<unsigned int, unsigned int>
+  local_range = vec.local_range();
+
+  my_indices.add_range (local_range.first, local_range.second);
+
+  std::set<unsigned int> individual_indices;
+  for (constraint_iterator it = begin_my_constraints;
+       it != end_my_constraints; ++it)
+    for (unsigned int i=0; i<it->entries.size(); ++i)
+      if ((it->entries[i].first < local_range.first)
+          ||
+          (it->entries[i].first >= local_range.second))
+        individual_indices.insert (it->entries[i].first);
+
+  my_indices.add_indices (individual_indices.begin(),
+                          individual_indices.end());
+
+  IndexSet local_range_is (vec.size());
+  local_range_is.add_range(local_range.first, local_range.second);
+
+
+  // create a vector and import those indices
+  PETScWrappers::MPI::Vector ghost_vec (vec.get_mpi_communicator(),
+                                        local_range_is,
+                                        my_indices);
+  ghost_vec = vec;
+  ghost_vec.update_ghost_values();
+
+  // finally do the distribution on own
+  // constraints
+  for (constraint_iterator it = begin_my_constraints;
+       it != end_my_constraints; ++it)
+    {
+      // fill entry in line
+      // next_constraint.line by adding the
+      // different contributions
+      PetscScalar new_value = it->inhomogeneity;
+      for (unsigned int i=0; i<it->entries.size(); ++i)
+        new_value += (PetscScalar(ghost_vec(it->entries[i].first)) *
+                      it->entries[i].second);
+      vec(it->line) = new_value;
+    }
+
+  vec.compress ();
+}
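+
+// The ghost-vector idiom used above, condensed into a stand-alone
+// sketch (illustrative only; 'owned' and 'needed' correspond to
+// local_range_is and my_indices):
+#if 0
+void make_readable_copy (const PETScWrappers::MPI::Vector &vec,
+                         const IndexSet                   &owned,
+                         const IndexSet                   &needed,
+                         PETScWrappers::MPI::Vector       &ghosted)
+{
+  ghosted.reinit (vec.get_mpi_communicator(), owned, needed);
+  ghosted = vec;                   // import the locally owned values
+  ghosted.update_ghost_values();   // fetch the off-process entries
+}
+#endif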
+
+
+
+template<>
+void
+ConstraintMatrix::distribute (PETScWrappers::MPI::BlockVector &vec) const   // modified by shuqiangwang
+{
+  Assert (sorted==true, ExcMatrixIsClosed());
+
+  std::vector<IndexSet> my_indices (vec.n_blocks());        // locally relevant
+  std::vector<IndexSet> local_range_is (vec.n_blocks());    // locally owned
+  std::vector<unsigned int> block_sizes (vec.n_blocks());   // size of locally owned range
+
+  for (unsigned int block=0; block<vec.n_blocks(); ++block)
+    {
+      my_indices[block].set_size (vec.block(block).size());
+      local_range_is[block].set_size (vec.block(block).size());
+    }
+
+  for (unsigned int block=0; block<vec.n_blocks(); ++block)
+    {
+      typedef std::vector<ConstraintLine>::const_iterator constraint_iterator;
+      ConstraintLine index_comparison;
+      index_comparison.line = vec.block(block).local_range().first
+                              +vec.get_block_indices().block_start(block);
+      const constraint_iterator begin_my_constraints =
+        Utilities::lower_bound (lines.begin(),lines.end(),index_comparison);
+
+      index_comparison.line = vec.block(block).local_range().second
+                              +vec.get_block_indices().block_start(block);
+
+      const constraint_iterator end_my_constraints
+        = Utilities::lower_bound(lines.begin(),lines.end(),index_comparison);
+
+      // Here we search all the indices that we need to have
+      // read-access to - the local nodes and all the nodes that the
+      // constraints indicate. No caching done yet; that would need
+      // some more clever data structures.
+      const std::pair<unsigned int, unsigned int>
+      local_range = vec.block(block).local_range();
+      local_range_is[block].add_range (local_range.first, local_range.second);
+      block_sizes[block] = local_range.second - local_range.first;
+
+      my_indices[block].add_range (local_range.first, local_range.second);
+
+      std::set<unsigned int> individual_indices;
+      for (constraint_iterator it = begin_my_constraints;
+           it != end_my_constraints; ++it)
+        for (unsigned int i=0; i<it->entries.size(); ++i)
+          if ((it->entries[i].first < local_range.first)
+              ||
+              (it->entries[i].first >= local_range.second))
+            individual_indices.insert (it->entries[i].first
+                                       - vec.get_block_indices().block_start(block));
+
+      my_indices[block].add_indices (individual_indices.begin(),
+                                     individual_indices.end());
+    }
+
+  // create a ghosted block vector and import those indices
+  PETScWrappers::MPI::BlockVector ghost_vec;
+  ghost_vec.reinit (block_sizes, vec.get_mpi_communicator());
+  for (unsigned int block=0; block<vec.n_blocks(); ++block)
+    ghost_vec.block(block).reinit (vec.get_mpi_communicator(),
+                                   local_range_is[block],
+                                   my_indices[block]);
+  ghost_vec.collect_sizes();
+
+  ghost_vec = vec;
+  ghost_vec.update_ghost_values();
+
+  for (unsigned int block=0; block<vec.n_blocks(); ++block)
+    {
+      typedef std::vector<ConstraintLine>::const_iterator constraint_iterator;
+      ConstraintLine index_comparison;
+      index_comparison.line = vec.block(block).local_range().first
+                              +vec.get_block_indices().block_start(block);
+      const constraint_iterator begin_my_constraints =
+        Utilities::lower_bound (lines.begin(),lines.end(),index_comparison);
+
+      index_comparison.line = vec.block(block).local_range().second
+                              +vec.get_block_indices().block_start(block);
+
+      const constraint_iterator end_my_constraints
+        = Utilities::lower_bound(lines.begin(),lines.end(),index_comparison);
+
+      for (constraint_iterator it = begin_my_constraints;
+           it != end_my_constraints; ++it)
+        {
+          // fill entry in line next_constraint.line by adding the
+          // different contributions
+          PetscScalar new_value = it->inhomogeneity;
+          for (unsigned int i=0; i<it->entries.size(); ++i)
+            new_value += (PetscScalar(ghost_vec(it->entries[i].first)) *
+                          it->entries[i].second);
+          vec(it->line) = new_value;
+        }
+      vec.block(block).compress(::dealii::VectorOperation::insert);
+    }
+}
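+
+// The index arithmetic above mixes two numberings: constraint lines are
+// global, while each block's IndexSet is block-local. A sketch of the
+// conversion (illustrative only): if block 1 starts at global index
+// 100, global dof 103 is local dof 3 within that block.
+#if 0
+unsigned int
+global_to_block_local (const unsigned int global_index,
+                       const unsigned int block_start)
+{
+  return global_index - block_start;   // e.g. 103 - 100 == 3
+}
+#endif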
+
+#endif
+
+
+
+unsigned int ConstraintMatrix::n_constraints () const
+{
+  return lines.size();
+}
+
+
+
+bool ConstraintMatrix::is_identity_constrained (const unsigned int index) const
+{
+  if (is_constrained(index) == false)
+    return false;
+
+  const ConstraintLine &p = lines[lines_cache[calculate_line_index(index)]];
+  Assert (p.line == index, ExcInternalError());
+
+  // return whether an entry for this line was found and whether it
+  // has only one entry with weight equal to 1.0
+  return ((p.entries.size() == 1) &&
+          (p.entries[0].second == 1.0));
+}
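+
+// Example: after add_line(13) and add_entry(13, 42, 1.0), the line for
+// dof 13 reads x_13 = 1.0 * x_42, so is_identity_constrained(13)
+// returns true; a different weight or a second entry makes it false.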
+
+
+
+unsigned int ConstraintMatrix::max_constraint_indirections () const
+{
+  unsigned int return_value = 0;
+  for (std::vector<ConstraintLine>::const_iterator i=lines.begin();
+       i!=lines.end(); ++i)
+    // use static cast, since
+    // typeof(size)==std::size_t, which is !=
+    // unsigned int on AIX
+    return_value = std::max(return_value,
+                            static_cast<unsigned int>(i->entries.size()));
+
+  return return_value;
+}
+
+
+
+bool ConstraintMatrix::has_inhomogeneities () const
+{
+  for (std::vector<ConstraintLine>::const_iterator i=lines.begin();
+       i!=lines.end(); ++i)
+    if (i->inhomogeneity != 0.)
+      return true;
+
+  return false;
+}
+
+
+void ConstraintMatrix::print (std::ostream &out) const
+{
+  for (unsigned int i=0; i!=lines.size(); ++i)
+    {
+      // output the list of
+      // constraints as pairs of dofs
+      // and their weights
+      if (lines[i].entries.size() > 0)
+        {
+          for (unsigned int j=0; j<lines[i].entries.size(); ++j)
+            out << "    " << lines[i].line
+                << " " << lines[i].entries[j].first
+                << ":  " << lines[i].entries[j].second << "\n";
+
+          // print out inhomogeneity.
+          if (lines[i].inhomogeneity != 0)
+            out << "    " << lines[i].line
+                << ": " << lines[i].inhomogeneity << "\n";
+        }
+      else
+        // but also output something
+        // if the constraint simply
+        // reads x[13]=0, i.e. where
+        // the right hand side is not
+        // a linear combination of
+        // other dofs
+        {
+          if (lines[i].inhomogeneity != 0)
+            out << "    " << lines[i].line
+                << " = " << lines[i].inhomogeneity
+                << "\n";
+          else
+            out << "    " << lines[i].line << " = 0\n";
+        }
+    }
+
+  AssertThrow (out, ExcIO());
+}
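+
+// For a hypothetical constraint x_13 = 0.5*x_7 + 0.5*x_9 + 1.0, the
+// function above would print
+//     13 7:  0.5
+//     13 9:  0.5
+//     13: 1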
+
+
+
+void
+ConstraintMatrix::write_dot (std::ostream &out) const
+{
+  out << "digraph constraints {"
+      << std::endl;
+  for (unsigned int i=0; i!=lines.size(); ++i)
+    {
+      // same concept as in the
+      // previous function
+      if (lines[i].entries.size() > 0)
+        for (unsigned int j=0; j<lines[i].entries.size(); ++j)
+          out << "  " << lines[i].line << "->" << lines[i].entries[j].first
+              << "; // weight: "
+              << lines[i].entries[j].second
+              << "\n";
+      else
+        out << "  " << lines[i].line << "\n";
+    }
+  out << "}" << std::endl;
+}
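+
+// The resulting file can be rendered with graphviz, for example
+//   dot -Tsvg constraints.dot -o constraints.svg
+// where each edge points from a constrained dof to a dof it depends
+// on, annotated with its weight.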
+
+
+
+std::size_t
+ConstraintMatrix::memory_consumption () const
+{
+  return (MemoryConsumption::memory_consumption (lines) +
+          MemoryConsumption::memory_consumption (lines_cache) +
+          MemoryConsumption::memory_consumption (sorted) +
+          MemoryConsumption::memory_consumption (local_lines));
+}
+
+
+
+
+
+// explicit instantiations
+//
+// define a list of functions for vectors and matrices, respectively, where
+// the vector/matrix can be replaced using a preprocessor variable
+// VectorType/MatrixType. note that we need a space between "VectorType" and
+// ">" to disambiguate ">>" when VectorType trails in an angle bracket
+
+// TODO: The way we define all the instantiations is probably not the very
+// best one. Try to find a better description.
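+
+// Illustration of the '> >' spacing issue mentioned above (pre-C++11
+// sketch, illustrative only):
+#if 0
+#include <vector>
+std::vector<std::vector<int> > ok;     // '> >' is two closing brackets
+// std::vector<std::vector<int>> bad;  // pre-C++11, '>>' lexes as the
+//                                     // right-shift operator
+#endif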
+
+#define VECTOR_FUNCTIONS(VectorType) \
+  template void ConstraintMatrix::condense<VectorType >(const VectorType &uncondensed,\
+                                                        VectorType       &condensed) const;\
+  template void ConstraintMatrix::condense<VectorType >(VectorType &vec) const;\
+  template void ConstraintMatrix::condense<float,VectorType >(const SparseMatrix<float> &uncondensed, \
+                                                              const VectorType &uncondensed_vector, \
+                                                              SparseMatrix<float> &condensed, \
+                                                              VectorType       &condensed_vector) const; \
+  template void ConstraintMatrix::condense<double,VectorType >(const SparseMatrix<double> &uncondensed, \
+      const VectorType &uncondensed_vector, \
+      SparseMatrix<double> &condensed, \
+      VectorType       &condensed_vector) const; \
+  template void ConstraintMatrix:: \
+  distribute_local_to_global<VectorType > (const Vector<double>            &, \
+                                           const std::vector<unsigned int> &, \
+                                           VectorType                      &, \
+                                           const FullMatrix<double>        &) const; \
+  template void ConstraintMatrix::distribute<VectorType >(const VectorType &condensed,\
+                                                          VectorType       &uncondensed) const;\
+  template void ConstraintMatrix::distribute<VectorType >(VectorType &vec) const
+
+#define PARALLEL_VECTOR_FUNCTIONS(VectorType) \
+  template void ConstraintMatrix:: \
+  distribute_local_to_global<VectorType > (const Vector<double>            &, \
+                                           const std::vector<unsigned int> &, \
+                                           VectorType                      &, \
+                                           const FullMatrix<double>        &) const
+
+
+// TODO: Can PETSc really do all the operations required by the above
+// condense/distribute function etc also on distributed vectors? Trilinos
+// can't do that - we have to rewrite those functions by hand if we want to
+// use them. The key is to use local ranges etc., which still needs to be
+// implemented.
+#ifdef DEAL_II_USE_PETSC
+VECTOR_FUNCTIONS(PETScWrappers::MPI::Vector);
+VECTOR_FUNCTIONS(PETScWrappers::MPI::BlockVector);
+#endif
+
+#ifdef DEAL_II_USE_TRILINOS
+PARALLEL_VECTOR_FUNCTIONS(TrilinosWrappers::MPI::Vector);
+PARALLEL_VECTOR_FUNCTIONS(TrilinosWrappers::MPI::BlockVector);
+#endif
+
+#define MATRIX_VECTOR_FUNCTIONS(MatrixType, VectorType) \
+  template void ConstraintMatrix:: \
+  distribute_local_to_global<MatrixType,VectorType > (const FullMatrix<double>        &, \
+                                                      const Vector<double>            &, \
+                                                      const std::vector<unsigned int> &, \
+                                                      MatrixType                      &, \
+                                                      VectorType                      &, \
+                                                      bool                             , \
+                                                      internal::bool2type<false>) const
+#define MATRIX_FUNCTIONS(MatrixType) \
+  template void ConstraintMatrix:: \
+  distribute_local_to_global<MatrixType,Vector<double> > (const FullMatrix<double>        &, \
+                                                          const Vector<double>            &, \
+                                                          const std::vector<unsigned int> &, \
+                                                          MatrixType                      &, \
+                                                          Vector<double>                  &, \
+                                                          bool                             , \
+                                                          internal::bool2type<false>) const
+#define BLOCK_MATRIX_VECTOR_FUNCTIONS(MatrixType, VectorType)   \
+  template void ConstraintMatrix:: \
+  distribute_local_to_global<MatrixType,VectorType > (const FullMatrix<double>        &, \
+                                                      const Vector<double>            &, \
+                                                      const std::vector<unsigned int> &, \
+                                                      MatrixType                      &, \
+                                                      VectorType                      &, \
+                                                      bool                             , \
+                                                      internal::bool2type<true>) const
+#define BLOCK_MATRIX_FUNCTIONS(MatrixType)      \
+  template void ConstraintMatrix:: \
+  distribute_local_to_global<MatrixType,Vector<double> > (const FullMatrix<double>        &, \
+                                                          const Vector<double>            &, \
+                                                          const std::vector<unsigned int> &, \
+                                                          MatrixType                      &, \
+                                                          Vector<double>                  &, \
+                                                          bool                             , \
+                                                          internal::bool2type<true>) const
+
+MATRIX_FUNCTIONS(SparseMatrix<double>);
+MATRIX_FUNCTIONS(SparseMatrix<float>);
+MATRIX_FUNCTIONS(FullMatrix<double>);
+MATRIX_FUNCTIONS(FullMatrix<float>);
+MATRIX_VECTOR_FUNCTIONS(SparseMatrix<float>, Vector<float>);
+
+BLOCK_MATRIX_FUNCTIONS(BlockSparseMatrix<double>);
+BLOCK_MATRIX_FUNCTIONS(BlockSparseMatrix<float>);
+BLOCK_MATRIX_VECTOR_FUNCTIONS(BlockSparseMatrix<double>, BlockVector<double>);
+BLOCK_MATRIX_VECTOR_FUNCTIONS(BlockSparseMatrix<float>,  BlockVector<float>);
+BLOCK_MATRIX_VECTOR_FUNCTIONS(BlockSparseMatrix<float>,  BlockVector<double>);
+
+MATRIX_FUNCTIONS(SparseMatrixEZ<double>);
+MATRIX_FUNCTIONS(SparseMatrixEZ<float>);
+MATRIX_VECTOR_FUNCTIONS(SparseMatrixEZ<float>,  Vector<float>);
+
+// BLOCK_MATRIX_FUNCTIONS(BlockSparseMatrixEZ<double>);
+// BLOCK_MATRIX_VECTOR_FUNCTIONS(BlockSparseMatrixEZ<float>,  Vector<float>);
+
+#ifdef DEAL_II_USE_PETSC
+MATRIX_FUNCTIONS(PETScWrappers::SparseMatrix);
+BLOCK_MATRIX_FUNCTIONS(PETScWrappers::BlockSparseMatrix);
+MATRIX_FUNCTIONS(PETScWrappers::MPI::SparseMatrix);
+BLOCK_MATRIX_FUNCTIONS(PETScWrappers::MPI::BlockSparseMatrix);
+MATRIX_VECTOR_FUNCTIONS(PETScWrappers::SparseMatrix, PETScWrappers::Vector);
+BLOCK_MATRIX_VECTOR_FUNCTIONS(PETScWrappers::BlockSparseMatrix, PETScWrappers::BlockVector);
+MATRIX_VECTOR_FUNCTIONS(PETScWrappers::MPI::SparseMatrix, PETScWrappers::MPI::Vector);
+BLOCK_MATRIX_VECTOR_FUNCTIONS(PETScWrappers::MPI::BlockSparseMatrix ,PETScWrappers::MPI::BlockVector);
+#endif
+
+#ifdef DEAL_II_USE_TRILINOS
+MATRIX_FUNCTIONS(TrilinosWrappers::SparseMatrix);
+BLOCK_MATRIX_FUNCTIONS(TrilinosWrappers::BlockSparseMatrix);
+MATRIX_VECTOR_FUNCTIONS(TrilinosWrappers::SparseMatrix, TrilinosWrappers::Vector);
+BLOCK_MATRIX_VECTOR_FUNCTIONS(TrilinosWrappers::BlockSparseMatrix, TrilinosWrappers::BlockVector);
+MATRIX_VECTOR_FUNCTIONS(TrilinosWrappers::SparseMatrix, TrilinosWrappers::MPI::Vector);
+BLOCK_MATRIX_VECTOR_FUNCTIONS(TrilinosWrappers::BlockSparseMatrix, TrilinosWrappers::MPI::BlockVector);
+#endif
+
+
+#define SPARSITY_FUNCTIONS(SparsityType) \
+  template void ConstraintMatrix::add_entries_local_to_global<SparsityType> (\
+      const std::vector<unsigned int> &, \
+      SparsityType &,                    \
+      const bool,                        \
+      const Table<2,bool> &, \
+      internal::bool2type<false>) const; \
+  template void ConstraintMatrix::add_entries_local_to_global<SparsityType> (\
+      const std::vector<unsigned int> &, \
+      const std::vector<unsigned int> &, \
+      SparsityType &,                    \
+      const bool,                        \
+      const Table<2,bool> &) const
+#define BLOCK_SPARSITY_FUNCTIONS(SparsityType) \
+  template void ConstraintMatrix::add_entries_local_to_global<SparsityType> (\
+      const std::vector<unsigned int> &, \
+      SparsityType &,                    \
+      const bool,                        \
+      const Table<2,bool> &, \
+      internal::bool2type<true>) const; \
+  template void ConstraintMatrix::add_entries_local_to_global<SparsityType> (\
+      const std::vector<unsigned int> &, \
+      const std::vector<unsigned int> &, \
+      SparsityType &,                    \
+      const bool,                        \
+      const Table<2,bool> &) const
+
+SPARSITY_FUNCTIONS(SparsityPattern);
+SPARSITY_FUNCTIONS(CompressedSparsityPattern);
+SPARSITY_FUNCTIONS(CompressedSetSparsityPattern);
+SPARSITY_FUNCTIONS(CompressedSimpleSparsityPattern);
+BLOCK_SPARSITY_FUNCTIONS(BlockSparsityPattern);
+BLOCK_SPARSITY_FUNCTIONS(BlockCompressedSparsityPattern);
+BLOCK_SPARSITY_FUNCTIONS(BlockCompressedSetSparsityPattern);
+BLOCK_SPARSITY_FUNCTIONS(BlockCompressedSimpleSparsityPattern);
+
+#ifdef DEAL_II_USE_TRILINOS
+SPARSITY_FUNCTIONS(TrilinosWrappers::SparsityPattern);
+BLOCK_SPARSITY_FUNCTIONS(TrilinosWrappers::BlockSparsityPattern);
+#endif
+
+
+#define ONLY_MATRIX_FUNCTIONS(MatrixType) \
+  template void ConstraintMatrix::distribute_local_to_global<MatrixType > (\
+      const FullMatrix<double>        &, \
+      const std::vector<unsigned int> &, \
+      const std::vector<unsigned int> &, \
+      MatrixType                      &) const
+
+ONLY_MATRIX_FUNCTIONS(SparseMatrix<float>);
+ONLY_MATRIX_FUNCTIONS(SparseMatrix<double>);
+ONLY_MATRIX_FUNCTIONS(MatrixBlock<SparseMatrix<float> >);
+ONLY_MATRIX_FUNCTIONS(MatrixBlock<SparseMatrix<double> >);
+ONLY_MATRIX_FUNCTIONS(BlockSparseMatrix<float>);
+ONLY_MATRIX_FUNCTIONS(BlockSparseMatrix<double>);
+
+#ifdef DEAL_II_USE_TRILINOS
+ONLY_MATRIX_FUNCTIONS(TrilinosWrappers::SparseMatrix);
+ONLY_MATRIX_FUNCTIONS(TrilinosWrappers::BlockSparseMatrix);
+#endif
+
+#ifdef DEAL_II_USE_PETSC
+ONLY_MATRIX_FUNCTIONS(PETScWrappers::SparseMatrix);
+ONLY_MATRIX_FUNCTIONS(PETScWrappers::BlockSparseMatrix);
+ONLY_MATRIX_FUNCTIONS(PETScWrappers::MPI::SparseMatrix);
+ONLY_MATRIX_FUNCTIONS(PETScWrappers::MPI::BlockSparseMatrix);
+#endif
+
+#include "constraint_matrix.inst"
+
+DEAL_II_NAMESPACE_CLOSE

Added: branches/s-wang2/for_deal.II/source/lac/petsc_matrix_base.cc
===================================================================
--- branches/s-wang2/for_deal.II/source/lac/petsc_matrix_base.cc	                        (rev 0)
+++ branches/s-wang2/for_deal.II/source/lac/petsc_matrix_base.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,649 @@
+//---------------------------------------------------------------------------
+//    $Id: petsc_matrix_base.cc 27628 2012-11-20 22:49:26Z heister $
+//    Version: $Name$
+//
+//    Copyright (C) 2004, 2005, 2006, 2008, 2009, 2010, 2011, 2012 by the deal.II authors
+//
+//    This file is subject to QPL and may not be  distributed
+//    without copyright and license information. Please refer
+//    to the file deal.II/doc/license.html for the  text  and
+//    further information on this license.
+//
+//---------------------------------------------------------------------------
+
+
+#include <deal.II/lac/petsc_matrix_base.h>
+
+#ifdef DEAL_II_USE_PETSC
+
+#  include <deal.II/lac/petsc_full_matrix.h>
+#  include <deal.II/lac/petsc_sparse_matrix.h>
+#  include <deal.II/lac/petsc_parallel_sparse_matrix.h>
+#  include <deal.II/lac/petsc_vector.h>
+
+DEAL_II_NAMESPACE_OPEN
+
+namespace PETScWrappers
+{
+  namespace MatrixIterators
+  {
+    void
+    MatrixBase::const_iterator::Accessor::
+    visit_present_row ()
+    {
+      // if we are asked to visit the
+      // past-the-end line, then simply
+      // release all our caches and go on
+      // with life
+      if (this->a_row == matrix->m())
+        {
+          colnum_cache.reset ();
+          value_cache.reset ();
+
+          return;
+        }
+
+      // otherwise first flush PETSc caches
+      matrix->compress ();
+
+      // get a representation of the present
+      // row
+      PetscInt           ncols;
+      const PetscInt    *colnums;
+      const PetscScalar *values;
+
+      int ierr;
+      ierr = MatGetRow(*matrix, this->a_row, &ncols, &colnums, &values);
+      AssertThrow (ierr == 0, MatrixBase::ExcPETScError(ierr));
+
+      // copy it into our caches if the line
+      // isn't empty. if it is, then we've
+      // done something wrong, since we
+      // shouldn't have initialized an
+      // iterator for an empty line (what
+      // would it point to?)
+      Assert (ncols != 0, ExcInternalError());
+      colnum_cache.reset (new std::vector<unsigned int> (colnums,
+                                                         colnums+ncols));
+      value_cache.reset (new std::vector<PetscScalar> (values, values+ncols));
+
+      // and finally restore the matrix
+      ierr = MatRestoreRow(*matrix, this->a_row, &ncols, &colnums, &values);
+      AssertThrow (ierr == 0, MatrixBase::ExcPETScError(ierr));
+    }
+  }
+
+
+
+  MatrixBase::MatrixBase ()
+    :
+    last_action (LastAction::none)
+  {}
+
+
+
+  MatrixBase::~MatrixBase ()
+  {
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    const int ierr = MatDestroy (matrix);
+#else
+    const int ierr = MatDestroy (&matrix);
+#endif
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+  }
+
+
+
+  void
+  MatrixBase::clear ()
+  {
+    // destroy the matrix...
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    int ierr = MatDestroy (matrix);
+#else
+    int ierr = MatDestroy (&matrix);
+#endif
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+    // ...and replace it by an empty
+    // sequential matrix
+    const int m=0, n=0, n_nonzero_per_row=0;
+    ierr = MatCreateSeqAIJ(PETSC_COMM_SELF, m, n, n_nonzero_per_row,
+                           0, &matrix);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+  }
+
+
+
+  MatrixBase &
+  MatrixBase::operator = (const value_type d)
+  {
+    Assert (d==value_type(), ExcScalarAssignmentOnlyForZeroValue());
+
+    // flush previously cached elements. this
+    // seems to be necessary since petsc
+    // 2.2.1, at least for parallel vectors
+    // (see test bits/petsc_64)
+    compress ();
+
+    const int ierr = MatZeroEntries (matrix);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return *this;
+  }
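+
+  // Only assignment of the scalar zero is allowed here; e.g., writing
+  // `matrix = 0;` zeroes all stored entries via MatZeroEntries() while
+  // keeping the sparsity pattern intact.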
+
+
+
+  void
+  MatrixBase::clear_row (const unsigned int row,
+                         const PetscScalar  new_diag_value)
+  {
+    compress ();
+
+    // now set all the entries of this row to
+    // zero
+    const PetscInt petsc_row = row;
+
+    IS index_set;
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    ISCreateGeneral (get_mpi_communicator(), 1, &petsc_row, &index_set);
+#else
+    ISCreateGeneral (get_mpi_communicator(), 1, &petsc_row, PETSC_COPY_VALUES, &index_set);
+#endif
+
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    const int ierr
+      = MatZeroRowsIS(matrix, index_set, new_diag_value);
+#else
+    const int ierr
+      = MatZeroRowsIS(matrix, index_set, new_diag_value, PETSC_NULL, PETSC_NULL);
+#endif
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    ISDestroy (index_set);
+#else
+    ISDestroy (&index_set);
+#endif
+
+    compress ();
+  }
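+
+  // Illustrative use (hypothetical names): after assembly, a
+  // Dirichlet-constrained row can be zeroed while placing a one on the
+  // diagonal, e.g.
+  //
+  //   system_matrix.clear_row (constrained_row, /*new_diag_value=*/ 1.0);
+  //
+  // clear_rows() below does the same for a whole set of rows at once.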
+
+
+
+  void
+  MatrixBase::clear_rows (const std::vector<unsigned int> &rows,
+                          const PetscScalar                new_diag_value)
+  {
+    compress ();
+
+    // now set all the entries of these rows
+    // to zero
+    const std::vector<PetscInt> petsc_rows (rows.begin(), rows.end());
+
+    // call the functions. note that we have
+    // to call them even if #rows is empty,
+    // since this is a collective operation
+    IS index_set;
+
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    ISCreateGeneral (get_mpi_communicator(), rows.size(),
+                     &petsc_rows[0], &index_set);
+#else
+    ISCreateGeneral (get_mpi_communicator(), rows.size(),
+                     &petsc_rows[0], PETSC_COPY_VALUES, &index_set);
+#endif
+
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    const int ierr
+      = MatZeroRowsIS(matrix, index_set, new_diag_value);
+#else
+    const int ierr
+      = MatZeroRowsIS(matrix, index_set, new_diag_value, PETSC_NULL, PETSC_NULL);
+#endif
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    ISDestroy (index_set);
+#else
+    ISDestroy (&index_set);
+#endif
+
+    compress ();
+  }
+
+
+
+  PetscScalar
+  MatrixBase::el (const unsigned int i,
+                  const unsigned int j) const
+  {
+#ifdef PETSC_USE_64BIT_INDICES
+    PetscInt
+#else
+    int
+#endif
+    petsc_i = i, petsc_j = j;
+    PetscScalar value;
+
+    const int ierr
+      = MatGetValues (matrix, 1, &petsc_i, 1, &petsc_j,
+                      &value);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return value;
+  }
+
+
+
+  PetscScalar
+  MatrixBase::diag_element (const unsigned int i) const
+  {
+    Assert (m() == n(), ExcNotQuadratic());
+
+    // this doesn't seem to work any
+    // differently than any other element
+    return el(i,i);
+  }
+
+
+
+  void
+  MatrixBase::compress (::dealii::VectorOperation::values operation)   // note: the 'operation' argument is currently unused
+  {
+    // flush buffers
+    int ierr;
+    ierr = MatAssemblyBegin (matrix,MAT_FINAL_ASSEMBLY);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    ierr = MatAssemblyEnd (matrix,MAT_FINAL_ASSEMBLY);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    last_action = LastAction::none;
+  }
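+
+  // compress() wraps the collective MatAssemblyBegin/MatAssemblyEnd
+  // pair that PETSc requires between a phase of set()/add() calls and
+  // any read access to the matrix.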
+
+
+
+  unsigned int
+  MatrixBase::m () const
+  {
+#ifdef PETSC_USE_64BIT_INDICES
+    PetscInt
+#else
+    int
+#endif
+    n_rows, n_cols;
+    int ierr = MatGetSize (matrix, &n_rows, &n_cols);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return n_rows;
+  }
+
+
+
+  unsigned int
+  MatrixBase::n () const
+  {
+#ifdef PETSC_USE_64BIT_INDICES
+    PetscInt
+#else
+    int
+#endif
+    n_rows, n_cols;
+    int ierr = MatGetSize (matrix, &n_rows, &n_cols);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return n_cols;
+  }
+
+
+
+  unsigned int
+  MatrixBase::local_size () const
+  {
+#ifdef PETSC_USE_64BIT_INDICES
+    PetscInt
+#else
+    int
+#endif
+    n_rows, n_cols;
+    int ierr = MatGetLocalSize (matrix, &n_rows, &n_cols);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return n_rows;
+  }
+
+
+
+  std::pair<unsigned int, unsigned int>
+  MatrixBase::local_range () const
+  {
+#ifdef PETSC_USE_64BIT_INDICES
+    PetscInt
+#else
+    int
+#endif
+    begin, end;
+    const int ierr = MatGetOwnershipRange (static_cast<const Mat &>(matrix),
+                                           &begin, &end);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return std::make_pair (begin, end);
+  }
+
+
+
+  unsigned int
+  MatrixBase::n_nonzero_elements () const
+  {
+    MatInfo mat_info;
+    const int ierr
+      = MatGetInfo (matrix, MAT_GLOBAL_SUM, &mat_info);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return static_cast<unsigned int>(mat_info.nz_used);
+  }
+
+
+
+  unsigned int
+  MatrixBase::
+  row_length (const unsigned int row) const
+  {
+//TODO: this function will probably only work if compress() was called on the
+//matrix previously. however, we can't do this here, since it would impose
+//global communication and one would have to make sure that this function is
+//called the same number of times from all processors, something that is
+//unreasonable. there should simply be a way in PETSc to query the number of
+//entries in a row bypassing the call to compress(), but I can't find one
+    Assert (row < m(), ExcInternalError());
+
+    // get a representation of the present
+    // row
+    PetscInt ncols;
+    const PetscInt    *colnums;
+    const PetscScalar *values;
+
+//TODO: this is probably horribly inefficient; we should lobby for a way to
+//query this information from PETSc
+    int ierr;
+    ierr = MatGetRow(*this, row, &ncols, &colnums, &values);
+    AssertThrow (ierr == 0, MatrixBase::ExcPETScError(ierr));
+
+    // then restore the matrix and return the
+    // number of columns in this row as
+    // queried previously
+    ierr = MatRestoreRow(*this, row, &ncols, &colnums, &values);
+    AssertThrow (ierr == 0, MatrixBase::ExcPETScError(ierr));
+
+    return ncols;
+  }
+
+
+  PetscReal
+  MatrixBase::l1_norm () const
+  {
+    PetscReal result;
+
+    const int ierr
+      = MatNorm (matrix, NORM_1, &result);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return result;
+  }
+
+
+
+  PetscReal
+  MatrixBase::linfty_norm () const
+  {
+    PetscReal result;
+
+    const int ierr
+      = MatNorm (matrix, NORM_INFINITY, &result);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return result;
+  }
+
+
+
+  PetscReal
+  MatrixBase::frobenius_norm () const
+  {
+    PetscReal result;
+
+    const int ierr
+      = MatNorm (matrix, NORM_FROBENIUS, &result);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return result;
+  }
+
+
+  PetscScalar
+  MatrixBase::matrix_norm_square (const VectorBase &v) const
+  {
+    Vector tmp(v.size());
+    vmult (tmp, v);
+    return tmp*v;
+  }
+
+
+  PetscScalar
+  MatrixBase::matrix_scalar_product (const VectorBase &u,
+                                     const VectorBase &v) const
+  {
+    Vector tmp(v.size());
+    vmult (tmp, v);
+    return u*tmp;
+  }
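+
+  // For real-valued PetscScalar, matrix_norm_square(v) thus returns
+  // v^T A v and matrix_scalar_product(u,v) returns u^T A v.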
+
+
+#if DEAL_II_PETSC_VERSION_GTE(3,1,0)
+  PetscReal
+  MatrixBase::trace () const
+  {
+    PetscReal result;
+
+    const int ierr
+      = MatGetTrace (matrix, &result);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return result;
+  }
+#endif
+
+
+
+  MatrixBase &
+  MatrixBase::operator *= (const PetscScalar a)
+  {
+    const int ierr = MatScale (matrix, a);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return *this;
+  }
+
+
+
+  MatrixBase &
+  MatrixBase::operator /= (const PetscScalar a)
+  {
+    const PetscScalar factor = 1./a;
+    const int ierr = MatScale (matrix, factor);
+
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    return *this;
+  }
+
+
+  void
+  MatrixBase::vmult (VectorBase       &dst,
+                     const VectorBase &src) const
+  {
+    Assert (&src != &dst, ExcSourceEqualsDestination());
+
+    const int ierr = MatMult (matrix, src, dst);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+  }
+
+
+
+  void
+  MatrixBase::Tvmult (VectorBase       &dst,
+                      const VectorBase &src) const
+  {
+    Assert (&src != &dst, ExcSourceEqualsDestination());
+
+    const int ierr = MatMultTranspose (matrix, src, dst);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+  }
+
+
+
+  void
+  MatrixBase::vmult_add (VectorBase       &dst,
+                         const VectorBase &src) const
+  {
+    Assert (&src != &dst, ExcSourceEqualsDestination());
+
+    const int ierr = MatMultAdd (matrix, src, dst, dst);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+  }
+
+
+
+  void
+  MatrixBase::Tvmult_add (VectorBase       &dst,
+                          const VectorBase &src) const
+  {
+    Assert (&src != &dst, ExcSourceEqualsDestination());
+
+    const int ierr = MatMultTransposeAdd (matrix, src, dst, dst);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+  }
+
+
+  PetscScalar
+  MatrixBase::residual (VectorBase       &dst,
+                        const VectorBase &x,
+                        const VectorBase &b) const
+  {
+    // avoid the use of a temporary, and
+    // rather do one negation pass more than
+    // necessary
+    vmult (dst, x);
+    dst -= b;
+    dst *= -1;
+
+    return dst.l2_norm();
+  }
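+
+  // In other words, residual() computes dst = b - A x and returns
+  // ||b - A x||_2, so dst can be reused afterwards, e.g. (illustrative,
+  // names hypothetical):
+  //
+  //   const PetscScalar norm = system_matrix.residual (r, solution, rhs);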
+
+
+
+  MatrixBase::operator Mat () const
+  {
+    return matrix;
+  }
+
+  void
+  MatrixBase::transpose ()
+  {
+    int ierr = MatTranspose(matrix, MAT_REUSE_MATRIX, &matrix);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+  }
+
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+  PetscTruth
+#else
+  PetscBool
+#endif
+  MatrixBase::is_symmetric (const double tolerance)
+  {
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    PetscTruth
+#else
+    PetscBool
+#endif
+    truth;
+    // First flush PETSc caches
+    compress ();
+    MatIsSymmetric (matrix, tolerance, &truth);
+    return truth;
+  }
+
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+  PetscTruth
+#else
+  PetscBool
+#endif
+  MatrixBase::is_hermitian (const double tolerance)
+  {
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    PetscTruth
+#else
+    PetscBool
+#endif
+    truth;
+
+    // First flush PETSc caches
+    compress ();
+    MatIsHermitian (matrix, tolerance, &truth);
+
+    return truth;
+  }
+
+  void
+  MatrixBase::write_ascii() const
+  {
+    // First flush PETSc caches
+//    compress ();					// shuqiangwang: call compress() before call write_ascii().
+
+    // Set options
+    PetscViewerSetFormat (PETSC_VIEWER_STDOUT_WORLD,
+                          PETSC_VIEWER_DEFAULT);
+
+    // Write to screen
+    MatView (matrix,PETSC_VIEWER_STDOUT_WORLD);
+  }
+
+
+
+  std::size_t
+  MatrixBase::memory_consumption() const
+  {
+    MatInfo info;
+    MatGetInfo(matrix, MAT_LOCAL, &info);
+
+    return sizeof(*this) + static_cast<unsigned int>(info.memory);
+  }
+
+
+  void MatrixBase::copy_from(const MatrixBase &source)   // added by shuqiangwang
+  {
+    // destroy the current matrix...
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    int ierr = MatDestroy (matrix);
+#else
+    int ierr = MatDestroy (&matrix);
+#endif
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // ...and replace it by a deep copy of the source matrix, carrying
+    // over the cached assembly state as well
+    ierr = MatDuplicate (source.matrix, MAT_COPY_VALUES, &(this->matrix));
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+    this->last_action    = source.last_action;
+    this->column_indices = source.column_indices;
+    this->column_values  = source.column_values;
+  }
+
+  void MatrixBase::add(double factor, const MatrixBase &source)
+  {
+    // compute this += factor * source
+    const int ierr = MatAXPY (this->matrix, factor, source.matrix,
+                              DIFFERENT_NONZERO_PATTERN);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+  }
+
+}
+
+DEAL_II_NAMESPACE_CLOSE
+
+#endif // DEAL_II_USE_PETSC

Added: branches/s-wang2/for_deal.II/source/lac/petsc_solver.cc
===================================================================
--- branches/s-wang2/for_deal.II/source/lac/petsc_solver.cc	                        (rev 0)
+++ branches/s-wang2/for_deal.II/source/lac/petsc_solver.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,890 @@
+//---------------------------------------------------------------------------
+//    $Id: petsc_solver.cc 27668 2012-11-21 23:50:16Z bangerth $
+//    Version: $Name$
+//
+//    Copyright (C) 2004, 2006, 2008, 2009, 2010, 2012 by the deal.II authors
+//
+//    This file is subject to QPL and may not be  distributed
+//    without copyright and license information. Please refer
+//    to the file deal.II/doc/license.html for the  text  and
+//    further information on this license.
+//
+//---------------------------------------------------------------------------
+
+#include <deal.II/base/logstream.h>
+#include <deal.II/lac/petsc_solver.h>
+
+#ifdef DEAL_II_USE_PETSC
+
+#  include <deal.II/lac/petsc_matrix_base.h>
+#  include <deal.II/lac/petsc_vector_base.h>
+#  include <deal.II/lac/petsc_precondition.h>
+#  include <cmath>
+
+#include <petscversion.h>
+
+DEAL_II_NAMESPACE_OPEN
+
+namespace PETScWrappers
+{
+
+  SolverBase::SolverData::~SolverData ()
+  {
+    // destroy the solver object
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+    int ierr = KSPDestroy (ksp);
+#else
+    int ierr = KSPDestroy (&ksp);
+#endif
+
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+  }
+
+
+
+  SolverBase::SolverBase (SolverControl &cn,
+                          const MPI_Comm &mpi_communicator)
+    :
+    solver_control (cn),
+    mpi_communicator (mpi_communicator)
+  {}
+
+
+
+  SolverBase::~SolverBase ()
+  {}
+
+
+
+  void
+  SolverBase::solve (const MatrixBase         &A,
+                     VectorBase               &x,
+                     const VectorBase         &b,
+                     const PreconditionerBase &preconditioner)
+  {
+    int ierr;
+    // first create a solver object if this
+    // is necessary
+    if (solver_data.get() == 0)
+      {
+        solver_data.reset (new SolverData());
+
+        ierr = KSPCreate (mpi_communicator, &solver_data->ksp);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+        // set the matrices involved. the
+        // last argument is irrelevant here,
+        // since we use the solver only once
+        // anyway
+        ierr = KSPSetOperators (solver_data->ksp, A, preconditioner,
+                                SAME_PRECONDITIONER);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+        // let derived classes set the solver
+        // type, and the preconditioning
+        // object set the type of
+        // preconditioner
+        set_solver_type (solver_data->ksp);
+
+        ierr = KSPSetPC (solver_data->ksp, preconditioner.get_pc());
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+        // then a convergence monitor
+        // function. that function simply
+        // checks with the solver_control
+        // object we have in this object for
+        // convergence
+        KSPSetConvergenceTest (solver_data->ksp, &convergence_test,
+                               reinterpret_cast<void *>(&solver_control),
+                               PETSC_NULL);
+
+        KSPSetNormType(solver_data->ksp, KSP_NORM_UNPRECONDITIONED);	// shuqiangwang
+//        int maxits;
+//        double rtol, atol, dtol;
+//        KSPGetTolerances(solver_data->ksp, &rtol, &atol, &dtol, &maxits);
+////        KSPSetTolerances(solver_data->ksp, rtol, solver_control.tolerance(), dtol, solver_control.max_steps());
+//        KSPSetTolerances(solver_data->ksp, solver_control.tolerance(), atol, dtol, solver_control.max_steps());
+      }
+
+    // set the command line option prefix name
+    ierr = KSPSetOptionsPrefix(solver_data->ksp, prefix_name.c_str());
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // set the command line options provided
+    // by the user to override the defaults
+    ierr = KSPSetFromOptions (solver_data->ksp);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // then do the real work: set up solver
+    // internal data and solve the
+    // system.
+    ierr = KSPSetUp (solver_data->ksp);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    ierr = KSPSolve (solver_data->ksp, b, x);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // do not destroy solver object
+//    solver_data.reset ();
+
+    // in case of failure: throw
+    // exception
+    if (solver_control.last_check() != SolverControl::success)
+      throw SolverControl::NoConvergence (solver_control.last_step(),
+                                          solver_control.last_value());
+    // otherwise exit as normal
+  }
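+
+  // Illustrative call sequence for the interface above (names such as
+  // system_matrix, solution and rhs are hypothetical):
+  //
+  //   SolverControl control (1000, 1e-10);
+  //   PETScWrappers::SolverCG cg (control, mpi_communicator);
+  //   PETScWrappers::PreconditionBlockJacobi prec (system_matrix);
+  //   cg.solve (system_matrix, solution, rhs, prec);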
+
+
+  void
+  SolverBase::set_prefix(const std::string &prefix)
+  {
+    prefix_name = prefix;
+  }
+
+
+  void
+  SolverBase::reset()
+  {
+    solver_data.reset ();
+  }
+
+
+  SolverControl &
+  SolverBase::control() const
+  {
+    return solver_control;
+  }
+
+
+  int
+  SolverBase::convergence_test (KSP                 /*ksp*/,
+#ifdef PETSC_USE_64BIT_INDICES
+                                const PetscInt      iteration,
+#else
+                                const int           iteration,
+#endif
+                                const PetscReal     residual_norm,
+                                KSPConvergedReason *reason,
+                                void               *solver_control_x)
+  {
+    SolverControl &solver_control = *reinterpret_cast<SolverControl *>(solver_control_x);
+
+    const SolverControl::State state
+      = solver_control.check (iteration, residual_norm);
+
+    switch (state)
+      {
+      case ::dealii::SolverControl::iterate:
+        *reason = KSP_CONVERGED_ITERATING;
+        break;
+
+      case ::dealii::SolverControl::success:
+        *reason = static_cast<KSPConvergedReason>(1);
+        break;
+
+      case ::dealii::SolverControl::failure:
+        if (solver_control.last_step() > solver_control.max_steps())
+          *reason = KSP_DIVERGED_ITS;
+        else
+          *reason = KSP_DIVERGED_DTOL;
+        break;
+
+      default:
+        Assert (false, ExcNotImplemented());
+      }
+
+    // return without failure
+    return 0;
+  }
+
+
+
+  /* ---------------------- SolverRichardson ------------------------ */
+
+  SolverRichardson::AdditionalData::
+  AdditionalData (const double omega)
+    :
+    omega (omega)
+  {}
+
+
+
+  SolverRichardson::SolverRichardson (SolverControl        &cn,
+                                      const MPI_Comm       &mpi_communicator,
+                                      const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverRichardson::set_solver_type (KSP &ksp) const
+  {
+    int ierr;
+    ierr = KSPSetType (ksp, KSPRICHARDSON);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // set the damping factor from the data
+    ierr = KSPRichardsonSetScale (ksp, additional_data.omega);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // in the deal.II solvers, we always
+    // honor the initial guess in the
+    // solution vector. do so here as well:
+    KSPSetInitialGuessNonzero (ksp, PETSC_TRUE);
+
+    // Hand over the absolute
+    // tolerance and the maximum
+    // iteration number to the PETSc
+    // convergence criterion. The
+    // custom deal.II SolverControl
+    // object is ignored by the PETSc
+    // Richardson method (when no
+    // PETSc monitoring is present),
+    // since in this case PETSc
+    // uses a faster version of
+    // the Richardson iteration,
+    // where no residual is
+    // available.
+    KSPSetTolerances(ksp, PETSC_DEFAULT, this->solver_control.tolerance(),
+                     PETSC_DEFAULT, this->solver_control.max_steps()+1);
+  }
+
+
+  /* ---------------------- SolverChebychev ------------------------ */
+
+  SolverChebychev::SolverChebychev (SolverControl        &cn,
+                                    const MPI_Comm       &mpi_communicator,
+                                    const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverChebychev::set_solver_type (KSP &ksp) const
+  {
+    // set the type of solver. note the
+    // completely pointless change in
+    // spelling Chebyshev between PETSc 3.2
+    // and 3.3...
+    int ierr;
+
+#if DEAL_II_PETSC_VERSION_LT(3,3,0)
+    ierr = KSPSetType (ksp, KSPCHEBYCHEV);
+#else
+    ierr = KSPSetType (ksp, KSPCHEBYSHEV);
+#endif
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // in the deal.II solvers, we always
+    // honor the initial guess in the
+    // solution vector. do so here as well:
+    KSPSetInitialGuessNonzero (ksp, PETSC_TRUE);
+  }
+
+
+  /* ---------------------- SolverCG ------------------------ */
+
+  SolverCG::SolverCG (SolverControl        &cn,
+                      const MPI_Comm       &mpi_communicator,
+                      const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverCG::set_solver_type (KSP &ksp) const
+  {
+    int ierr;
+    ierr = KSPSetType (ksp, KSPCG);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // in the deal.II solvers, we always
+    // honor the initial guess in the
+    // solution vector. do so here as well:
+    KSPSetInitialGuessNonzero (ksp, PETSC_TRUE);
+  }
+
+
+  /* ---------------------- SolverBiCG ------------------------ */
+
+  SolverBiCG::SolverBiCG (SolverControl        &cn,
+                          const MPI_Comm       &mpi_communicator,
+                          const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverBiCG::set_solver_type (KSP &ksp) const
+  {
+    int ierr;
+    ierr = KSPSetType (ksp, KSPBICG);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // in the deal.II solvers, we always
+    // honor the initial guess in the
+    // solution vector. do so here as well:
+    KSPSetInitialGuessNonzero (ksp, PETSC_TRUE);
+  }
+
+
+  /* ---------------------- SolverGMRES ------------------------ */
+
+  SolverGMRES::AdditionalData::
+  AdditionalData (const unsigned int restart_parameter,
+                  const bool right_preconditioning)
+    :
+    restart_parameter (restart_parameter),
+    right_preconditioning (right_preconditioning)
+  {}
+
+
+
+  SolverGMRES::SolverGMRES (SolverControl        &cn,
+                            const MPI_Comm       &mpi_communicator,
+                            const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverGMRES::set_solver_type (KSP &ksp) const
+  {
+    int ierr;
+    ierr = KSPSetType (ksp, KSPGMRES);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // set the restart parameter from the
+    // data. we would like to use the simple
+    // code that is commented out, but this
+    // leads to nasty warning and error
+    // messages due to some stupidity on
+    // PETSc's side: KSPGMRESSetRestart is
+    // implemented as a macro in which return
+    // statements are hidden. This may work
+    // if people strictly follow the PETSc
+    // coding style of always having
+    // functions return an integer error
+    // code, but the present function isn't
+    // like this.
+    /*
+        ierr = KSPGMRESSetRestart (ksp, additional_data.restart_parameter);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+    */
+    // so rather expand their macros by hand,
+    // and do some equally nasty stuff that at
+    // least doesn't yield warnings...
+    int (*fun_ptr)(KSP,int);
+    ierr = PetscObjectQueryFunction((PetscObject)(ksp),
+                                    "KSPGMRESSetRestart_C",
+                                    (void (* *)())&fun_ptr);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    ierr = (*fun_ptr)(ksp,additional_data.restart_parameter);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // Set preconditioning side to
+    // right
+    if (additional_data.right_preconditioning)
+      {
+#if DEAL_II_PETSC_VERSION_LT(3,2,0)
+        ierr = KSPSetPreconditionerSide(ksp, PC_RIGHT);
+#else
+        ierr = KSPSetPCSide(ksp, PC_RIGHT);
+#endif
+
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+      }
+
+    // in the deal.II solvers, we always
+    // honor the initial guess in the
+    // solution vector. do so here as well:
+    KSPSetInitialGuessNonzero (ksp, PETSC_TRUE);
+  }
+
+
+  /* ---------------------- SolverBicgstab ------------------------ */
+
+  SolverBicgstab::SolverBicgstab (SolverControl        &cn,
+                                  const MPI_Comm       &mpi_communicator,
+                                  const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverBicgstab::set_solver_type (KSP &ksp) const
+  {
+    int ierr;
+    ierr = KSPSetType (ksp, KSPBCGS);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // in the deal.II solvers, we always
+    // honor the initial guess in the
+    // solution vector. do so here as well:
+    KSPSetInitialGuessNonzero (ksp, PETSC_TRUE);
+  }
+
+
+  /* ---------------------- SolverCGS ------------------------ */
+
+  SolverCGS::SolverCGS (SolverControl        &cn,
+                        const MPI_Comm       &mpi_communicator,
+                        const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverCGS::set_solver_type (KSP &ksp) const
+  {
+    int ierr;
+    ierr = KSPSetType (ksp, KSPCGS);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // in the deal.II solvers, we always
+    // honor the initial guess in the
+    // solution vector. do so here as well:
+    KSPSetInitialGuessNonzero (ksp, PETSC_TRUE);
+  }
+
+
+  /* ---------------------- SolverTFQMR ------------------------ */
+
+  SolverTFQMR::SolverTFQMR (SolverControl        &cn,
+                            const MPI_Comm       &mpi_communicator,
+                            const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverTFQMR::set_solver_type (KSP &ksp) const
+  {
+    int ierr;
+    ierr = KSPSetType (ksp, KSPTFQMR);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // in the deal.II solvers, we always
+    // honor the initial guess in the
+    // solution vector. do so here as well:
+    KSPSetInitialGuessNonzero (ksp, PETSC_TRUE);
+  }
+
+
+  /* ---------------------- SolverTCQMR ------------------------ */
+
+  SolverTCQMR::SolverTCQMR (SolverControl        &cn,
+                            const MPI_Comm       &mpi_communicator,
+                            const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverTCQMR::set_solver_type (KSP &ksp) const
+  {
+    int ierr;
+    ierr = KSPSetType (ksp, KSPTCQMR);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // in the deal.II solvers, we always
+    // honor the initial guess in the
+    // solution vector. do so here as well:
+    KSPSetInitialGuessNonzero (ksp, PETSC_TRUE);
+  }
+
+
+  /* ---------------------- SolverCR ------------------------ */
+
+  SolverCR::SolverCR (SolverControl        &cn,
+                      const MPI_Comm       &mpi_communicator,
+                      const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverCR::set_solver_type (KSP &ksp) const
+  {
+    int ierr;
+    ierr = KSPSetType (ksp, KSPCR);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // in the deal.II solvers, we always
+    // honor the initial guess in the
+    // solution vector. do so here as well:
+    KSPSetInitialGuessNonzero (ksp, PETSC_TRUE);
+  }
+
+
+  /* ---------------------- SolverLSQR ------------------------ */
+
+  SolverLSQR::SolverLSQR (SolverControl        &cn,
+                          const MPI_Comm       &mpi_communicator,
+                          const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverLSQR::set_solver_type (KSP &ksp) const
+  {
+    int ierr;
+    ierr = KSPSetType (ksp, KSPLSQR);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // in the deal.II solvers, we always
+    // honor the initial guess in the
+    // solution vector. do so here as well:
+    KSPSetInitialGuessNonzero (ksp, PETSC_TRUE);
+  }
+
+
+  /* ---------------------- SolverPreOnly ------------------------ */
+
+  SolverPreOnly::SolverPreOnly (SolverControl        &cn,
+                                const MPI_Comm       &mpi_communicator,
+                                const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data)
+  {}
+
+
+  void
+  SolverPreOnly::set_solver_type (KSP &ksp) const
+  {
+    int ierr;
+    ierr = KSPSetType (ksp, KSPPREONLY);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    // The KSPPREONLY solver of
+    // PETSc never calls the convergence
+    // monitor, which leads to failure
+    // even when everything was ok.
+    // Therefore the SolverControl status
+    // is set to some nice values, which
+    // guarantee a nice result at the end
+    // of the solution process.
+    solver_control.check (1, 0.0);
+
+    // Using the PREONLY solver with
+    // a nonzero initial guess leads
+    // PETSc to produce some error messages.
+    KSPSetInitialGuessNonzero (ksp, PETSC_FALSE);
+  }
+
+
+  /* ---------------------- SparseDirectMUMPS------------------------ */
+
+  SparseDirectMUMPS::SparseDirectMUMPS (SolverControl     &cn,
+                                        const MPI_Comm       &mpi_communicator,
+                                        const AdditionalData &data)
+    :
+    SolverBase (cn, mpi_communicator),
+    additional_data (data),
+    symmetric_mode(false)
+  {}
+
+
+  void
+  SparseDirectMUMPS::set_solver_type (KSP &ksp) const
+  {
+    /**
+     * KSPPREONLY implements a stub
+     * method that applies only the
+     * preconditioner.  Its use is due
+     * to SparseDirectMUMPS being
+     * a direct (rather than iterative)
+     * solver.
+     */
+    int ierr;
+    ierr = KSPSetType (ksp, KSPPREONLY);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    /**
+     * The KSPPREONLY solver of
+     * PETSc never calls the convergence
+     * monitor, which leads to failure
+     * even when everything was ok.
+     * Therefore, the SolverControl
+     * status is set to some nice
+     * values, which guarantee a nice
+     * result at the end of the solution
+     * process.
+     */
+    solver_control.check (1, 0.0);
+
+    /**
+     * Using a PREONLY solver with a
+     * nonzero initial guess leads PETSc
+     * to produce some error messages.
+     */
+    KSPSetInitialGuessNonzero (ksp, PETSC_FALSE);
+  }
+
+  void
+  SparseDirectMUMPS::solve (const MatrixBase &A,
+                            VectorBase       &x,
+                            const VectorBase &b)
+  {
+#ifdef PETSC_HAVE_MUMPS
+    /**
+     * had some trouble with the
+     * deallog printing to console
+     * the outcome of the solve function
+     * for every process. Brought
+     * down the depth level to zero
+     * to alleviate this
+     */
+    deallog.depth_console (0);
+    int ierr;
+
+    /**
+     * factorization matrix to be
+     * obtained from MUMPS
+     */
+    Mat F;
+
+    /**
+     * setting the MUMPS integer control
+     * parameters ICNTL to be passed to
+     * MUMPS: entry 7 of the ICNTL array
+     * (of size 40) is set to a value of
+     * 2, which selects the Approximate
+     * Minimum Fill (AMF) ordering
+     */
+    PetscInt ival=2, icntl=7;
+    /**
+     * number of iterations to
+     * solution (should be 1)
+     * for a direct solver
+     */
+    PetscInt its;
+    /**
+     * norm of residual
+     */
+    PetscReal rnorm;
+
+    /**
+     * creating a solver object
+     * if this is necessary
+     */
+    if (solver_data.get() == 0)
+      {
+        solver_data.reset (new SolverDataMUMPS ());
+
+        /**
+         * creates the default KSP
+         * context and puts it in
+         * the location solver_data->ksp
+         */
+        ierr = KSPCreate (mpi_communicator, &solver_data->ksp);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+        /**
+         * set the matrices involved.
+         * the last argument is irrelevant
+         * here, since we use the solver
+         * only once anyway
+         */
+        ierr = KSPSetOperators (solver_data->ksp, A, A,
+                                DIFFERENT_NONZERO_PATTERN);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+        /**
+         * setting the solver type
+         */
+        set_solver_type (solver_data->ksp);
+
+        /**
+         * getting the associated
+         * preconditioner context
+         */
+        ierr = KSPGetPC (solver_data->ksp, & solver_data->pc);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+        /**
+         * build the PETSc PC for the
+         * particular PCLU or PCCHOLESKY
+         * preconditioner, depending on
+         * whether the symmetric mode
+         * has been set
+         */
+        if (symmetric_mode)
+          ierr = PCSetType (solver_data->pc, PCCHOLESKY);
+        else
+          ierr = PCSetType (solver_data->pc, PCLU);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+        /**
+         * convergence monitor function
+         * that checks with the solver_control
+         * object for convergence
+         */
+        KSPSetConvergenceTest (solver_data->ksp, &convergence_test,
+                               reinterpret_cast<void *>(&solver_control),
+                               PETSC_NULL);
+
+        /**
+         * set the software that is to be
+         * used to perform the LU factorization.
+         * here we start to see differences
+         * from the base class solve function
+         */
+        ierr = PCFactorSetMatSolverPackage (solver_data->pc, MATSOLVERMUMPS);
+        AssertThrow (ierr == 0, ExcPETScError (ierr));
+
+        /**
+         * set up the package to call
+         * for the factorization
+         */
+        ierr = PCFactorSetUpMatSolverPackage (solver_data->pc);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+        /**
+         * get the factored matrix F from the
+         * preconditioner context.  This routine
+         * is valid only for LU, ILU, Cholesky,
+         * and incomplete Cholesky
+         */
+        ierr = PCFactorGetMatrix(solver_data->pc, &F);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+        /**
+         * Passing the control parameters
+         * to MUMPS
+         */
+        ierr = MatMumpsSetIcntl (F, icntl, ival);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+        /**
+         * set the command line option prefix name
+         */
+        ierr = KSPSetOptionsPrefix(solver_data->ksp, prefix_name.c_str());
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+        /**
+         * set the command line options provided
+         * by the user to override the defaults
+         */
+        ierr = KSPSetFromOptions (solver_data->ksp);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+      }
+
+    /**
+     * solve the linear system
+     */
+    ierr = KSPSolve (solver_data->ksp, b, x);
+    AssertThrow (ierr == 0, ExcPETScError(ierr));
+
+    /**
+     * in case of failure,
+     * throw an exception
+     */
+    if (solver_control.last_check() != SolverControl::success)
+      throw SolverControl::NoConvergence (solver_control.last_step(),
+                                          solver_control.last_value());
+    else
+      {
+        /**
+         * obtain convergence information:
+         * the number of iterations
+         * and the residual norm
+         */
+        ierr = KSPGetIterationNumber (solver_data->ksp, &its);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+        ierr = KSPGetResidualNorm (solver_data->ksp, &rnorm);
+        AssertThrow (ierr == 0, ExcPETScError(ierr));
+      }
+
+#else  // PETSC_HAVE_MUMPS
+    Assert (false,
+            ExcMessage ("Your PETSc installation does not include a copy of "
+                        "MUMPS package necessary for this solver"));
+
+    // Cast to void to silence compiler
+    // warnings
+    (void) A;
+    (void) x;
+    (void) b;
+#endif
+
+  }
+
+  int SparseDirectMUMPS::convergence_test (KSP            /*ksp*/,
+#ifdef PETSC_USE_64BIT_INDICES
+                                           const PetscInt       iteration,
+#else
+                                           const int            iteration,
+#endif
+                                           const PetscReal      residual_norm,
+                                           KSPConvergedReason   *reason,
+                                           void                 *solver_control_x)
+  {
+    SolverControl &solver_control = *reinterpret_cast<SolverControl *>(solver_control_x);
+
+    const SolverControl::State state
+      = solver_control.check (iteration, residual_norm);
+
+    switch (state)
+      {
+      case ::dealii::SolverControl::iterate:
+        *reason = KSP_CONVERGED_ITERATING;
+        break;
+
+      case ::dealii::SolverControl::success:
+        *reason = static_cast<KSPConvergedReason>(1);
+        break;
+
+      case ::dealii::SolverControl::failure:
+        if (solver_control.last_step() > solver_control.max_steps())
+          *reason = KSP_DIVERGED_ITS;
+        else
+          *reason = KSP_DIVERGED_DTOL;
+        break;
+
+      default:
+        Assert (false, ExcNotImplemented());
+      }
+
+    return 0;
+  }
+
+  void
+  SparseDirectMUMPS::set_symmetric_mode(const bool flag)
+  {
+    symmetric_mode = flag;
+  }
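+
+  // Illustrative use of the MUMPS interface above (hypothetical names;
+  // requires a PETSc installation built with MUMPS support):
+  //
+  //   SolverControl control;
+  //   PETScWrappers::SparseDirectMUMPS mumps (control, mpi_communicator);
+  //   mumps.set_symmetric_mode (true);   // Cholesky instead of LU
+  //   mumps.solve (system_matrix, solution, rhs);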
+
+}
+
+DEAL_II_NAMESPACE_CLOSE
+
+#endif // DEAL_II_USE_PETSC

Added: branches/s-wang2/for_deal.II/source/lac/trilinos_sparse_matrix.cc
===================================================================
--- branches/s-wang2/for_deal.II/source/lac/trilinos_sparse_matrix.cc	                        (rev 0)
+++ branches/s-wang2/for_deal.II/source/lac/trilinos_sparse_matrix.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,1574 @@
+//---------------------------------------------------------------------------
+//    $Id: trilinos_sparse_matrix.cc 27628 2012-11-20 22:49:26Z heister $
+//    Version: $Name$
+//
+//    Copyright (C) 2008, 2009, 2010, 2011, 2012 by the deal.II authors
+//
+//    This file is subject to QPL and may not be  distributed
+//    without copyright and license information. Please refer
+//    to the file deal.II/doc/license.html for the  text  and
+//    further information on this license.
+//
+//---------------------------------------------------------------------------
+
+
+#include <deal.II/lac/trilinos_sparse_matrix.h>
+
+#ifdef DEAL_II_USE_TRILINOS
+
+#  include <deal.II/base/utilities.h>
+#  include <deal.II/lac/sparse_matrix.h>
+#  include <deal.II/lac/trilinos_sparsity_pattern.h>
+#  include <deal.II/lac/sparsity_pattern.h>
+#  include <deal.II/lac/compressed_sparsity_pattern.h>
+#  include <deal.II/lac/compressed_set_sparsity_pattern.h>
+#  include <deal.II/lac/compressed_simple_sparsity_pattern.h>
+
+#  include <ml_epetra_utils.h>
+#  include <ml_struct.h>
+#  include <Teuchos_RCP.hpp>
+
+DEAL_II_NAMESPACE_OPEN
+
+namespace TrilinosWrappers
+{
+  namespace MatrixIterators
+  {
+    void
+    SparseMatrix::const_iterator::Accessor::
+    visit_present_row ()
+    {
+      // if we are asked to visit the
+      // past-the-end line, then simply
+      // release all our caches and go on
+      // with life
+      if (this->a_row == matrix->m())
+        {
+          colnum_cache.reset ();
+          value_cache.reset ();
+
+          return;
+        }
+
+      // otherwise first flush Trilinos caches
+      matrix->compress ();
+
+      // get a representation of the present
+      // row
+      int ncols;
+      int colnums = matrix->n();   // buffer length handed to ExtractGlobalRowCopy below
+      if (value_cache.get() == 0)
+        {
+          value_cache.reset (new std::vector<TrilinosScalar> (matrix->n()));
+          colnum_cache.reset (new std::vector<unsigned int> (matrix->n()));
+        }
+      else
+        {
+          value_cache->resize (matrix->n());
+          colnum_cache->resize (matrix->n());
+        }
+
+      int ierr = matrix->trilinos_matrix().
+                 ExtractGlobalRowCopy((int)this->a_row,
+                                      colnums,
+                                      ncols, &((*value_cache)[0]),
+                                      reinterpret_cast<int *>(&((*colnum_cache)[0])));
+      value_cache->resize (ncols);
+      colnum_cache->resize (ncols);
+      AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+      // note that the caches above were resized to the actual row
+      // length. the row shouldn't be empty: we should never have
+      // initialized an iterator for an empty line (what would it
+      // point to?)
+    }
+  }
+
+
+  // The constructor is actually the
+  // only point where we have to check
+  // whether we build a serial or a
+  // parallel Trilinos matrix.
+  // Actually, it does not even matter
+  // how many threads there are, but
+  // only if we use an MPI compiler or
+  // a standard compiler. So, even one
+  // thread on a configuration with
+  // MPI will still get a parallel
+  // interface.
+  SparseMatrix::SparseMatrix ()
+    :
+    column_space_map (new Epetra_Map (0, 0,
+                                      Utilities::Trilinos::comm_self())),
+    matrix (new Epetra_FECrsMatrix(View, *column_space_map,
+                                   *column_space_map, 0)),
+    last_action (Zero),
+    compressed (true)
+  {
+    matrix->FillComplete();
+  }
+
+
+
+  SparseMatrix::SparseMatrix (const Epetra_Map &input_map,
+                              const unsigned int n_max_entries_per_row)
+    :
+    column_space_map (new Epetra_Map (input_map)),
+    matrix (new Epetra_FECrsMatrix(Copy, *column_space_map,
+                                   int(n_max_entries_per_row), false)),
+    last_action (Zero),
+    compressed (false)
+  {}
+
+
+
+  SparseMatrix::SparseMatrix (const Epetra_Map                &input_map,
+                              const std::vector<unsigned int> &n_entries_per_row)
+    :
+    column_space_map (new Epetra_Map (input_map)),
+    matrix (new Epetra_FECrsMatrix
+            (Copy, *column_space_map,
+             (int *)const_cast<unsigned int *>(&(n_entries_per_row[0])),
+             false)),
+    last_action (Zero),
+    compressed (false)
+  {}
+
+
+
+  SparseMatrix::SparseMatrix (const Epetra_Map &input_row_map,
+                              const Epetra_Map &input_col_map,
+                              const unsigned int n_max_entries_per_row)
+    :
+    column_space_map (new Epetra_Map (input_col_map)),
+    matrix (new Epetra_FECrsMatrix(Copy, input_row_map,
+                                   int(n_max_entries_per_row), false)),
+    last_action (Zero),
+    compressed (false)
+  {}
+
+
+
+  SparseMatrix::SparseMatrix (const Epetra_Map                &input_row_map,
+                              const Epetra_Map                &input_col_map,
+                              const std::vector<unsigned int> &n_entries_per_row)
+    :
+    column_space_map (new Epetra_Map (input_col_map)),
+    matrix (new Epetra_FECrsMatrix(Copy, input_row_map,
+                                   (int *)const_cast<unsigned int *>(&(n_entries_per_row[0])),
+                                   false)),
+    last_action (Zero),
+    compressed (false)
+  {}
+
+
+
+  SparseMatrix::SparseMatrix (const unsigned int m,
+                              const unsigned int n,
+                              const unsigned int n_max_entries_per_row)
+    :
+    column_space_map (new Epetra_Map (static_cast<int>(n), 0,
+                                      Utilities::Trilinos::comm_self())),
+
+    // on one processor only, we know how the
+    // columns of the matrix will be
+    // distributed (everything on one
+    // processor), so we can hand in this
+    // information to the constructor. we
+    // can't do so in parallel, where the
+    // information from columns is only
+    // available when entries have been added
+    matrix (new Epetra_FECrsMatrix(Copy,
+                                   Epetra_Map (static_cast<int>(m), 0,
+                                               Utilities::Trilinos::comm_self()),
+                                   *column_space_map,
+                                   n_max_entries_per_row,
+                                   false)),
+    last_action (Zero),
+    compressed (false)
+  {}
+
+
+
+  SparseMatrix::SparseMatrix (const unsigned int               m,
+                              const unsigned int               n,
+                              const std::vector<unsigned int> &n_entries_per_row)
+    :
+    column_space_map (new Epetra_Map (static_cast<int>(n), 0,
+                                      Utilities::Trilinos::comm_self())),
+    matrix (new Epetra_FECrsMatrix(Copy,
+                                   Epetra_Map (static_cast<int>(m), 0,
+                                               Utilities::Trilinos::comm_self()),
+                                   *column_space_map,
+                                   (int *)const_cast<unsigned int *>(&(n_entries_per_row[0])),
+                                   false)),
+    last_action (Zero),
+    compressed (false)
+  {}
+
+
+
+  SparseMatrix::SparseMatrix (const IndexSet     &parallel_partitioning,
+                              const MPI_Comm     &communicator,
+                              const unsigned int n_max_entries_per_row)
+    :
+    column_space_map (new Epetra_Map(parallel_partitioning.
+                                     make_trilinos_map(communicator, false))),
+    matrix (new Epetra_FECrsMatrix(Copy,
+                                   *column_space_map,
+                                   n_max_entries_per_row,
+                                   false)),
+    last_action (Zero),
+    compressed (false)
+  {}
+
+
+
+  SparseMatrix::SparseMatrix (const IndexSet     &parallel_partitioning,
+                              const MPI_Comm     &communicator,
+                              const std::vector<unsigned int> &n_entries_per_row)
+    :
+    column_space_map (new Epetra_Map(parallel_partitioning.
+                                     make_trilinos_map(communicator, false))),
+    matrix (new Epetra_FECrsMatrix(Copy,
+                                   *column_space_map,
+                                   (int *)const_cast<unsigned int *>(&(n_entries_per_row[0])),
+                                   false)),
+    last_action (Zero),
+    compressed (false)
+  {}
+
+
+
+  SparseMatrix::SparseMatrix (const IndexSet     &row_parallel_partitioning,
+                              const IndexSet     &col_parallel_partitioning,
+                              const MPI_Comm     &communicator,
+                              const unsigned int n_max_entries_per_row)
+    :
+    column_space_map (new Epetra_Map(col_parallel_partitioning.
+                                     make_trilinos_map(communicator, false))),
+    matrix (new Epetra_FECrsMatrix(Copy,
+                                   row_parallel_partitioning.
+                                   make_trilinos_map(communicator, false),
+                                   n_max_entries_per_row,
+                                   false)),
+    last_action (Zero),
+    compressed (false)
+  {}
+
+
+
+  SparseMatrix::SparseMatrix (const IndexSet     &row_parallel_partitioning,
+                              const IndexSet     &col_parallel_partitioning,
+                              const MPI_Comm     &communicator,
+                              const std::vector<unsigned int> &n_entries_per_row)
+    :
+    column_space_map (new Epetra_Map(col_parallel_partitioning.
+                                     make_trilinos_map(communicator, false))),
+    matrix (new Epetra_FECrsMatrix(Copy,
+                                   row_parallel_partitioning.
+                                   make_trilinos_map(communicator, false),
+                                   (int *)const_cast<unsigned int *>(&(n_entries_per_row[0])),
+                                   false)),
+    last_action (Zero),
+    compressed (false)
+  {}
+
+
+
+  SparseMatrix::SparseMatrix (const SparsityPattern &sparsity_pattern)
+    :
+    column_space_map (new Epetra_Map (sparsity_pattern.domain_partitioner())),
+    matrix (new Epetra_FECrsMatrix(Copy,
+                                   sparsity_pattern.trilinos_sparsity_pattern(),
+                                   false)),
+    last_action (Zero),
+    compressed (true)
+  {
+    Assert(sparsity_pattern.trilinos_sparsity_pattern().Filled() == true,
+           ExcMessage("The Trilinos sparsity pattern has not been compressed."));
+    compress();
+  }
+
+
+
+  SparseMatrix::SparseMatrix (const SparseMatrix &input_matrix)
+    :
+    Subscriptor(),
+    column_space_map (new Epetra_Map (input_matrix.domain_partitioner())),
+    matrix (new Epetra_FECrsMatrix(*input_matrix.matrix)),
+    last_action (Zero),
+    compressed (true)
+  {}
+
+
+
+  SparseMatrix::~SparseMatrix ()
+  {}
+
+
+
+  void
+  SparseMatrix::copy_from (const SparseMatrix &m)
+  {
+
+    // check whether we need to update the
+    // partitioner or can just copy the data:
+    // if the distribution is the same, a
+    // plain copy of the data suffices.
+    if (local_range() == m.local_range())
+      *matrix = *m.matrix;
+    else
+      {
+        column_space_map.reset (new Epetra_Map (m.domain_partitioner()));
+
+        // release memory before reallocation
+        matrix.reset ();
+        temp_vector.clear ();
+        matrix.reset (new Epetra_FECrsMatrix(*m.matrix));
+      }
+
+    compress();
+  }
+
+
+
+  template <typename SparsityType>
+  void
+  SparseMatrix::reinit (const SparsityType &sparsity_pattern)
+  {
+    const Epetra_Map rows (static_cast<int>(sparsity_pattern.n_rows()),
+                           0,
+                           Utilities::Trilinos::comm_self());
+    const Epetra_Map columns (static_cast<int>(sparsity_pattern.n_cols()),
+                              0,
+                              Utilities::Trilinos::comm_self());
+
+    reinit (rows, columns, sparsity_pattern);
+  }
+
+
+
+  template <typename SparsityType>
+  void
+  SparseMatrix::reinit (const Epetra_Map    &input_map,
+                        const SparsityType &sparsity_pattern,
+                        const bool           exchange_data)
+  {
+    reinit (input_map, input_map, sparsity_pattern, exchange_data);
+  }
+
+
+
+  template <typename SparsityType>
+  void
+  SparseMatrix::reinit (const Epetra_Map    &input_row_map,
+                        const Epetra_Map    &input_col_map,
+                        const SparsityType &sparsity_pattern,
+                        const bool           exchange_data)
+  {
+    // release memory before reallocation
+    temp_vector.clear();
+    matrix.reset();
+
+    // if we want to exchange data, build
+    // a usual Trilinos sparsity pattern
+    // and let that handle the
+    // exchange. otherwise, manually
+    // create a CrsGraph, which consumes
+    // considerably less memory because it
+    // can set the correct number of indices
+    // right from the start
+    if (exchange_data)
+      {
+        SparsityPattern trilinos_sparsity;
+        trilinos_sparsity.reinit (input_row_map, input_col_map,
+                                  sparsity_pattern, exchange_data);
+        reinit (trilinos_sparsity);
+
+        return;
+      }
+
+    Assert (exchange_data == false, ExcNotImplemented());
+    if (input_row_map.Comm().MyPID() == 0)
+      {
+        AssertDimension (sparsity_pattern.n_rows(),
+                         static_cast<unsigned int>(input_row_map.NumGlobalElements()));
+        AssertDimension (sparsity_pattern.n_cols(),
+                         static_cast<unsigned int>(input_col_map.NumGlobalElements()));
+      }
+
+    column_space_map.reset (new Epetra_Map (input_col_map));
+
+    const unsigned int first_row = input_row_map.MinMyGID(),
+                       last_row = input_row_map.MaxMyGID()+1;
+    std::vector<int> n_entries_per_row(last_row-first_row);
+
+    for (unsigned int row=first_row; row<last_row; ++row)
+      n_entries_per_row[row-first_row] = sparsity_pattern.row_length(row);
+
+    // The deal.II notion of a sparsity
+    // pattern corresponds to the Epetra
+    // concept of a Graph. Hence, we generate
+    // a graph by copying the sparsity pattern
+    // into it, and then build up the matrix
+    // from the graph. This is considerably
+    // faster than directly filling elements
+    // into the matrix. Moreover, it consumes
+    // less memory, since the internal
+    // reordering is done on ints only, and we
+    // can leave the doubles aside.
+
+    // For more than one processor, we need to
+    // specify only the row map first and let
+    // the matrix entries decide about the
+    // column map (which says which columns are
+    // present in the matrix, not to be
+    // confused with the col_map that tells
+    // how the domain dofs of the matrix will
+    // be distributed). For only one
+    // processor, we can directly assign the
+    // columns as well. Compare this with bug
+    // # 4123 in the Sandia Bugzilla.
+    std_cxx1x::shared_ptr<Epetra_CrsGraph> graph;
+    if (input_row_map.Comm().NumProc() > 1)
+      graph.reset (new Epetra_CrsGraph (Copy, input_row_map,
+                                        &n_entries_per_row[0], true));
+    else
+      graph.reset (new Epetra_CrsGraph (Copy, input_row_map, input_col_map,
+                                        &n_entries_per_row[0], true));
+
+    // This function assumes that the
+    // sparsity pattern sits on all processors
+    // (completely). The parallel version uses
+    // an Epetra graph that is already
+    // distributed.
+
+    // now insert the indices
+    std::vector<int>   row_indices;
+
+    for (unsigned int row=first_row; row<last_row; ++row)
+      {
+        const int row_length = sparsity_pattern.row_length(row);
+        if (row_length == 0)
+          continue;
+
+        row_indices.resize (row_length, -1);
+
+        typename SparsityType::row_iterator col_num = sparsity_pattern.row_begin (row),
+                                            row_end = sparsity_pattern.row_end(row);
+        for (unsigned int col = 0; col_num != row_end; ++col_num, ++col)
+          row_indices[col] = *col_num;
+
+        graph->Epetra_CrsGraph::InsertGlobalIndices (row, row_length,
+                                                     &row_indices[0]);
+      }
+
+    // Finally, optimize the graph
+    // structure (sort indices, make memory
+    // contiguous, etc.).
+    graph->FillComplete(input_col_map, input_row_map);
+    graph->OptimizeStorage();
+
+    // check whether we got the number of
+    // columns right.
+    AssertDimension (sparsity_pattern.n_cols(),
+                     static_cast<unsigned int>(graph->NumGlobalCols()));
+
+    // And now finally generate the matrix.
+    matrix.reset (new Epetra_FECrsMatrix(Copy, *graph, false));
+    last_action = Zero;
+
+    // In the end, the matrix needs to
+    // be compressed in order to be
+    // really ready.
+    compress();
+  }
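+
+  // A minimal usage sketch (illustrative; the maps and the compressed
+  // sparsity pattern `csp' are assumed to be set up by the caller):
+  //
+  //   Epetra_Map row_map (...), col_map (...);
+  //   CompressedSimpleSparsityPattern csp (n_rows, n_cols);
+  //   // ... fill csp ...
+  //   TrilinosWrappers::SparseMatrix A;
+  //   A.reinit (row_map, col_map, csp, /*exchange_data=*/false);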
+
+
+
+  void
+  SparseMatrix::reinit (const SparsityPattern &sparsity_pattern)
+  {
+    temp_vector.clear ();
+    matrix.reset ();
+
+    // reinit with a (parallel) Trilinos
+    // sparsity pattern.
+    column_space_map.reset (new Epetra_Map
+                            (sparsity_pattern.domain_partitioner()));
+    matrix.reset (new Epetra_FECrsMatrix
+                  (Copy, sparsity_pattern.trilinos_sparsity_pattern(), false));
+    compress();
+  }
+
+
+
+  void
+  SparseMatrix::reinit (const SparseMatrix &sparse_matrix)
+  {
+    column_space_map.reset (new Epetra_Map (sparse_matrix.domain_partitioner()));
+    temp_vector.clear ();
+    matrix.reset ();
+    matrix.reset (new Epetra_FECrsMatrix
+                  (Copy, sparse_matrix.trilinos_sparsity_pattern(), false));
+
+    compress();
+  }
+
+
+
+  template <typename number>
+  void
+  SparseMatrix::reinit (const ::dealii::SparseMatrix<number> &dealii_sparse_matrix,
+                        const double                          drop_tolerance,
+                        const bool                            copy_values,
+                        const ::dealii::SparsityPattern      *use_this_sparsity)
+  {
+    const Epetra_Map rows (static_cast<int>(dealii_sparse_matrix.m()),
+                           0,
+                           Utilities::Trilinos::comm_self());
+    const Epetra_Map columns (static_cast<int>(dealii_sparse_matrix.n()),
+                              0,
+                              Utilities::Trilinos::comm_self());
+    reinit (rows, columns, dealii_sparse_matrix, drop_tolerance,
+            copy_values, use_this_sparsity);
+  }
+
+
+
+  template <typename number>
+  void
+  SparseMatrix::reinit (const Epetra_Map                     &input_map,
+                        const ::dealii::SparseMatrix<number> &dealii_sparse_matrix,
+                        const double                          drop_tolerance,
+                        const bool                            copy_values,
+                        const ::dealii::SparsityPattern      *use_this_sparsity)
+  {
+    reinit (input_map, input_map, dealii_sparse_matrix, drop_tolerance,
+            copy_values, use_this_sparsity);
+  }
+
+
+
+  template <typename number>
+  void
+  SparseMatrix::reinit (const Epetra_Map                     &input_row_map,
+                        const Epetra_Map                     &input_col_map,
+                        const ::dealii::SparseMatrix<number> &dealii_sparse_matrix,
+                        const double                          drop_tolerance,
+                        const bool                            copy_values,
+                        const ::dealii::SparsityPattern      *use_this_sparsity)
+  {
+    if (copy_values == false)
+      {
+        // in case we do not copy values, just
+        // call the other function.
+        if (use_this_sparsity == 0)
+          reinit (input_row_map, input_col_map,
+                  dealii_sparse_matrix.get_sparsity_pattern());
+        else
+          reinit (input_row_map, input_col_map,
+                  *use_this_sparsity);
+        return;
+      }
+
+    unsigned int n_rows = dealii_sparse_matrix.m();
+
+    Assert (input_row_map.NumGlobalElements() == (int)n_rows,
+            ExcDimensionMismatch (input_row_map.NumGlobalElements(),
+                                  n_rows));
+    Assert (input_col_map.NumGlobalElements() == (int)dealii_sparse_matrix.n(),
+            ExcDimensionMismatch (input_col_map.NumGlobalElements(),
+                                  dealii_sparse_matrix.n()));
+
+    const ::dealii::SparsityPattern &sparsity_pattern =
+      (use_this_sparsity!=0)? *use_this_sparsity :
+      dealii_sparse_matrix.get_sparsity_pattern();
+
+    if (matrix.get() != 0 && m() == n_rows &&
+        n_nonzero_elements() == sparsity_pattern.n_nonzero_elements())
+      goto set_matrix_values;
+
+    {
+      SparsityPattern trilinos_sparsity;
+      trilinos_sparsity.reinit (input_row_map, input_col_map, sparsity_pattern);
+      reinit (trilinos_sparsity);
+    }
+
+set_matrix_values:
+    // fill the values: as above, go
+    // through all rows of the matrix, and then
+    // all columns. since the sparsity patterns of
+    // the input matrix and the specified sparsity
+    // pattern might differ, we need to walk
+    // through the rows of both these sparsity
+    // structures simultaneously in order to
+    // really set the correct values.
+    const std::size_t *const in_rowstart_indices
+      = dealii_sparse_matrix.get_sparsity_pattern().get_rowstart_indices();
+    const unsigned int *const in_cols
+      = dealii_sparse_matrix.get_sparsity_pattern().get_column_numbers();
+    const unsigned int *cols = sparsity_pattern.get_column_numbers();
+    const std::size_t *rowstart_indices =
+      sparsity_pattern.get_rowstart_indices();
+
+    unsigned int maximum_row_length = matrix->MaxNumEntries();
+    std::vector<unsigned int> row_indices (maximum_row_length);
+    std::vector<TrilinosScalar> values (maximum_row_length);
+    std::size_t in_index, index;
+
+    for (unsigned int row=0; row<n_rows; ++row)
+      if (input_row_map.MyGID(static_cast<int>(row)))
+        {
+          index = rowstart_indices[row];
+          in_index = in_rowstart_indices[row];
+          unsigned int col = 0;
+          if (sparsity_pattern.optimize_diagonal())
+            {
+              values[col] = dealii_sparse_matrix.global_entry(in_index);
+              row_indices[col++] = row;
+              ++index;
+              ++in_index;
+            }
+
+          while (in_index < in_rowstart_indices[row+1] &&
+                 index < rowstart_indices[row+1])
+            {
+              // test the bounds before dereferencing, so that the
+              // comparison cannot read past the end of either row
+              while (index < rowstart_indices[row+1] && cols[index] < in_cols[in_index])
+                ++index;
+              while (in_index < in_rowstart_indices[row+1] && in_cols[in_index] < cols[index])
+                ++in_index;
+
+              if (std::fabs(dealii_sparse_matrix.global_entry(in_index)) > drop_tolerance)
+                {
+                  values[col] = dealii_sparse_matrix.global_entry(in_index);
+                  row_indices[col++] = in_cols[in_index];
+                }
+              ++index;
+              ++in_index;
+            }
+          set (row, col, reinterpret_cast<unsigned int *>(&row_indices[0]),
+               &values[0], false);
+        }
+
+    compress();
+  }
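+
+  // A minimal usage sketch (illustrative; `dsm' stands for an already
+  // assembled deal.II matrix): copy it into a Trilinos matrix while
+  // dropping entries of magnitude below 1e-13:
+  //
+  //   dealii::SparseMatrix<double> dsm;
+  //   // ... assemble dsm ...
+  //   TrilinosWrappers::SparseMatrix A;
+  //   A.reinit (dsm, /*drop_tolerance=*/1e-13);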
+
+
+
+  void
+  SparseMatrix::reinit (const Epetra_CrsMatrix &input_matrix,
+                        const bool              copy_values)
+  {
+    Assert (input_matrix.Filled()==true,
+            ExcMessage("FillComplete() has not been called on the input CrsMatrix!"));
+
+    column_space_map.reset (new Epetra_Map (input_matrix.DomainMap()));
+
+    const Epetra_CrsGraph *graph = &input_matrix.Graph();
+
+    temp_vector.clear ();
+    matrix.reset ();
+    matrix.reset (new Epetra_FECrsMatrix(Copy, *graph, false));
+
+    matrix->FillComplete (*column_space_map, input_matrix.RangeMap(), true);
+
+    if (copy_values == true)
+      {
+        // point to the first data entry in the two
+        // matrices and copy the content
+        const TrilinosScalar *in_values = input_matrix[0];
+        TrilinosScalar *values = (*matrix)[0];
+        const unsigned int my_nonzeros = input_matrix.NumMyNonzeros();
+        std::memcpy (&values[0], &in_values[0],
+                     my_nonzeros*sizeof (TrilinosScalar));
+      }
+
+    compress();
+  }
+
+
+
+  void
+  SparseMatrix::clear ()
+  {
+    // When we clear the matrix, reset
+    // the pointer and generate an
+    // empty matrix.
+    column_space_map.reset (new Epetra_Map (0, 0,
+                                            Utilities::Trilinos::comm_self()));
+    temp_vector.clear();
+    matrix.reset (new Epetra_FECrsMatrix(View, *column_space_map, 0));
+
+    matrix->FillComplete();
+
+    compressed = true;
+  }
+
+
+
+  void
+  SparseMatrix::clear_row (const unsigned int   row,
+                           const TrilinosScalar new_diag_value)
+  {
+    Assert (matrix->Filled()==true, ExcMatrixNotCompressed());
+
+    // Only do this on the rows owned
+    // locally on this processor.
+    int local_row = matrix->LRID(static_cast<int>(row));
+    if (local_row >= 0)
+      {
+        TrilinosScalar *values;
+        int *col_indices;
+        int num_entries;
+        const int ierr = matrix->ExtractMyRowView(local_row, num_entries,
+                                                  values, col_indices);
+
+        Assert (ierr == 0,
+                ExcTrilinosError(ierr));
+
+        int *diag_find = std::find(col_indices, col_indices+num_entries,
+                                   local_row);
+        int diag_index = (int)(diag_find - col_indices);
+
+        for (int j=0; j<num_entries; ++j)
+          if (diag_index != j || new_diag_value == 0)
+            values[j] = 0.;
+
+        // std::find returns the one-past-the-end pointer (never a null
+        // pointer) if the diagonal entry is not stored in this row, so
+        // compare against that before touching values[diag_index]
+        if (diag_find != col_indices+num_entries &&
+            std::fabs(values[diag_index]) == 0.0 &&
+            new_diag_value != 0.0)
+          values[diag_index] = new_diag_value;
+      }
+  }
+
+
+
+  void
+  SparseMatrix::clear_rows (const std::vector<unsigned int> &rows,
+                            const TrilinosScalar             new_diag_value)
+  {
+    compress();
+    for (unsigned int row=0; row<rows.size(); ++row)
+      clear_row(rows[row], new_diag_value);
+
+    // This function needs to be called
+    // on all processors. We change some
+    // data, so we need to flush the
+    // buffers to make sure that the
+    // right data is used.
+    compress();
+  }
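+
+  // A minimal usage sketch (illustrative; the list of constrained rows
+  // is an assumption): eliminate Dirichlet-constrained rows while
+  // keeping a unit diagonal so the matrix stays invertible:
+  //
+  //   std::vector<unsigned int> constrained_rows; // filled elsewhere
+  //   A.clear_rows (constrained_rows, /*new_diag_value=*/1.0);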
+
+
+
+  TrilinosScalar
+  SparseMatrix::operator() (const unsigned int i,
+                            const unsigned int j) const
+  {
+    // Extract local indices in
+    // the matrix.
+    int trilinos_i = matrix->LRID(static_cast<int>(i)), trilinos_j = matrix->LCID(static_cast<int>(j));
+    TrilinosScalar value = 0.;
+
+    // If the data is not on the
+    // present processor, we throw
+    // an exception. This is one of
+    // the two tiny differences to
+    // the el(i,j) call, which does
+    // not trigger any assertions.
+    if (trilinos_i == -1)
+      {
+        Assert (false, ExcAccessToNonLocalElement(i, j, local_range().first,
+                                                  local_range().second));
+      }
+    else
+      {
+        // Check whether the matrix has
+        // already been transformed to local
+        // indices.
+        Assert (matrix->Filled(), ExcMatrixNotCompressed());
+
+        // Prepare pointers for extraction
+        // of a view of the row.
+        int nnz_present = matrix->NumMyEntries(trilinos_i);
+        int nnz_extracted;
+        int *col_indices;
+        TrilinosScalar *values;
+
+        // Generate the view and make
+        // sure that we have not generated
+        // an error.
+        int ierr = matrix->ExtractMyRowView(trilinos_i, nnz_extracted,
+                                            values, col_indices);
+        Assert (ierr==0, ExcTrilinosError(ierr));
+
+        Assert (nnz_present == nnz_extracted,
+                ExcDimensionMismatch(nnz_present, nnz_extracted));
+
+        // Search the index where we
+        // look for the value, and then
+        // finally get it.
+
+        int *el_find = std::find(col_indices, col_indices + nnz_present,
+                                 trilinos_j);
+
+        int local_col_index = (int)(el_find - col_indices);
+
+        // This is actually the only
+        // difference to the el(i,j)
+        // function, which means that
+        // we throw an exception in
+        // this case instead of just
+        // returning zero for an
+        // element that is not present
+        // in the sparsity pattern.
+        if (local_col_index == nnz_present)
+          {
+            Assert (false, ExcInvalidIndex (i,j));
+          }
+        else
+          value = values[local_col_index];
+      }
+
+    return value;
+  }
+
+
+
+  TrilinosScalar
+  SparseMatrix::el (const unsigned int i,
+                    const unsigned int j) const
+  {
+    // Extract local indices in
+    // the matrix.
+    int trilinos_i = matrix->LRID(static_cast<int>(i)), trilinos_j = matrix->LCID(static_cast<int>(j));
+    TrilinosScalar value = 0.;
+
+    // If the data is not on the
+    // present processor, we can't
+    // continue. Just return zero
+    // as discussed in the
+    // documentation of this
+    // function. If you want error
+    // checking, use operator().
+    if ((trilinos_i == -1 ) || (trilinos_j == -1))
+      return 0.;
+    else
+      {
+        // Check whether the matrix
+        // already is transformed to
+        // local indices.
+        Assert (matrix->Filled(), ExcMatrixNotCompressed());
+
+        // Prepare pointers for extraction
+        // of a view of the row.
+        int nnz_present = matrix->NumMyEntries(trilinos_i);
+        int nnz_extracted;
+        int *col_indices;
+        TrilinosScalar *values;
+
+        // Generate the view and make
+        // sure that we have not generated
+        // an error.
+        int ierr = matrix->ExtractMyRowView(trilinos_i, nnz_extracted,
+                                            values, col_indices);
+        Assert (ierr==0, ExcTrilinosError(ierr));
+
+        Assert (nnz_present == nnz_extracted,
+                ExcDimensionMismatch(nnz_present, nnz_extracted));
+
+        // Search the index where we
+        // look for the value, and then
+        // finally get it.
+        int *el_find = std::find(col_indices, col_indices + nnz_present,
+                                 trilinos_j);
+
+        int local_col_index = (int)(el_find - col_indices);
+
+
+        // This is actually the only
+        // difference to the () function
+        // querying (i,j), where we throw an
+        // exception instead of just
+        // returning zero for an element
+        // that is not present in the
+        // sparsity pattern.
+        if (local_col_index == nnz_present)
+          value = 0;
+        else
+          value = values[local_col_index];
+      }
+
+    return value;
+  }
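+
+  // The practical difference between the two accessors (illustrative,
+  // for an index pair (i,j) that is locally owned but not part of the
+  // sparsity pattern):
+  //
+  //   A.el (i, j);  // returns 0.
+  //   A (i, j);     // fails an Assert (ExcInvalidIndex) in debug mode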
+
+
+
+  TrilinosScalar
+  SparseMatrix::diag_element (const unsigned int i) const
+  {
+    Assert (m() == n(), ExcNotQuadratic());
+
+#ifdef DEBUG
+    // use operator() in debug mode because
+    // it checks if this is a valid element
+    // (in parallel)
+    return operator()(i,i);
+#else
+    // Trilinos doesn't seem to have a
+    // more efficient way to access the
+    // diagonal than by just using the
+    // standard el(i,j) function.
+    return el(i,i);
+#endif
+  }
+
+
+
+  unsigned int
+  SparseMatrix::row_length (const unsigned int row) const
+  {
+    Assert (row < m(), ExcInternalError());
+
+    // get a representation of the
+    // present row
+    int ncols = -1;
+    int local_row = matrix->LRID(static_cast<int>(row));
+
+    // on the processor that owns this
+    // row, we'll have a non-negative
+    // value.
+    if (local_row >= 0)
+      {
+        int ierr = matrix->NumMyRowEntries (local_row, ncols);
+        AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+      }
+
+    return ncols;
+  }
+
+
+
+  namespace internals
+  {
+    void perform_mmult (const SparseMatrix &inputleft,
+                        const SparseMatrix &inputright,
+                        SparseMatrix       &result,
+                        const VectorBase   &V,
+                        const bool          transpose_left)
+    {
+      const bool use_vector = (V.size() == inputright.m() ? true : false);
+      if (transpose_left == false)
+        {
+          Assert (inputleft.n() == inputright.m(),
+                  ExcDimensionMismatch(inputleft.n(), inputright.m()));
+          Assert (inputleft.domain_partitioner().SameAs(inputright.range_partitioner()),
+                  ExcMessage ("Parallel partitioning of A and B does not fit."));
+        }
+      else
+        {
+          Assert (inputleft.m() == inputright.m(),
+                  ExcDimensionMismatch(inputleft.m(), inputright.m()));
+          Assert (inputleft.range_partitioner().SameAs(inputright.range_partitioner()),
+                  ExcMessage ("Parallel partitioning of A and B does not fit."));
+        }
+
+      result.clear();
+
+      // create a suitable operator B: in case
+      // we do not use a vector, all we need to
+      // do is to set the pointer. Otherwise,
+      // we insert the data from B, but
+      // multiply each row with the respective
+      // vector element.
+      Teuchos::RCP<Epetra_CrsMatrix> mod_B;
+      if (use_vector == false)
+        {
+          mod_B = Teuchos::rcp(const_cast<Epetra_CrsMatrix *>
+                               (&inputright.trilinos_matrix()),
+                               false);
+        }
+      else
+        {
+          mod_B = Teuchos::rcp(new Epetra_CrsMatrix
+                               (Copy, inputright.trilinos_sparsity_pattern()),
+                               true);
+          mod_B->FillComplete(inputright.domain_partitioner(),
+                              inputright.range_partitioner());
+          Assert (inputright.local_range() == V.local_range(),
+                  ExcMessage ("Parallel distribution of matrix B and vector V "
+                              "does not match."));
+
+          const int local_N = inputright.local_size();
+          for (int i=0; i<local_N; ++i)
+            {
+              int N_entries = -1;
+              double *new_data, *B_data;
+              mod_B->ExtractMyRowView (i, N_entries, new_data);
+              inputright.trilinos_matrix().ExtractMyRowView (i, N_entries, B_data);
+              double value = V.trilinos_vector()[0][i];
+              for (int j=0; j<N_entries; ++j)
+                new_data[j] = value * B_data[j];
+            }
+        }
+
+      // use ML's built-in method for performing
+      // the matrix-matrix product:
+      // create ML operators on top of the
+      // Epetra matrices and, if we use a
+      // transposed matrix, let ML know
+      ML_Comm *comm;
+      ML_Comm_Create(&comm);
+#ifdef ML_MPI
+      const Epetra_MpiComm *epcomm = dynamic_cast<const Epetra_MpiComm *>(&(inputleft.trilinos_matrix().Comm()));
+      // Get the MPI communicator, as it may not be MPI_COMM_WORLD, and update the ML comm object
+      if (epcomm) ML_Comm_Set_UsrComm(comm,epcomm->Comm());
+#endif
+      ML_Operator *A_ = ML_Operator_Create(comm);
+      ML_Operator *B_ = ML_Operator_Create(comm);
+      ML_Operator *C_ = ML_Operator_Create(comm);
+      SparseMatrix transposed_mat;
+
+      if (transpose_left == false)
+        ML_Operator_WrapEpetraCrsMatrix
+        (const_cast<Epetra_CrsMatrix *>(&inputleft.trilinos_matrix()),A_,
+         false);
+      else
+        {
+          // create transposed matrix
+          SparsityPattern sparsity_transposed (inputleft.domain_partitioner(),
+                                               inputleft.range_partitioner());
+          Assert (inputleft.domain_partitioner().LinearMap() == true,
+                  ExcMessage("Matrix must be partitioned contiguously between procs."));
+          for (unsigned int i=0; i<inputleft.local_size(); ++i)
+            {
+              int num_entries, * indices;
+              inputleft.trilinos_sparsity_pattern().ExtractMyRowView(i, num_entries,
+                                                                     indices);
+              Assert (num_entries >= 0, ExcInternalError());
+              const unsigned int GID = inputleft.row_partitioner().GID(i);
+              for (int j=0; j<num_entries; ++j)
+                sparsity_transposed.add (inputleft.col_partitioner().GID(indices[j]),
+                                         GID);
+            }
+
+          sparsity_transposed.compress();
+          transposed_mat.reinit (sparsity_transposed);
+          for (unsigned int i=0; i<inputleft.local_size(); ++i)
+            {
+              int num_entries, * indices;
+              double *values;
+              inputleft.trilinos_matrix().ExtractMyRowView(i, num_entries,
+                                                           values, indices);
+              Assert (num_entries >= 0, ExcInternalError());
+              const unsigned int GID = inputleft.row_partitioner().GID(i);
+              for (int j=0; j<num_entries; ++j)
+                transposed_mat.set (inputleft.col_partitioner().GID(indices[j]),
+                                    GID, values[j]);
+            }
+          transposed_mat.compress();
+          ML_Operator_WrapEpetraCrsMatrix
+          (const_cast<Epetra_CrsMatrix *>(&transposed_mat.trilinos_matrix()),
+           A_,false);
+        }
+      ML_Operator_WrapEpetraCrsMatrix(mod_B.get(),B_,false);
+
+      // We implement the multiplication by
+      // hand, closely following the triple
+      // matrix product code in
+      // ml/src/Operator/ml_rap.c.
+
+      // import data if necessary
+      ML_Operator *Btmp, *Ctmp, *Ctmp2, *tptr;
+      ML_CommInfoOP *getrow_comm;
+      int max_per_proc;
+      int N_input_vector = B_->invec_leng;
+      getrow_comm = B_->getrow->pre_comm;
+      if ( getrow_comm != NULL)
+        for (int i = 0; i < getrow_comm->N_neighbors; i++)
+          for (int j = 0; j < getrow_comm->neighbors[i].N_send; j++)
+            AssertThrow (getrow_comm->neighbors[i].send_list[j] < N_input_vector,
+                         ExcInternalError());
+
+      ML_create_unique_col_id(N_input_vector, &(B_->getrow->loc_glob_map),
+                              getrow_comm, &max_per_proc, B_->comm);
+      B_->getrow->use_loc_glob_map = ML_YES;
+      if (A_->getrow->pre_comm != NULL)
+        ML_exchange_rows( B_, &Btmp, A_->getrow->pre_comm);
+      else Btmp = B_;
+
+      // perform matrix-matrix product
+      ML_matmat_mult(A_, Btmp , &Ctmp);
+
+      // release temporary structures we needed
+      // for multiplication
+      ML_free(B_->getrow->loc_glob_map);
+      B_->getrow->loc_glob_map = NULL;
+      B_->getrow->use_loc_glob_map = ML_NO;
+      if (A_->getrow->pre_comm != NULL)
+        {
+          tptr = Btmp;
+          while ( (tptr!= NULL) && (tptr->sub_matrix != B_))
+            tptr = tptr->sub_matrix;
+          if (tptr != NULL) tptr->sub_matrix = NULL;
+          ML_RECUR_CSR_MSRdata_Destroy(Btmp);
+          ML_Operator_Destroy(&Btmp);
+        }
+
+      // make correct data structures
+      if (A_->getrow->post_comm != NULL)
+        ML_exchange_rows(Ctmp, &Ctmp2, A_->getrow->post_comm);
+      else
+        Ctmp2 = Ctmp;
+
+      ML_back_to_csrlocal(Ctmp2, C_, max_per_proc);
+
+      ML_RECUR_CSR_MSRdata_Destroy (Ctmp);
+      ML_Operator_Destroy (&Ctmp);
+
+      if (A_->getrow->post_comm != NULL)
+        {
+          ML_RECUR_CSR_MSRdata_Destroy(Ctmp2);
+          ML_Operator_Destroy (&Ctmp2);
+        }
+
+      // create an Epetra matrix from the ML
+      // matrix that we got as a result.
+      Epetra_CrsMatrix *C_mat;
+      ML_Operator2EpetraCrsMatrix(C_, C_mat);
+      C_mat->FillComplete();
+      C_mat->OptimizeStorage();
+      result.reinit (*C_mat);
+
+      // destroy allocated memory
+      delete C_mat;
+      ML_Operator_Destroy (&A_);
+      ML_Operator_Destroy (&B_);
+      ML_Operator_Destroy (&C_);
+      ML_Comm_Destroy (&comm);
+    }
+  }
+
+
+  void
+  SparseMatrix::mmult (SparseMatrix       &C,
+                       const SparseMatrix &B,
+                       const VectorBase   &V) const
+  {
+    internals::perform_mmult (*this, B, C, V, false);
+  }
+
+
+
+  void
+  SparseMatrix::Tmmult (SparseMatrix       &C,
+                        const SparseMatrix &B,
+                        const VectorBase   &V) const
+  {
+    internals::perform_mmult (*this, B, C, V, true);
+  }
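+
+  // A minimal usage sketch (illustrative; A, B are assembled matrices
+  // of compatible sizes and V a vector matching B's row distribution):
+  //
+  //   TrilinosWrappers::SparseMatrix C;
+  //   A.mmult  (C, B, V);  // C = A   * diag(V) * B
+  //   A.Tmmult (C, B, V);  // C = A^T * diag(V) * B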
+
+
+
+  void
+  SparseMatrix::add (const TrilinosScalar  factor,
+                     const SparseMatrix   &rhs)
+  {
+    Assert (rhs.m() == m(), ExcDimensionMismatch (rhs.m(), m()));
+    Assert (rhs.n() == n(), ExcDimensionMismatch (rhs.n(), n()));
+
+    const std::pair<unsigned int, unsigned int>
+    local_range = rhs.local_range();
+
+    int ierr;
+
+    // If both matrices have been transformed
+    // to local index space (in Trilinos
+    // speak: they are filled) and are based
+    // on the same row distribution with the
+    // same number of nonzeros (actually, we'd
+    // need identical sparsity patterns, but
+    // that is too expensive to check), then
+    // we can extract views of the column
+    // data on both matrices and simply
+    // manipulate the values that are
+    // addressed by the pointers.
+    if (matrix->Filled() == true &&
+        rhs.matrix->Filled() == true &&
+        this->local_range() == local_range &&
+        matrix->NumMyNonzeros() == rhs.matrix->NumMyNonzeros())
+      for (unsigned int row=local_range.first;
+           row < local_range.second; ++row)
+        {
+          Assert (matrix->NumGlobalEntries(row) ==
+                  rhs.matrix->NumGlobalEntries(row),
+                  ExcDimensionMismatch(matrix->NumGlobalEntries(row),
+                                       rhs.matrix->NumGlobalEntries(row)));
+
+          const int row_local = matrix->RowMap().LID(static_cast<int>(row));
+          int n_entries, rhs_n_entries;
+          TrilinosScalar *value_ptr, *rhs_value_ptr;
+
+          // In debug mode, we want to check
+          // whether the indices really are the
+          // same in the calling matrix and the
+          // input matrix. The reason for doing
+          // this only in debug mode is that both
+          // extracting indices and comparing
+          // indices is relatively slow compared to
+          // just working with the values.
+#ifdef DEBUG
+          int *index_ptr, *rhs_index_ptr;
+          ierr = rhs.matrix->ExtractMyRowView (row_local, rhs_n_entries,
+                                               rhs_value_ptr, rhs_index_ptr);
+          Assert (ierr == 0, ExcTrilinosError(ierr));
+
+          ierr = matrix->ExtractMyRowView (row_local, n_entries, value_ptr,
+                                           index_ptr);
+          Assert (ierr == 0, ExcTrilinosError(ierr));
+#else
+          rhs.matrix->ExtractMyRowView (row_local, rhs_n_entries, rhs_value_ptr);
+          matrix->ExtractMyRowView (row_local, n_entries, value_ptr);
+#endif
+
+          AssertThrow (n_entries == rhs_n_entries,
+                       ExcDimensionMismatch (n_entries, rhs_n_entries));
+
+          for (int i=0; i<n_entries; ++i)
+            {
+              *value_ptr++ += *rhs_value_ptr++ * factor;
+#ifdef DEBUG
+              Assert (*index_ptr++ == *rhs_index_ptr++,
+                      ExcInternalError());
+#endif
+            }
+        }
+    // If we have different sparsity patterns
+    // (expressed by a different number of
+    // nonzero elements), we have to be more
+    // careful and extract a copy of the row
+    // data, multiply it by the factor and
+    // then add it to the matrix using the
+    // respective add() function.
+    else
+      {
+        unsigned int max_row_length = 0;
+        for (unsigned int row=local_range.first;
+             row < local_range.second; ++row)
+          max_row_length
+            = std::max (max_row_length,
+                        static_cast<unsigned int>(rhs.matrix->NumGlobalEntries(row)));
+
+        std::vector<int>            column_indices (max_row_length);
+        std::vector<TrilinosScalar> values (max_row_length);
+
+        if (matrix->Filled() == true && rhs.matrix->Filled() == true &&
+            this->local_range() == local_range)
+          for (unsigned int row=local_range.first;
+               row < local_range.second; ++row)
+            {
+              const int row_local = matrix->RowMap().LID(static_cast<int>(row));
+              int n_entries;
+
+              ierr = rhs.matrix->ExtractMyRowCopy (row_local, max_row_length,
+                                                   n_entries,
+                                                   &values[0],
+                                                   &column_indices[0]);
+              Assert (ierr == 0, ExcTrilinosError(ierr));
+
+              for (int i=0; i<n_entries; ++i)
+                values[i] *= factor;
+
+              TrilinosScalar *value_ptr = &values[0];
+
+              ierr = matrix->SumIntoMyValues (row_local, n_entries, value_ptr,
+                                              &column_indices[0]);
+              Assert (ierr == 0, ExcTrilinosError(ierr));
+            }
+        else
+          {
+            for (unsigned int row=local_range.first;
+                 row < local_range.second; ++row)
+              {
+                int n_entries;
+                ierr = rhs.matrix->Epetra_CrsMatrix::ExtractGlobalRowCopy
+                       ((int)row, max_row_length, n_entries, &values[0], &column_indices[0]);
+                Assert (ierr == 0, ExcTrilinosError(ierr));
+
+                for (int i=0; i<n_entries; ++i)
+                  values[i] *= factor;
+
+                ierr = matrix->Epetra_CrsMatrix::SumIntoGlobalValues
+                       ((int)row, n_entries, &values[0], &column_indices[0]);
+                Assert (ierr == 0, ExcTrilinosError(ierr));
+              }
+            compress ();
+
+          }
+      }
+  }
+
+
+
+  void
+  SparseMatrix::transpose ()
+  {
+    // This only flips a flag that tells
+    // Trilinos that any vmult operation
+    // should be done with the
+    // transpose. However, the matrix
+    // structure is not reset.
+    int ierr;
+
+    if (!matrix->UseTranspose())
+      {
+        ierr = matrix->SetUseTranspose (true);
+        AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+      }
+    else
+      {
+        ierr = matrix->SetUseTranspose (false);
+        AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+      }
+  }
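+
+  // Note the consequence (illustrative): after
+  //
+  //   A.transpose ();
+  //   A.vmult (dst, src);  // now computes dst = A^T src
+  //
+  // a second call to transpose() restores the original behavior.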
+
+  /**
+   * shuqiangwang: remove superfluous zeros before the 'e' in the
+   * scientific notation of a floating point value for output.
+   */
+  static void simplify_scientific_string(int len, const char inStr[20], char outStr[20])
+  {
+    int i, j;
+    for (i=0; i<len; i++)
+      outStr[i] = inStr[i];
+
+    // look for 'e' or 'E' from the end
+    for (i=len-1; i>=0; i--)
+      if (outStr[i]=='e' || outStr[i]=='E')
+        {
+          i--;
+          break;
+        }
+
+    // replace every zero directly preceding the exponent with \0
+    for (; i>=0; i--)
+      {
+        if (outStr[i]=='0')
+          outStr[i] = '\0';
+        else
+          break;
+      }
+
+    // now compact the string, skipping the \0 markers in the mantissa
+    bool bPassedE = false;
+    j = 0;
+    for (i=0; i<len; i++)
+      {
+        if (outStr[i]=='e' || outStr[i]=='E')
+          bPassedE = true;
+        if (bPassedE)
+          {
+            outStr[j] = outStr[i];
+            j++;
+          }
+        else if (outStr[i] != '\0')
+          {
+            outStr[j] = outStr[i];
+            j++;
+          }
+      }
+    outStr[j] = '\0';
+  }
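+
+  // For example (hypothetical inputs), "2.50000e-01" becomes "2.5e-01"
+  // and "1.23400e+05" becomes "1.234e+05", matching the style in which
+  // PETSc prints such values.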
+
+  /**
+   * shuqiangwang: print the matrix in the same style as PETSc does.
+   */
+  void
+  SparseMatrix::write_ascii () const
+  {
+    double *values;
+    int *indices;
+    int num_entries;
+
+    const int len = 20;
+    char inStr[len], outStr[len];
+
+    for (int i=0; i<matrix->NumMyRows(); ++i)
+      {
+        std::cout << "row " << matrix->GRID(i) << ":";
+        matrix->ExtractMyRowView (i, num_entries, values, indices);
+        for (int j=0; j<num_entries; ++j)
+          {
+            std::cout << " (" << matrix->GCID(indices[j]);
+            snprintf(inStr, len, "%.5e", values[j]);
+            simplify_scientific_string(len, inStr, outStr);
+            std::cout << ", " << outStr;
+            std::cout << ") ";
+          }
+        std::cout << std::endl;
+      }
+  }
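+
+  // Illustrative output for a locally stored row with entries at global
+  // columns 0 and 3 (values hypothetical):
+  //
+  //   row 0: (0, 2.5e-01) (3, -1.2e+00)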
+
+
+  // As of now, no particularly neat
+  // output is generated in case of
+  // multiple processors.
+  void
+  SparseMatrix::print (std::ostream &out,
+                       const bool    print_detailed_trilinos_information) const
+  {
+    if (print_detailed_trilinos_information == true)
+      out << *matrix;
+    else
+      {
+        double *values;
+        int *indices;
+        int num_entries;
+
+        for (int i=0; i<matrix->NumMyRows(); ++i)
+          {
+            matrix->ExtractMyRowView (i, num_entries, values, indices);
+            for (int j=0; j<num_entries; ++j)
+              out << "(" << matrix->GRID(i) << "," << matrix->GCID(indices[j]) << ") "
+                  << values[j] << std::endl;
+          }
+      }
+
+    AssertThrow (out, ExcIO());
+  }
+
+
+
+  std::size_t
+  SparseMatrix::memory_consumption () const
+  {
+    unsigned int static_memory = sizeof(*this) + sizeof (*matrix)
+                                 + sizeof(*matrix->Graph().DataPtr());
+    return ((sizeof(TrilinosScalar)+sizeof(int))*matrix->NumMyNonzeros() +
+            sizeof(int)*local_size() +
+            static_memory);
+  }
+
+
+
+
+  // explicit instantiations
+  //
+  template void
+  SparseMatrix::reinit (const dealii::SparsityPattern &);
+  template void
+  SparseMatrix::reinit (const CompressedSparsityPattern &);
+  template void
+  SparseMatrix::reinit (const CompressedSetSparsityPattern &);
+  template void
+  SparseMatrix::reinit (const CompressedSimpleSparsityPattern &);
+
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const dealii::SparsityPattern &,
+                        const bool);
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const CompressedSparsityPattern &,
+                        const bool);
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const CompressedSetSparsityPattern &,
+                        const bool);
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const CompressedSimpleSparsityPattern &,
+                        const bool);
+
+
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const Epetra_Map &,
+                        const dealii::SparsityPattern &,
+                        const bool);
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const Epetra_Map &,
+                        const CompressedSparsityPattern &,
+                        const bool);
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const Epetra_Map &,
+                        const CompressedSimpleSparsityPattern &,
+                        const bool);
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const Epetra_Map &,
+                        const CompressedSetSparsityPattern &,
+                        const bool);
+
+  template void
+  SparseMatrix::reinit (const dealii::SparseMatrix<float> &,
+                        const double,
+                        const bool,
+                        const dealii::SparsityPattern *);
+  template void
+  SparseMatrix::reinit (const dealii::SparseMatrix<double> &,
+                        const double,
+                        const bool,
+                        const dealii::SparsityPattern *);
+  template void
+  SparseMatrix::reinit (const dealii::SparseMatrix<long double> &,
+                        const double,
+                        const bool,
+                        const dealii::SparsityPattern *);
+
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const dealii::SparseMatrix<float> &,
+                        const double,
+                        const bool,
+                        const dealii::SparsityPattern *);
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const dealii::SparseMatrix<double> &,
+                        const double,
+                        const bool,
+                        const dealii::SparsityPattern *);
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const dealii::SparseMatrix<long double> &,
+                        const double,
+                        const bool,
+                        const dealii::SparsityPattern *);
+
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const Epetra_Map &,
+                        const dealii::SparseMatrix<float> &,
+                        const double,
+                        const bool,
+                        const dealii::SparsityPattern *);
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const Epetra_Map &,
+                        const dealii::SparseMatrix<double> &,
+                        const double,
+                        const bool,
+                        const dealii::SparsityPattern *);
+  template void
+  SparseMatrix::reinit (const Epetra_Map &,
+                        const Epetra_Map &,
+                        const dealii::SparseMatrix<long double> &,
+                        const double,
+                        const bool,
+                        const dealii::SparsityPattern *);
+
+
+}
+
+DEAL_II_NAMESPACE_CLOSE
+
+#endif // DEAL_II_USE_TRILINOS

Added: branches/s-wang2/for_deal.II/source/lac/trilinos_vector_base.cc
===================================================================
--- branches/s-wang2/for_deal.II/source/lac/trilinos_vector_base.cc	                        (rev 0)
+++ branches/s-wang2/for_deal.II/source/lac/trilinos_vector_base.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,460 @@
+//---------------------------------------------------------------------------
+//    $Id: trilinos_vector_base.cc 27628 2012-11-20 22:49:26Z heister $
+//    Version: $Name$
+//
+//    Copyright (C) 2008, 2010, 2011, 2012 by the deal.II authors
+//
+//    This file is subject to QPL and may not be  distributed
+//    without copyright and license information. Please refer
+//    to the file deal.II/doc/license.html for the  text  and
+//    further information on this license.
+//
+//---------------------------------------------------------------------------
+
+
+#include <deal.II/base/memory_consumption.h>
+#include <deal.II/lac/trilinos_vector_base.h>
+
+#ifdef DEAL_II_USE_TRILINOS
+
+#  include <cmath>
+#  include <Epetra_Import.h>
+
+
+DEAL_II_NAMESPACE_OPEN
+
+namespace TrilinosWrappers
+{
+  namespace internal
+  {
+    VectorReference::operator TrilinosScalar () const
+    {
+      Assert (index < vector.size(),
+              ExcIndexRange (index, 0, vector.size()));
+
+      // Trilinos allows for vectors
+      // to be referenced by the [] or
+      // () operators but only ()
+      // checks index bounds. We check
+      // these bounds by ourselves, so
+      // we can use []. Note that we
+      // can only get local values.
+
+      const int local_index = vector.vector->Map().LID(static_cast<int>(index));
+      Assert (local_index >= 0,
+              ExcAccessToNonLocalElement (index,
+                                          vector.vector->Map().MinMyGID(),
+                                          vector.vector->Map().MaxMyGID()));
+
+
+      return (*(vector.vector))[0][local_index];
+    }
+  }
+
+
+
+  VectorBase::VectorBase ()
+    :
+    last_action (Zero),
+    compressed  (true),
+    has_ghosts  (false),
+#ifdef DEAL_II_COMPILER_SUPPORTS_MPI
+    vector(new Epetra_FEVector(
+             Epetra_Map(0,0,Epetra_MpiComm(MPI_COMM_SELF))))
+#else
+    vector(new Epetra_FEVector(
+             Epetra_Map(0,0,Epetra_SerialComm())))
+#endif
+  {}
+
+
+
+  VectorBase::VectorBase (const VectorBase &v)
+    :
+    Subscriptor(),
+    last_action (Zero),
+    compressed (true),
+    has_ghosts  (v.has_ghosts),
+    vector(new Epetra_FEVector(*v.vector))
+  {}
+
+
+
+  VectorBase::~VectorBase ()
+  {}
+
+
+
+  void
+  VectorBase::clear ()
+  {
+    // When we clear the vector,
+    // reset the pointer and generate
+    // an empty vector.
+#ifdef DEAL_II_COMPILER_SUPPORTS_MPI
+    Epetra_Map map (0, 0, Epetra_MpiComm(MPI_COMM_SELF));
+#else
+    Epetra_Map map (0, 0, Epetra_SerialComm());
+#endif
+
+    has_ghosts = false;
+    vector.reset (new Epetra_FEVector(map));
+    last_action = Zero;
+  }
+
+
+
+  VectorBase &
+  VectorBase::operator = (const VectorBase &v)
+  {
+    Assert (vector.get() != 0,
+            ExcMessage("Vector is not constructed properly."));
+
+    if (local_range() != v.local_range())
+      {
+        last_action = Zero;
+        vector.reset (new Epetra_FEVector(*v.vector));
+        has_ghosts = v.has_ghosts;
+      }
+    else
+      {
+        Assert (vector->Map().SameAs(v.vector->Map()) == true,
+                ExcMessage ("The Epetra maps in the assignment operator ="
+                            " do not match, even though the local_range "
+                            " seems to be the same. Check vector setup!"));
+        int ierr;
+        ierr = vector->GlobalAssemble(last_action);
+        AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+        ierr = vector->Update(1.0, *v.vector, 0.0);
+        AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+        last_action = Zero;
+      }
+
+    return *this;
+  }
+
+
+
+  template <typename number>
+  VectorBase &
+  VectorBase::operator = (const ::dealii::Vector<number> &v)
+  {
+    Assert (size() == v.size(),
+            ExcDimensionMismatch(size(), v.size()));
+
+    // this is probably not very efficient,
+    // but it works. in particular, we could
+    // do better if we knew that
+    // number==TrilinosScalar, because then we
+    // could elide the copying of elements
+    //
+    // let's hope this isn't a
+    // particularly frequent operation
+    std::pair<unsigned int, unsigned int>
+    local_range = this->local_range ();
+    for (unsigned int i=local_range.first; i<local_range.second; ++i)
+      (*vector)[0][i-local_range.first] = v(i);
+
+    return *this;
+  }
+
+
+
+  TrilinosScalar
+  VectorBase::el (const unsigned int index) const
+  {
+    // Extract local indices in
+    // the vector.
+    int trilinos_i = vector->Map().LID(static_cast<int>(index));
+    TrilinosScalar value = 0.;
+
+    // If the element is not
+    // present on the current
+    // processor, we can't
+    // continue. Just return 0.
+
+    // TODO: Is this reasonable?
+    if (trilinos_i == -1 )
+      {
+        return 0.;
+        //Assert (false, ExcAccessToNonlocalElement(index, local_range().first,
+        //                                local_range().second-1));
+      }
+    else
+      value = (*vector)[0][trilinos_i];
+
+    return value;
+  }
+
+
+
+  TrilinosScalar
+  VectorBase::operator () (const unsigned int index) const
+  {
+    // Extract local indices in
+    // the vector.
+    int trilinos_i = vector->Map().LID(static_cast<int>(index));
+    TrilinosScalar value = 0.;
+
+    // If the element is not present
+    // on the current processor, we
+    // can't continue. This is the
+    // main difference to the el()
+    // function.
+    if (trilinos_i == -1 )
+      {
+        Assert (false, ExcAccessToNonlocalElement(index, local_range().first,
+                                                  local_range().second-1));
+      }
+    else
+      value = (*vector)[0][trilinos_i];
+
+    return value;
+  }
+
+
+
+  void
+  VectorBase::add (const VectorBase &v,
+                   const bool        allow_different_maps)
+  {
+    if (allow_different_maps == false)
+      *this += v;
+    else
+      {
+        AssertThrow (size() == v.size(),
+                     ExcDimensionMismatch (size(), v.size()));
+
+        Epetra_Import data_exchange (vector->Map(), v.vector->Map());
+
+        int ierr = vector->Import(*v.vector, data_exchange, Add);
+        AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+        last_action = Insert;
+      }
+  }
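+
+  // A minimal usage sketch (illustrative; v and w have equal global
+  // size but possibly different parallel distributions):
+  //
+  //   v.add (w, /*allow_different_maps=*/true);  // v += w via Epetra_Import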
+
+
+
+  bool
+  VectorBase::operator == (const VectorBase &v) const
+  {
+    Assert (size() == v.size(),
+            ExcDimensionMismatch(size(), v.size()));
+    if (local_size() != v.local_size())
+      return false;
+
+    unsigned int i;
+    for (i=0; i<local_size(); i++)
+      if ((*(v.vector))[0][i]!=(*vector)[0][i]) return false;
+
+    return true;
+  }
+
+
+
+  bool
+  VectorBase::operator != (const VectorBase &v) const
+  {
+    Assert (size() == v.size(),
+            ExcDimensionMismatch(size(), v.size()));
+
+    return (!(*this==v));
+  }
+
+  // shuqiangwang
+  VectorBase::real_type VectorBase::min () const
+  {
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    TrilinosScalar d;
+    const int ierr = vector->MinValue (&d);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return d;
+  }
+
+  VectorBase::real_type VectorBase::max () const
+  {
+    Assert (!has_ghost_elements(), ExcGhostsPresent());
+
+    TrilinosScalar d;
+    const int ierr = vector->MaxValue (&d);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    return d;
+  }
+
+
+  bool
+  VectorBase::all_zero () const
+  {
+    // get a representation of the vector and
+    // loop over all the elements
+    TrilinosScalar *start_ptr = (*vector)[0];
+    const TrilinosScalar *ptr  = start_ptr,
+                          *eptr = start_ptr + local_size();
+    unsigned int flag = 0;
+    while (ptr != eptr)
+      {
+        if (*ptr != 0)
+          {
+            flag = 1;
+            break;
+          }
+        ++ptr;
+      }
+
+#ifdef DEAL_II_COMPILER_SUPPORTS_MPI
+    // in parallel, check that the vector
+    // is zero on _all_ processors.
+    const Epetra_MpiComm *mpi_comm
+      = dynamic_cast<const Epetra_MpiComm *>(&vector->Map().Comm());
+    unsigned int num_nonzero = Utilities::MPI::sum(flag, mpi_comm->Comm());
+    return num_nonzero == 0;
+#else
+    return flag == 0;
+#endif
+
+  }
+
+
+
+  bool
+  VectorBase::is_non_negative () const
+  {
+#ifdef DEAL_II_COMPILER_SUPPORTS_MPI
+    // if this vector is a parallel one, then
+    // we need to communicate to determine
+    // the answer to the current
+    // function. this still has to be
+    // implemented
+    AssertThrow(local_size() == size(), ExcNotImplemented());
+#endif
+    // get a representation of the vector and
+    // loop over all the elements
+    TrilinosScalar *start_ptr;
+    int leading_dimension;
+    int ierr = vector->ExtractView (&start_ptr, &leading_dimension);
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+
+    // TODO: This
+    // won't work in parallel like
+    // this. Find a better way to do
+    // this in that case.
+    const TrilinosScalar *ptr  = start_ptr,
+                          *eptr = start_ptr + size();
+    bool flag = true;
+    while (ptr != eptr)
+      {
+        if (*ptr < 0.0)
+          {
+            flag = false;
+            break;
+          }
+        ++ptr;
+      }
+
+    return flag;
+  }
+
+
+
+  // TODO: up to now only local
+  // data printed out! Find a
+  // way to neatly output
+  // distributed data...
+  void
+  VectorBase::print (const char *format) const
+  {
+    Assert (vector->GlobalLength()!=0, ExcEmptyObject());
+
+    for (unsigned int j=0; j<size(); ++j)
+      {
+        double t = (*vector)[0][j];
+
+        if (format != 0)
+          std::printf (format, t);
+        else
+          std::printf (" %5.2f", double(t));
+      }
+    std::printf ("\n");
+  }
+
+
+
+  void
+  VectorBase::print (std::ostream      &out,
+                     const unsigned int precision,
+                     const bool         scientific,
+                     const bool         across) const
+  {
+    AssertThrow (out, ExcIO());
+
+    // get a representation of the
+    // vector and loop over all
+    // the elements TODO: up to
+    // now only local data printed
+    // out! Find a way to neatly
+    // output distributed data...
+    TrilinosScalar *val;
+    int leading_dimension;
+    int ierr = vector->ExtractView (&val, &leading_dimension);
+
+    AssertThrow (ierr == 0, ExcTrilinosError(ierr));
+    out.precision (precision);
+    if (scientific)
+      out.setf (std::ios::scientific, std::ios::floatfield);
+    else
+      out.setf (std::ios::fixed, std::ios::floatfield);
+
+    if (across)
+      for (unsigned int i=0; i<size(); ++i)
+        out << static_cast<double>(val[i]) << ' ';
+    else
+      for (unsigned int i=0; i<size(); ++i)
+        out << static_cast<double>(val[i]) << std::endl;
+    out << std::endl;
+
+    // restore the representation
+    // of the vector
+    AssertThrow (out, ExcIO());
+  }
+
+
+
+  void
+  VectorBase::swap (VectorBase &v)
+  {
+    std::swap(last_action, v.last_action);
+    std::swap(compressed, v.compressed);
+    std::swap(vector, v.vector);
+  }
+
+
+
+  std::size_t
+  VectorBase::memory_consumption () const
+  {
+    //TODO[TH]: No accurate memory
+    //consumption for Trilinos vectors
+    //yet. This is a rough approximation with
+    //one index and the value per local
+    //entry.
+    return sizeof(*this)
+           + this->local_size()*( sizeof(double)+sizeof(int) );
+  }
+
+} /* end of namespace TrilinosWrappers */
+
+
+namespace TrilinosWrappers
+{
+#include "trilinos_vector_base.inst"
+}
+
+DEAL_II_NAMESPACE_CLOSE
+
+#endif // DEAL_II_USE_TRILINOS

Added: branches/s-wang2/for_deal.II/source/numerics/derivative_approximation.inst.in
===================================================================
--- branches/s-wang2/for_deal.II/source/numerics/derivative_approximation.inst.in	                        (rev 0)
+++ branches/s-wang2/for_deal.II/source/numerics/derivative_approximation.inst.in	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,181 @@
+//---------------------------------------------------------------------------
+//    $Id: derivative_approximation.inst.in 25612 2012-06-07 16:46:33Z heister $
+//    Version: $Name$
+//
+//    Copyright (C) 2010, 2012 by the deal.II authors
+//
+//    This file is subject to QPL and may not be  distributed
+//    without copyright and license information. Please refer
+//    to the file deal.II/doc/license.html for the  text  and
+//    further information on this license.
+//
+//---------------------------------------------------------------------------
+
+for (deal_II_dimension : DIMENSIONS)
+{
+#define INSTANTIATE(InputVector,DH)                      \
+template                                                 \
+void                                                     \
+DerivativeApproximation::                                \
+approximate_gradient<deal_II_dimension>                  \
+(const Mapping<deal_II_dimension> &mapping,              \
+ const DH<deal_II_dimension> &dof_handler,               \
+ const InputVector  &solution,                           \
+ Vector<float>         &derivative_norm,                 \
+ const unsigned int     component);                      \
+                                                         \
+template                                                 \
+void                                                     \
+DerivativeApproximation::                                \
+approximate_gradient<deal_II_dimension>                  \
+(const DH<deal_II_dimension> &dof_handler,               \
+ const InputVector     &solution,                        \
+ Vector<float>         &derivative_norm,                 \
+ const unsigned int     component);                      \
+                                                         \
+template                                                 \
+void                                                     \
+DerivativeApproximation::                                \
+approximate_second_derivative<deal_II_dimension>         \
+(const Mapping<deal_II_dimension> &mapping,              \
+ const DH<deal_II_dimension> &dof_handler,               \
+ const InputVector  &solution,                           \
+ Vector<float>         &derivative_norm,                 \
+ const unsigned int     component);                      \
+                                                         \
+template                                                 \
+void                                                     \
+DerivativeApproximation::                                \
+approximate_second_derivative<deal_II_dimension>         \
+(const DH<deal_II_dimension> &dof_handler,               \
+ const InputVector     &solution,                        \
+ Vector<float>         &derivative_norm,                 \
+ const unsigned int     component);                      \
+                                                         \
+template                                                 \
+void                                                     \
+DerivativeApproximation::                                \
+approximate_derivative_tensor<deal_II_dimension>         \
+(const Mapping<deal_II_dimension> &mapping,              \
+ const DH<deal_II_dimension> &dof_handler,               \
+ const InputVector     &solution,                        \
+ const DH<deal_II_dimension>::active_cell_iterator &cell,\
+ Tensor<1,deal_II_dimension> &derivative,                \
+ const unsigned int     component);                      \
+                                                         \
+template                                                 \
+void                                                     \
+DerivativeApproximation::                                \
+approximate_derivative_tensor<deal_II_dimension>         \
+(const Mapping<deal_II_dimension> &mapping,              \
+ const DH<deal_II_dimension> &dof_handler,               \
+ const InputVector     &solution,                        \
+ const DH<deal_II_dimension>::active_cell_iterator &cell,\
+ Tensor<2,deal_II_dimension> &derivative,                \
+ const unsigned int     component);                      \
+                                                         \
+template                                                 \
+void                                                     \
+DerivativeApproximation::                                \
+approximate_derivative_tensor<deal_II_dimension>         \
+(const Mapping<deal_II_dimension> &mapping,              \
+ const DH<deal_II_dimension> &dof_handler,               \
+ const InputVector     &solution,                        \
+ const DH<deal_II_dimension>::active_cell_iterator &cell,\
+ Tensor<3,deal_II_dimension> &derivative,                \
+ const unsigned int     component);                      \
+                                                         \
+template                                                 \
+void                                                     \
+DerivativeApproximation::                                \
+approximate_derivative_tensor<deal_II_dimension>         \
+(const DH<deal_II_dimension> &dof_handler,               \
+ const InputVector     &solution,                        \
+ const DH<deal_II_dimension>::active_cell_iterator &cell,\
+ Tensor<1,deal_II_dimension> &derivative,                \
+ const unsigned int     component);                      \
+                                                         \
+template                                                 \
+void                                                     \
+DerivativeApproximation::                                \
+approximate_derivative_tensor<deal_II_dimension>         \
+(const DH<deal_II_dimension> &dof_handler,               \
+ const InputVector     &solution,                        \
+ const DH<deal_II_dimension>::active_cell_iterator &cell,\
+ Tensor<2,deal_II_dimension> &derivative,                \
+ const unsigned int     component);                      \
+                                                         \
+template                                                 \
+void                                                     \
+DerivativeApproximation::                                \
+approximate_derivative_tensor<deal_II_dimension>         \
+(const DH<deal_II_dimension> &dof_handler,               \
+ const InputVector     &solution,                        \
+ const DH<deal_II_dimension>::active_cell_iterator &cell,\
+ Tensor<3,deal_II_dimension> &derivative,                \
+ const unsigned int     component)
+
+
+INSTANTIATE(Vector<double>, DoFHandler);
+INSTANTIATE(Vector<float>, DoFHandler);
+INSTANTIATE(BlockVector<double>, DoFHandler);
+INSTANTIATE(BlockVector<float>, DoFHandler);
+
+INSTANTIATE(Vector<double>, hp::DoFHandler);
+INSTANTIATE(Vector<float>, hp::DoFHandler);
+INSTANTIATE(BlockVector<double>, hp::DoFHandler);
+INSTANTIATE(BlockVector<float>, hp::DoFHandler);
+
+#ifdef DEAL_II_USE_PETSC
+INSTANTIATE(PETScWrappers::Vector, DoFHandler);
+INSTANTIATE(PETScWrappers::BlockVector, DoFHandler);
+INSTANTIATE(PETScWrappers::MPI::Vector, DoFHandler);			// shuqiangwang
+INSTANTIATE(PETScWrappers::MPI::BlockVector, DoFHandler);
+
+INSTANTIATE(PETScWrappers::Vector, hp::DoFHandler);
+INSTANTIATE(PETScWrappers::BlockVector, hp::DoFHandler);
+#endif
+
+#ifdef DEAL_II_USE_TRILINOS
+INSTANTIATE(TrilinosWrappers::Vector, DoFHandler);
+INSTANTIATE(TrilinosWrappers::BlockVector, DoFHandler);
+INSTANTIATE(TrilinosWrappers::MPI::Vector, DoFHandler);
+INSTANTIATE(TrilinosWrappers::MPI::BlockVector, DoFHandler);
+
+//TODO: test hp before instantiating
+#endif
+
+#undef INSTANTIATE
+
+template
+double
+DerivativeApproximation::
+derivative_norm(const Tensor<1,deal_II_dimension> &derivative);
+
+template
+double
+DerivativeApproximation::
+derivative_norm(const Tensor<2,deal_II_dimension> &derivative);
+
+template
+double
+DerivativeApproximation::
+derivative_norm(const Tensor<3,deal_II_dimension> &derivative);
+
+
+// static variables
+//
+// On AIX, the linker is unhappy about some missing symbols. They
+// should really be there, but explicitly instantiating them will also
+// not hurt.
+template
+const UpdateFlags
+DerivativeApproximation::Gradient<deal_II_dimension>::update_flags;
+
+template
+const UpdateFlags
+DerivativeApproximation::SecondDerivative<deal_II_dimension>::update_flags;
+template
+const UpdateFlags
+DerivativeApproximation::ThirdDerivative<deal_II_dimension>::update_flags;
+}
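
For reference, each INSTANTIATE(InputVector, DH) call above expands into the
full set of explicit instantiations for that vector/handler pair. A minimal
sketch of what the first declaration in the macro becomes for
INSTANTIATE(Vector<double>, DoFHandler) with deal_II_dimension == 2:

    template
    void
    DerivativeApproximation::
    approximate_gradient<2> (const Mapping<2>     &mapping,
                             const DoFHandler<2>  &dof_handler,
                             const Vector<double> &solution,
                             Vector<float>        &derivative_norm,
                             const unsigned int    component);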

Modified: branches/s-wang2/include/aspect/global.h
===================================================================
--- branches/s-wang2/include/aspect/global.h	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/include/aspect/global.h	2012-11-29 05:55:56 UTC (rev 1394)
@@ -24,9 +24,10 @@
 #define __aspect__global_h
 
 
-#include <deal.II/lac/trilinos_block_vector.h>
-#include <deal.II/lac/trilinos_block_sparse_matrix.h>
-#include <deal.II/lac/trilinos_precondition.h>
+#include <deal.II/lac/petsc_parallel_block_vector.h>
+#include <deal.II/lac/petsc_parallel_block_sparse_matrix.h>
+#include <deal.II/lac/petsc_solver.h>
+#include <deal.II/lac/petsc_precondition.h>
 
 #include <boost/archive/binary_oarchive.hpp>
 #include <boost/archive/binary_iarchive.hpp>
@@ -74,46 +75,47 @@
     /**
      * Typedef for the vector type used.
      */
-    typedef TrilinosWrappers::MPI::Vector Vector;
+    typedef PETScWrappers::MPI::Vector Vector;
 
     /**
      * Typedef for the type used to describe vectors that
      * consist of multiple blocks.
      */
-    typedef TrilinosWrappers::MPI::BlockVector BlockVector;
+    typedef PETScWrappers::MPI::BlockVector BlockVector;
 
     /**
      * Typedef for the sparse matrix type used.
      */
-    typedef TrilinosWrappers::SparseMatrix SparseMatrix;
+    typedef PETScWrappers::MPI::SparseMatrix SparseMatrix;
 
     /**
      * Typedef for the type used to describe sparse matrices that
      * consist of multiple blocks.
      */
-    typedef TrilinosWrappers::BlockSparseMatrix BlockSparseMatrix;
+    typedef PETScWrappers::MPI::BlockSparseMatrix BlockSparseMatrix;
 
+//    typedef PETScWrappers::SolverCG SolverCG;
+
     /**
      * Typedef for the AMG preconditioner type used for the
      * top left block of the Stokes matrix.
      */
-    typedef TrilinosWrappers::PreconditionAMG PreconditionAMG;
+    typedef PETScWrappers::PreconditionBoomerAMG PreconditionAMG;
 
     /**
      * Typedef for the Incomplete Cholesky preconditioner used
      * for other blocks of the system matrix.
      */
-    typedef TrilinosWrappers::PreconditionIC PreconditionIC;
+    typedef PETScWrappers::PreconditionICC PreconditionIC;
 
     /**
      * Typedef for the Incomplete LU decomposition preconditioner used
      * for other blocks of the system matrix.
      */
-    typedef TrilinosWrappers::PreconditionILU PreconditionILU;
+    typedef PETScWrappers::PreconditionILU PreconditionILU;
   }
 }
 
-
 /**
  * A macro that is used in instantiating the ASPECT classes and functions
  * for both 2d and 3d. Call this macro with the name of another macro that
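
The point of routing everything through aspect::LinearAlgebra is that the
rest of the code base never names a backend. A minimal sketch, assuming a
hypothetical helper written against the typedefs (it compiles unchanged
whether they point at PETScWrappers or TrilinosWrappers):

    #include <aspect/global.h>

    namespace aspect
    {
      // backend-agnostic: only the typedef knows the concrete vector type
      double solution_norm (const LinearAlgebra::BlockVector &v)
      {
        return v.l2_norm ();
      }
    }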

Added: branches/s-wang2/include/aspect/global_trilinos.h
===================================================================
--- branches/s-wang2/include/aspect/global_trilinos.h	                        (rev 0)
+++ branches/s-wang2/include/aspect/global_trilinos.h	2012-11-29 05:55:56 UTC (rev 1394)
@@ -0,0 +1,127 @@
+/*
+  Copyright (C) 2011, 2012 by the authors of the ASPECT code.
+
+  This file is part of ASPECT.
+
+  ASPECT is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  ASPECT is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with ASPECT; see the file doc/COPYING.  If not see
+  <http://www.gnu.org/licenses/>.
+*/
+/*  $Id: global.h 895 2012-04-10 12:53:27Z bangerth $  */
+
+
+#ifndef __aspect__global_trilinos_h
+#define __aspect__global_trilinos_h
+
+
+#include <deal.II/lac/trilinos_block_vector.h>
+#include <deal.II/lac/trilinos_block_sparse_matrix.h>
+#include <deal.II/lac/trilinos_precondition.h>
+
+#include <boost/archive/binary_oarchive.hpp>
+#include <boost/archive/binary_iarchive.hpp>
+#include <boost/archive/text_oarchive.hpp>
+#include <boost/archive/text_iarchive.hpp>
+namespace aspect
+{
+  /**
+   * A variable whose value denotes the number of seconds in one year.
+   */
+  extern const double year_in_seconds;
+
+  /**
+   * A variable that denotes whether we should periodically
+   * output statistics about memory consumption, run times, etc
+   * via the Simulator::output_statistics() function or other
+   * means.
+   */
+  extern const bool output_parallel_statistics;
+
+
+  /**
+   * A typedef that denotes the BOOST stream type for reading data
+   * during serialization. The type chosen here is a binary archive
+   * which we subsequently will have to un-compress.
+   */
+  typedef boost::archive::binary_iarchive iarchive;
+
+  /**
+   * A typedef that denotes the BOOST stream type for writing data
+   * during serialization. The type chosen here is a binary archive
+   * which we compress before writing it into a file.
+   */
+  typedef boost::archive::binary_oarchive oarchive;
+
+  /**
+   * A namespace that contains typedefs for classes used in
+   * the linear algebra description.
+   */
+  namespace LinearAlgebra
+  {
+    using namespace dealii;
+
+
+    /**
+     * Typedef for the vector type used.
+     */
+    typedef TrilinosWrappers::MPI::Vector Vector;
+
+    /**
+     * Typedef for the type used to describe vectors that
+     * consist of multiple blocks.
+     */
+    typedef TrilinosWrappers::MPI::BlockVector BlockVector;
+
+    /**
+     * Typedef for the sparse matrix type used.
+     */
+    typedef TrilinosWrappers::SparseMatrix SparseMatrix;
+
+    /**
+     * Typedef for the type used to describe sparse matrices that
+     * consist of multiple blocks.
+     */
+    typedef TrilinosWrappers::BlockSparseMatrix BlockSparseMatrix;
+
+    /**
+     * Typedef for the AMG preconditioner type used for the
+     * top left block of the Stokes matrix.
+     */
+    typedef TrilinosWrappers::PreconditionAMG PreconditionAMG;
+
+    /**
+     * Typedef for the Incomplete Cholesky preconditioner used
+     * for other blocks of the system matrix.
+     */
+    typedef TrilinosWrappers::PreconditionIC PreconditionIC;
+
+    /**
+     * Typedef for the Incomplete LU decomposition preconditioner used
+     * for other blocks of the system matrix.
+     */
+    typedef TrilinosWrappers::PreconditionILU PreconditionILU;
+  }
+}
+
+
+/**
+ * A macro that is used in instantiating the ASPECT classes and functions
+ * for both 2d and 3d. Call this macro with the name of another macro that
+ * when called with a single integer argument instantiates the respective
+ * classes in the given space dimension.
+ */
+#define ASPECT_INSTANTIATE(INSTANTIATIONS) \
+  INSTANTIATIONS(2) \
+  INSTANTIATIONS(3)
+
+#endif
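
Since global_trilinos.h preserves the original Trilinos typedefs verbatim,
the backend could be chosen at compile time by including one header or the
other. A hypothetical selection scheme (the ASPECT_USE_PETSC macro is
invented here for illustration; the branch itself simply edits global.h):

    #ifdef ASPECT_USE_PETSC
    #  include <aspect/global.h>           // PETScWrappers typedefs
    #else
    #  include <aspect/global_trilinos.h>  // TrilinosWrappers typedefs
    #endif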

Modified: branches/s-wang2/include/aspect/particle/integrator.h
===================================================================
--- branches/s-wang2/include/aspect/particle/integrator.h	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/include/aspect/particle/integrator.h	2012-11-29 05:55:56 UTC (rev 1394)
@@ -22,6 +22,7 @@
 #ifndef __aspect__particle_integrator_h
 #define __aspect__particle_integrator_h
 
+#include <aspect/global.h>
 #include <aspect/particle/particle.h>
 #include <aspect/simulator.h>
 
@@ -393,7 +394,7 @@
         const parallel::distributed::Triangulation<dim>   *_tria;
         const DoFHandler<dim>           *_dh;
         const Mapping<dim>              *_mapping;
-        const TrilinosWrappers::MPI::BlockVector *_solution;
+        const LinearAlgebra::BlockVector *_solution;
 
         virtual IntegrationScheme select_scheme(const std::vector<Point<dim> > &cell_vertices, const std::vector<Point<dim> > &cell_velocities, const double timestep)
         {
@@ -401,7 +402,7 @@
         };
 
       public:
-        HybridIntegrator(const parallel::distributed::Triangulation<dim> *new_tria, const DoFHandler<dim> *new_dh, const Mapping<dim> *new_mapping, const TrilinosWrappers::MPI::BlockVector *new_solution)
+        HybridIntegrator(const parallel::distributed::Triangulation<dim> *new_tria, const DoFHandler<dim> *new_dh, const Mapping<dim> *new_mapping, const LinearAlgebra::BlockVector *new_solution)
         {
           _step = 0;
           _loc0.clear();
@@ -424,7 +425,7 @@
           IntegrationScheme                   cur_scheme;
           //typename parallel::distributed::Triangulation<dim>::cell_iterator found_cell;
           typename DoFHandler<dim>::active_cell_iterator found_cell;
-          Functions::FEFieldFunction<dim, DoFHandler<dim>, TrilinosWrappers::MPI::BlockVector> fe_value(*_dh, *_solution, *_mapping);
+          Functions::FEFieldFunction<dim, DoFHandler<dim>, LinearAlgebra::BlockVector> fe_value(*_dh, *_solution, *_mapping);
 
           // If this is the first step, go through all the cells and determine
           // which integration scheme the particles in each cell should use

Modified: branches/s-wang2/include/aspect/particle/world.h
===================================================================
--- branches/s-wang2/include/aspect/particle/world.h	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/include/aspect/particle/world.h	2012-11-29 05:55:56 UTC (rev 1394)
@@ -375,7 +375,7 @@
         };
 
         // Advance particles by the specified timestep using the current integration scheme.
-        void advance_timestep(double timestep, const TrilinosWrappers::MPI::BlockVector &solution)
+        void advance_timestep(double timestep, const LinearAlgebra::BlockVector &solution)
         {
           bool        continue_integrator = true;
 
@@ -565,7 +565,7 @@
           free(recv_data);
         };
 
-        void get_particle_velocities(const TrilinosWrappers::MPI::BlockVector &solution)
+        void get_particle_velocities(const LinearAlgebra::BlockVector &solution)
         {
           Vector<double>                single_res(dim+2);
           std::vector<Vector<double> >  result;
@@ -577,7 +577,7 @@
           std::vector<Point<dim> >      particle_points;
 
           // Prepare the field function
-          Functions::FEFieldFunction<dim, DoFHandler<dim>, TrilinosWrappers::MPI::BlockVector> fe_value(*_dh, solution, *_mapping);
+          Functions::FEFieldFunction<dim, DoFHandler<dim>, LinearAlgebra::BlockVector> fe_value(*_dh, solution, *_mapping);
 
           // Get the velocity for each cell at a time so we can take advantage of knowing the active cell
           for (it=_particles.begin(); it!=_particles.end();)
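
Both the integrator and the particle world evaluate the solution at
arbitrary particle positions through deal.II's FEFieldFunction, now
parameterized over the generic block vector type. A minimal sketch of the
evaluation pattern used here, assuming a (dim+2)-component solution as in
the surrounding code:

    Functions::FEFieldFunction<dim, DoFHandler<dim>, LinearAlgebra::BlockVector>
      fe_value (*_dh, solution, *_mapping);

    // pin the search to the cell the particles are known to live in,
    // then evaluate all particle locations in that cell in one call
    fe_value.set_active_cell (found_cell);
    std::vector<Vector<double> > result (particle_points.size (),
                                         Vector<double> (dim+2));
    fe_value.vector_value_list (particle_points, result);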

Modified: branches/s-wang2/include/aspect/postprocess/interface.h
===================================================================
--- branches/s-wang2/include/aspect/postprocess/interface.h	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/include/aspect/postprocess/interface.h	2012-11-29 05:55:56 UTC (rev 1394)
@@ -33,8 +33,8 @@
 #include <deal.II/base/std_cxx1x/shared_ptr.h>
 #include <deal.II/base/table_handler.h>
 #include <deal.II/base/parameter_handler.h>
-#include <deal.II/lac/trilinos_vector.h>
-#include <deal.II/lac/trilinos_block_vector.h>
+//#include <deal.II/lac/trilinos_vector.h>
+//#include <deal.II/lac/trilinos_block_vector.h>
 #include <deal.II/dofs/dof_handler.h>
 #include <deal.II/distributed/tria.h>
 #include <deal.II/fe/mapping.h>

Modified: branches/s-wang2/include/aspect/simulator.h
===================================================================
--- branches/s-wang2/include/aspect/simulator.h	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/include/aspect/simulator.h	2012-11-29 05:55:56 UTC (rev 1394)
@@ -28,9 +28,9 @@
 #include <deal.II/base/conditional_ostream.h>
 #include <deal.II/base/symmetric_tensor.h>
 
-#include <deal.II/lac/trilinos_block_vector.h>
-#include <deal.II/lac/trilinos_block_sparse_matrix.h>
-#include <deal.II/lac/trilinos_precondition.h>
+//#include <deal.II/lac/trilinos_block_vector.h>
+//#include <deal.II/lac/trilinos_block_sparse_matrix.h>
+//#include <deal.II/lac/trilinos_precondition.h>
 
 #include <deal.II/distributed/tria.h>
 
@@ -1217,7 +1217,7 @@
        * <code>source/simulator/assembly.cc</code>.
        */
       void
-      compute_material_model_input_values (const TrilinosWrappers::MPI::BlockVector                    &input_solution,
+      compute_material_model_input_values (const LinearAlgebra::BlockVector                    &input_solution,
                                            const FEValues<dim,dim>                                     &input_finite_element_values,
                                            const bool                                                   compute_strainrate,
                                            typename MaterialModel::Interface<dim>::MaterialModelInputs &material_model_inputs) const;
@@ -1368,7 +1368,7 @@
       LinearAlgebra::BlockVector                                old_old_solution;
       LinearAlgebra::BlockVector                                system_rhs;
 
-      TrilinosWrappers::MPI::BlockVector                        current_linearization_point;
+      LinearAlgebra::BlockVector                        current_linearization_point;
 
       // only used if is_compressible()
       LinearAlgebra::BlockVector                                pressure_shape_function_integrals;

Modified: branches/s-wang2/source/main.cc
===================================================================
--- branches/s-wang2/source/main.cc	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/source/main.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -24,12 +24,14 @@
 
 #include <deal.II/base/utilities.h>
 #include <deal.II/base/mpi.h>
+#include <deal.II/lac/vector_memory.h>
 
 
 int main (int argc, char *argv[])
 {
   using namespace dealii;
   Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv);
+  PetscInitialize(&argc,&argv,0,0);
 
   try
     {
@@ -169,5 +171,9 @@
       return 1;
     }
 
+  dealii::GrowingVectorMemory<dealii::PETScWrappers::MPI::Vector>::release_unused_memory ();
+  dealii::GrowingVectorMemory<dealii::PETScWrappers::Vector>::release_unused_memory ();
+  PetscFinalize();
+
   return 0;
 }
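
The ordering here matters: GrowingVectorMemory caches PETSc vectors
internally, and PETSc objects must be destroyed before PetscFinalize()
runs, so the explicit release_unused_memory() calls have to come first.
In skeleton form:

    Utilities::MPI::MPI_InitFinalize mpi_initialization (argc, argv);
    PetscInitialize (&argc, &argv, 0, 0);

    // ... set up and run the simulator ...

    // drop cached PETSc vectors before shutting PETSc down
    GrowingVectorMemory<PETScWrappers::MPI::Vector>::release_unused_memory ();
    GrowingVectorMemory<PETScWrappers::Vector>::release_unused_memory ();
    PetscFinalize ();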

Modified: branches/s-wang2/source/postprocess/composition_statistics.cc
===================================================================
--- branches/s-wang2/source/postprocess/composition_statistics.cc	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/source/postprocess/composition_statistics.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -94,16 +94,17 @@
       std::vector<double> local_max_compositions (this->n_compositional_fields(),
                                                   std::numeric_limits<double>::min());
 
-      for (unsigned int c=0; c<this->n_compositional_fields(); ++c)
-        for (unsigned int i=0; i<this->get_solution().block(3+c).local_size(); ++i)
-          {
-            local_min_compositions[c]
-              = std::min<double> (local_min_compositions[c],
-                                  this->get_solution().block(3+c).trilinos_vector()[0][i]);
-            local_max_compositions[c]
-              = std::max<double> (local_max_compositions[c],
-                                  this->get_solution().block(3+c).trilinos_vector()[0][i]);
-          }
+//      for (unsigned int c=0; c<this->n_compositional_fields(); ++c)
+//        for (unsigned int i=0; i<this->get_solution().block(3+c).local_size(); ++i)
+//          {
+// shuqiangwang
+//            local_min_compositions[c]
+//              = std::min<double> (local_min_compositions[c],
+//                                  this->get_solution().block(3+c).trilinos_vector()[0][i]);
+//            local_max_compositions[c]
+//              = std::max<double> (local_max_compositions[c],
+//                                  this->get_solution().block(3+c).trilinos_vector()[0][i]);
+//          }
 
       // now do the reductions over all processors. we can use Utilities::MPI::max
       // for the maximal values. unfortunately, there is currently no matching
@@ -115,21 +116,26 @@
                                                    std::numeric_limits<double>::min());
 
       {
-        for (unsigned int c=0; c<this->n_compositional_fields(); ++c)
-          local_min_compositions[c] = -local_min_compositions[c];
-        Utilities::MPI::max (local_min_compositions,
-                             this->get_mpi_communicator(),
-                             global_min_compositions);
-        for (unsigned int c=0; c<this->n_compositional_fields(); ++c)
-          {
-            local_min_compositions[c] = -local_min_compositions[c];
-            global_min_compositions[c] = -global_min_compositions[c];
-          }
-
-        // it's simpler for the maximal values
-        Utilities::MPI::max (local_max_compositions,
-                             this->get_mpi_communicator(),
-                             global_max_compositions);
+//        for (unsigned int c=0; c<this->n_compositional_fields(); ++c)
+//          local_min_compositions[c] = -local_min_compositions[c];
+//        Utilities::MPI::max (local_min_compositions,
+//                             this->get_mpi_communicator(),
+//                             global_min_compositions);
+//        for (unsigned int c=0; c<this->n_compositional_fields(); ++c)
+//          {
+//            local_min_compositions[c] = -local_min_compositions[c];
+//            global_min_compositions[c] = -global_min_compositions[c];
+//          }
+//
+//        // it's simpler for the maximal values
+//        Utilities::MPI::max (local_max_compositions,
+//                             this->get_mpi_communicator(),
+//                             global_max_compositions);
+        for (unsigned int c=0; c<this->n_compositional_fields(); ++c)
+          {
+            global_min_compositions[c] = this->get_solution().block(3+c).min();
+            global_max_compositions[c] = this->get_solution().block(3+c).max();
+          }
       }
 
       // finally produce something for the statistics file
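
The commented-out code documents the usual trick for getting a global
minimum out of a max-only reduction helper; the replacement instead lets
the vector compute its own parallel min()/max(). For reference, a sketch
of the negation trick on a scalar:

    // Utilities::MPI only offers a max reduction here, so obtain the
    // global minimum by negating, reducing, and negating back
    const double global_min
      = -Utilities::MPI::max (-local_min, mpi_communicator);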

Modified: branches/s-wang2/source/postprocess/temperature_statistics.cc
===================================================================
--- branches/s-wang2/source/postprocess/temperature_statistics.cc	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/source/postprocess/temperature_statistics.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -84,15 +84,16 @@
       // picture of their true values
       double local_min_temperature = std::numeric_limits<double>::max();
       double local_max_temperature = std::numeric_limits<double>::min();
-      for (unsigned int i=0; i<this->get_solution().block(2).local_size(); ++i)
-        {
-          local_min_temperature
-            = std::min<double> (local_min_temperature,
-                                this->get_solution().block(2).trilinos_vector()[0][i]);
-          local_max_temperature
-            = std::max<double> (local_max_temperature,
-                                this->get_solution().block(2).trilinos_vector()[0][i]);
-        }
+//      for (unsigned int i=0; i<this->get_solution().block(2).local_size(); ++i)
+//        {
+//// shuqiangwang
+//          local_min_temperature
+//            = std::min<double> (local_min_temperature,
+//                                this->get_solution().block(2).trilinos_vector()[0][i]);
+//          local_max_temperature
+//            = std::max<double> (local_max_temperature,
+//                                this->get_solution().block(2).trilinos_vector()[0][i]);
+//        }
 
       const double global_temperature_integral
         = Utilities::MPI::sum (local_temperature_integral, this->get_mpi_communicator());
@@ -104,13 +105,15 @@
       // one communication by multiplying
       // one value by -1
       {
-        double local_values[2] = { -local_min_temperature, local_max_temperature };
-        double global_values[2];
-
-        Utilities::MPI::max (local_values, this->get_mpi_communicator(), global_values);
-
-        global_min_temperature = -global_values[0];
-        global_max_temperature = global_values[1];
+//        double local_values[2] = { -local_min_temperature, local_max_temperature };
+//        double global_values[2];
+//
+//        Utilities::MPI::max (local_values, this->get_mpi_communicator(), global_values);
+//
+//        global_min_temperature = -global_values[0];
+//        global_max_temperature = global_values[1];
+        global_min_temperature = this->get_solution().block(2).min();
+        global_max_temperature = this->get_solution().block(2).max();
       }
 
       statistics.add_value ("Minimal temperature (K)",

Modified: branches/s-wang2/source/simulator/assembly.cc
===================================================================
--- branches/s-wang2/source/simulator/assembly.cc	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/source/simulator/assembly.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -643,7 +643,7 @@
   template <int dim>
   void
   Simulator<dim>::
-  compute_material_model_input_values (const TrilinosWrappers::MPI::BlockVector                    &input_solution,
+  compute_material_model_input_values (const LinearAlgebra::BlockVector                    &input_solution,
                                        const FEValues<dim>                                         &input_finite_element_values,
                                        const bool                                                   compute_strainrate,
                                        typename MaterialModel::Interface<dim>::MaterialModelInputs &material_model_inputs) const
@@ -828,12 +828,12 @@
     Mp_preconditioner.reset (new LinearAlgebra::PreconditionILU());
     Amg_preconditioner.reset (new LinearAlgebra::PreconditionAMG());
 
-    LinearAlgebra::PreconditionAMG::AdditionalData Amg_data;
-    Amg_data.constant_modes = constant_modes;
-    Amg_data.elliptic = true;
-    Amg_data.higher_order_elements = true;
-    Amg_data.smoother_sweeps = 2;
-    Amg_data.aggregation_threshold = 0.02;
+    LinearAlgebra::PreconditionAMG::AdditionalData Amg_data(true);
+    //Amg_data.constant_modes = constant_modes;
+    //Amg_data.elliptic = true;
+    //Amg_data.higher_order_elements = true;
+    //Amg_data.smoother_sweeps = 2;
+    //Amg_data.aggregation_threshold = 0.02;
 
     Mp_preconditioner->initialize (system_preconditioner_matrix.block(1,1));
     Amg_preconditioner->initialize (system_preconditioner_matrix.block(0,0),
@@ -1023,11 +1023,11 @@
          internal::Assembly::CopyData::
          StokesSystem<dim> (finite_element));
 
-    system_matrix.compress();
-    system_rhs.compress(Add);
+    system_matrix.compress(dealii::VectorOperation::add);	//shuqiangwang
+    system_rhs.compress(dealii::VectorOperation::add);
 
     if (material_model->is_compressible())
-      pressure_shape_function_integrals.compress(Add);
+      pressure_shape_function_integrals.compress(dealii::VectorOperation::add);
 
     rebuild_stokes_matrix = false;
 
@@ -1044,7 +1044,7 @@
     else
       computing_timer.enter_section ("   Build composition preconditioner");
     {
-      preconditioner.reset (new TrilinosWrappers::PreconditionILU());
+      preconditioner.reset (new LinearAlgebra::PreconditionILU());
       preconditioner->initialize (system_matrix.block(2+index,2+index));
     }
     computing_timer.exit_section();
@@ -1393,7 +1393,7 @@
          AdvectionSystem<dim> (finite_element));
 
     system_matrix.compress();
-    system_rhs.compress(Add);
+    system_rhs.compress(dealii::VectorOperation::add);
 
     computing_timer.exit_section();
   }
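
The new compress() signature reflects the newer deal.II interface, in
which the caller states which pending operation the communication step
should resolve. A minimal sketch, assuming entries were accumulated with
add() during assembly:

    // exchange and sum contributions add()-ed by different processors
    system_matrix.compress (dealii::VectorOperation::add);
    system_rhs.compress (dealii::VectorOperation::add);

    // after writing entries by value, the matching call would instead be
    // vec.compress (dealii::VectorOperation::insert);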

Modified: branches/s-wang2/source/simulator/core.cc
===================================================================
--- branches/s-wang2/source/simulator/core.cc	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/source/simulator/core.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -55,7 +55,73 @@
 
 using namespace dealii;
 
+/**
+ * Temporary utility for replacing TrilinosWrappers with PETScWrappers.
+ */
+namespace CIG
+{
+/**
+ * Convert a block partitioning used for Trilinos into the data used for
+ * PETSc. It is assumed that block_partition.size()==2.
+ */
+void convert_block_partitioning(
+		const std::vector<unsigned int> 	&system_dofs_per_block,
+		const std::vector<dealii::IndexSet> &system_partitioning,
+		std::vector<unsigned int> 			&block_sizes,
+		std::vector<unsigned int> 			&local_sizes)
+{
+	// reset the output arrays
+	block_sizes.clear();
+	local_sizes.clear();
 
+	// block_sizes
+	block_sizes = system_dofs_per_block;
+
+	// local_sizes
+	for(unsigned int i=0; i<system_partitioning.size(); i++)
+		local_sizes.push_back(system_partitioning[i].n_elements());
+}
+
+
+void setup_petsc_matrix(
+		MPI_Comm mpi_communicator,
+		const std::vector<unsigned int> &block_sizes,
+		const std::vector<unsigned int> &local_sizes,
+		int max_coupling_between_dofs,
+		dealii::PETScWrappers::MPI::BlockSparseMatrix  &matrix)
+{
+	int size = block_sizes.size();
+
+	matrix.reinit(size,size);
+
+	for(int i=0; i<size; i++)
+		for(int j=0; j<size; j++)
+		{
+			matrix.block(i,j).reinit(
+					mpi_communicator,
+					block_sizes[i], block_sizes[j],
+					local_sizes[i], local_sizes[j],
+					max_coupling_between_dofs);
+		}
+
+	matrix.collect_sizes();
+}
+
+void setup_petsc_vector(
+		MPI_Comm mpi_communicator,
+		std::vector<unsigned int> &block_sizes,
+		std::vector<dealii::IndexSet> &partitioning,
+		std::vector<dealii::IndexSet> &relevant_partitioning,
+		dealii::PETScWrappers::MPI::BlockVector &vector)
+{
+	vector.reinit(block_sizes,mpi_communicator);
+	for(unsigned int i=0; i<block_sizes.size(); i++)
+		vector.block(i).reinit(mpi_communicator,partitioning[i],relevant_partitioning[i]);
+	vector.collect_sizes();
+}
+}
+
+
 namespace aspect
 {
   namespace
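
The CIG helpers translate the IndexSet-based partitioning consumed by the
Trilinos classes into the per-block global and local sizes expected by the
PETSc wrappers. A sketch of the intended call sequence, mirroring their
use further down in setup_dofs():

    std::vector<unsigned int> block_sizes, local_sizes;
    CIG::convert_block_partitioning (system_dofs_per_block, system_partitioning,
                                     block_sizes, local_sizes);
    CIG::setup_petsc_matrix (mpi_communicator, block_sizes, local_sizes,
                             dof_handler.max_couplings_between_dofs (),
                             system_matrix);
    CIG::setup_petsc_vector (mpi_communicator, block_sizes, system_partitioning,
                             system_relevant_partitioning, solution);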
@@ -507,7 +573,7 @@
                                      this_mpi_process(mpi_communicator));
     sp.compress();
 
-    system_matrix.reinit (sp);
+    //shuqiangwang: this function is not used. system_matrix.reinit (sp);
   }
 
 
@@ -541,7 +607,7 @@
                                      this_mpi_process(mpi_communicator));
     sp.compress();
 
-    system_preconditioner_matrix.reinit (sp);
+    //shuqiangwang: this function is not used. system_preconditioner_matrix.reinit (sp);
   }
 
 
@@ -717,19 +783,21 @@
     }
 
     // finally initialize vectors, matrices, etc.
+    std::vector<unsigned int> block_sizes, local_sizes;
+    CIG::convert_block_partitioning(system_dofs_per_block,system_partitioning,block_sizes,local_sizes);
 
-    setup_system_matrix (system_partitioning);
-    setup_system_preconditioner (system_partitioning);
+    // replaces setup_system_matrix (system_partitioning);
+    CIG::setup_petsc_matrix(mpi_communicator,block_sizes,local_sizes,
+                            dof_handler.max_couplings_between_dofs(),system_matrix);
+    // replaces setup_system_preconditioner (system_partitioning);
+    CIG::setup_petsc_matrix(mpi_communicator,block_sizes,local_sizes,
+                            dof_handler.max_couplings_between_dofs(),system_preconditioner_matrix);
 
-    system_rhs.reinit(system_partitioning, mpi_communicator);
-    solution.reinit(system_relevant_partitioning, mpi_communicator);
-    old_solution.reinit(system_relevant_partitioning, mpi_communicator);
-    old_old_solution.reinit(system_relevant_partitioning, mpi_communicator);
+    // replaces system_rhs.reinit(system_partitioning, mpi_communicator);
+    system_rhs.reinit(block_sizes,mpi_communicator,local_sizes);
+    // each of the following replaces a reinit(system_relevant_partitioning, mpi_communicator) call
+    CIG::setup_petsc_vector(mpi_communicator,block_sizes,system_partitioning,system_relevant_partitioning,solution);
+    CIG::setup_petsc_vector(mpi_communicator,block_sizes,system_partitioning,system_relevant_partitioning,old_solution);
+    CIG::setup_petsc_vector(mpi_communicator,block_sizes,system_partitioning,system_relevant_partitioning,old_old_solution);
 
-    current_linearization_point.reinit (system_relevant_partitioning, MPI_COMM_WORLD);
+    // replaces current_linearization_point.reinit (system_relevant_partitioning, MPI_COMM_WORLD);
+    CIG::setup_petsc_vector(mpi_communicator,block_sizes,system_partitioning,system_relevant_partitioning,current_linearization_point);
 
     if (material_model->is_compressible())
-      pressure_shape_function_integrals.reinit (system_partitioning, mpi_communicator);
+      // replaces pressure_shape_function_integrals.reinit (system_partitioning, mpi_communicator);
+      pressure_shape_function_integrals.reinit(block_sizes,mpi_communicator,local_sizes);
 
     rebuild_stokes_matrix         = true;
     rebuild_stokes_preconditioner = true;
@@ -1414,6 +1482,8 @@
         {
           old_old_solution      = old_solution;
           old_solution          = solution;
+          old_old_solution.update_ghost_values();			//shuqiangwang: need to check when this is needed.
+          old_solution.update_ghost_values();
         }
 
         // periodically generate snapshots so that we can resume here
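
The rationale for the update_ghost_values() calls appears to be that with
PETSc's ghosted vectors an assignment only refreshes the locally owned
range, so the ghost entries must be pulled up to date explicitly; as the
author's own comment notes, exactly when this is required still needs
checking. In sketch form:

    old_old_solution = old_solution;   // copies the locally owned entries
    old_solution     = solution;
    // refresh the off-processor (ghost) entries as well
    old_old_solution.update_ghost_values ();
    old_solution.update_ghost_values ();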

Modified: branches/s-wang2/source/simulator/helper_functions.cc
===================================================================
--- branches/s-wang2/source/simulator/helper_functions.cc	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/source/simulator/helper_functions.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -78,12 +78,12 @@
           << "* Matrix " << system_matrix.memory_consumption()/mb << std::endl
           << "* 5 Vectors " << 5*solution.memory_consumption()/mb << std::endl
           << "* preconditioner " << (system_preconditioner_matrix.memory_consumption()
-                                     + Amg_preconditioner->memory_consumption()
+                                     //+ Amg_preconditioner->memory_consumption()
                                      /*+Mp_preconditioner->memory_consumption()
                                                                       +T_preconditioner->memory_consumption()*/)/mb
           << std::endl
           << "  - matrix " << system_preconditioner_matrix.memory_consumption()/mb << std::endl
-          << "  - prec vel " << Amg_preconditioner->memory_consumption()/mb << std::endl
+      //          << "  - prec vel " << Amg_preconditioner->memory_consumption()/mb << std::endl
           << "  - prec mass " << 0/*Mp_preconditioner->memory_consumption()/mb*/ << std::endl
           << "  - prec T " << 0/*T_preconditioner->memory_consumption()/mb*/ << std::endl
           << std::endl;

Modified: branches/s-wang2/source/simulator/initial_conditions.cc
===================================================================
--- branches/s-wang2/source/simulator/initial_conditions.cc	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/source/simulator/initial_conditions.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -153,8 +153,14 @@
 
         // then apply constraints and copy the
         // result into vectors with ghost elements
+//        constraints.print(std::cout);
         constraints.distribute(initial_solution);
 
+//        static int debug_index = 0;
+//        debug_index++;
+//        if(debug_index==2)
+//        	exit(0);
+
         // copy temperature/composition block only
         solution.block(2+n) = initial_solution.block(2+n);
         old_solution.block(2+n) = initial_solution.block(2+n);
@@ -196,8 +202,12 @@
                                                                                dim+2+parameters.n_compositional_fields),
                                   system_tmp);
 
+        system_tmp.compress();			// shuqiangwang: do I need this?
+//        system_tmp.print(std::cout,7,false,false);
         // we may have hanging nodes, so apply constraints
         constraints.distribute (system_tmp);
+
+        system_tmp.compress();
 
         old_solution.block(1) = system_tmp.block(1);
       }
@@ -289,6 +299,8 @@
         old_solution.block(1) = system_tmp.block(1);
       }
 
+    old_solution.compress();
+
     // normalize the pressure in such a way that the surface pressure
     // equals a known and desired value
     normalize_pressure(old_solution);

Modified: branches/s-wang2/source/simulator/solver.cc
===================================================================
--- branches/s-wang2/source/simulator/solver.cc	2012-11-29 05:40:39 UTC (rev 1393)
+++ branches/s-wang2/source/simulator/solver.cc	2012-11-29 05:55:56 UTC (rev 1394)
@@ -25,7 +25,7 @@
 
 #include <deal.II/lac/solver_gmres.h>
 #include <deal.II/lac/constraint_matrix.h>
-#include <deal.II/lac/trilinos_solver.h>
+//#include <deal.II/lac/trilinos_solver.h>
 #include <deal.II/lac/pointer_matrix.h>
 
 
@@ -67,9 +67,9 @@
         /**
          * Compute the residual with the Stokes block.
          */
-        double residual (TrilinosWrappers::MPI::BlockVector       &dst,
-                         const TrilinosWrappers::MPI::BlockVector &x,
-                         const TrilinosWrappers::MPI::BlockVector &b) const;
+        double residual (LinearAlgebra::BlockVector       &dst,
+                         const LinearAlgebra::BlockVector &x,
+                         const LinearAlgebra::BlockVector &b) const;
 
         void clear() {};
 
@@ -130,9 +130,9 @@
 
 
 
-    double StokesBlock::residual (TrilinosWrappers::MPI::BlockVector       &dst,
-                                  const TrilinosWrappers::MPI::BlockVector &x,
-                                  const TrilinosWrappers::MPI::BlockVector &b) const
+    double StokesBlock::residual (LinearAlgebra::BlockVector       &dst,
+                                  const LinearAlgebra::BlockVector &x,
+                                  const LinearAlgebra::BlockVector &b) const
     {
       // compute b-Ax where A is only the top left 2x2 block
       this->vmult (dst, x);
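
Only the first lines of residual() appear in this hunk; a plausible
completion is sketched below, assuming deal.II's sadd() with the semantics
*this = s * *this + a * V:

    // dst = A x, restricted to the top-left 2x2 (Stokes) block
    this->vmult (dst, x);
    // dst = b - A x, block by block
    dst.block (0).sadd (-1, 1, b.block (0));
    dst.block (1).sadd (-1, 1, b.block (1));
    return dst.l2_norm ();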
@@ -222,7 +222,7 @@
       {
         SolverControl solver_control(5000, 1e-6 * src.block(1).l2_norm());
 
-        TrilinosWrappers::SolverCG solver(solver_control);
+        PETScWrappers::SolverCG solver(solver_control);
 
         // Trilinos reports a breakdown
         // in case src=dst=0, even
@@ -247,7 +247,7 @@
       if (do_solve_A == true)
         {
           SolverControl solver_control(5000, utmp.l2_norm()*1e-2);
-          TrilinosWrappers::SolverCG solver(solver_control);
+          PETScWrappers::SolverCG solver(solver_control);
           solver.solve(stokes_matrix.block(0,0), dst.block(0), utmp,
                        a_preconditioner);
         }
@@ -286,13 +286,13 @@
 // overwrite the vector in residual(), then call set_zero again, and then throw away
 // the result
       LinearAlgebra::BlockVector
-      distributed_solution (system_rhs);
-      current_constraints.set_zero(distributed_solution);
+      distributed_solution (system_rhs);
+      distributed_solution.compress();
+      current_constraints.set_zero(distributed_solution);
+      distributed_solution.compress();
       // create vector with distribution of system_rhs.
       LinearAlgebra::Vector block_remap (system_rhs.block (index+2));
       // copy block of current_linearization_point into it, because
       // current_linearization is distributed differently.
-      block_remap = current_linearization_point.block (index+2);
+      block_remap = current_linearization_point.block (index+2);
+      block_remap.compress();
       // (ab)use the distributed solution vector to temporarily put a residual in
       initial_residual = system_matrix.block(index+2,index+2).residual (distributed_solution.block(index+2),
                                                                         block_remap,
@@ -300,12 +300,12 @@
       current_constraints.set_zero(distributed_solution);
 
       // then overwrite it again with the current best guess and solve the linear system
-      distributed_solution.block(index+2) = block_remap;
+      distributed_solution.block(index+2) = block_remap;
+      distributed_solution.compress();
       solver.solve (system_matrix.block(index+2,index+2), distributed_solution.block(index+2),
                     system_rhs.block(index+2), index==0?*T_preconditioner:*C_preconditioner);
 
       current_constraints.distribute (distributed_solution);
-      solution.block(index+2) = distributed_solution.block(index+2);
+      solution.block(index+2) = distributed_solution.block(index+2);
+      solution.compress();
 
       // print number of iterations and also record it in the
       // statistics file
@@ -354,7 +354,7 @@
     remap.block (1) = current_linearization_point.block (1);
     // before solving we scale the initial solution to the right dimensions
     remap.block (1) /= pressure_scaling;
-    current_constraints.set_zero (remap);
+    current_constraints.set_zero (remap);
+    remap.compress();
     // if the model is compressible then we need to adjust the right hand
     // side of the equation to make it compatible with the matrix on the
     // left
@@ -368,13 +368,13 @@
 
     // then overwrite it again with the current best guess and solve the linear system
     distributed_stokes_solution.block(0) = remap.block(0);
-    distributed_stokes_solution.block(1) = remap.block(1);
+    distributed_stokes_solution.block(1) = remap.block(1);
+    distributed_stokes_solution.compress();
 
     // extract Stokes parts of rhs vector
     LinearAlgebra::BlockVector distributed_stokes_rhs;
     distributed_stokes_rhs.reinit(system_rhs);
     distributed_stokes_rhs.block(0) = system_rhs.block(0);
-    distributed_stokes_rhs.block(1) = system_rhs.block(1);
+    distributed_stokes_rhs.block(1) = system_rhs.block(1);
+    distributed_stokes_rhs.compress();
 
     PrimitiveVectorMemory< LinearAlgebra::BlockVector > mem;
 



More information about the CIG-COMMITS mailing list