From 3aac04511909513d7106ec70370a4e60a320e1b8 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Tue, 19 Dec 2023 16:11:32 -0800
Subject: [PATCH 1/5] Clean up assembly of global parallel HypreParMatrix for
 coarse solver

---
 palace/linalg/amg.cpp              |  15 ---
 palace/linalg/amg.hpp              |   3 -
 palace/linalg/ams.cpp              |  11 +-
 palace/linalg/chebyshev.cpp        |  42 +++-----
 palace/linalg/chebyshev.hpp        |  11 +-
 palace/linalg/distrelaxation.cpp   |  14 +--
 palace/linalg/distrelaxation.hpp   |   6 +-
 palace/linalg/divfree.cpp          |   6 +-
 palace/linalg/gmg.cpp              |  13 +--
 palace/linalg/gmg.hpp              |   6 +-
 palace/linalg/hcurl.cpp            |   6 +-
 palace/linalg/jacobi.cpp           |  10 +-
 palace/linalg/ksp.cpp              |  22 +++-
 palace/linalg/mumps.cpp            |  15 ---
 palace/linalg/mumps.hpp            |   3 -
 palace/linalg/operator.cpp         |  52 +++-------
 palace/linalg/operator.hpp         |  30 +++---
 palace/linalg/rap.cpp              | 159 ++++++++++++-----------------
 palace/linalg/rap.hpp              |  49 ++++-----
 palace/linalg/solver.cpp           |  95 +++++++++++++++--
 palace/linalg/solver.hpp           |  23 ++++-
 palace/linalg/strumpack.cpp        |  44 ++++----
 palace/linalg/superlu.cpp          |  53 +++++-----
 palace/models/romoperator.cpp      |  10 +-
 palace/models/spaceoperator.cpp    |  56 +++++-----
 palace/models/waveportoperator.cpp |   3 +-
 26 files changed, 365 insertions(+), 392 deletions(-)
diff --git a/palace/linalg/amg.cpp b/palace/linalg/amg.cpp
index f34ef25b7..979d688ee 100644
--- a/palace/linalg/amg.cpp
+++ b/palace/linalg/amg.cpp
@@ -3,8 +3,6 @@
 
 #include "amg.hpp"
 
-#include "linalg/rap.hpp"
-
 namespace palace
 {
 
@@ -27,17 +25,4 @@ BoomerAmgSolver::BoomerAmgSolver(int cycle_it, int smooth_it, int print)
   // HYPRE_BoomerAMGSetCycleRelaxType(*this, coarse_relax_type, 3);
 }
 
-void BoomerAmgSolver::SetOperator(const Operator &op)
-{
-  const auto *PtAP = dynamic_cast<const ParOperator *>(&op);
-  if (PtAP)
-  {
-    mfem::HypreBoomerAMG::SetOperator(PtAP->ParallelAssemble());
-  }
-  else
-  {
-    mfem::HypreBoomerAMG::SetOperator(op);
-  }
-}
-
 }  // namespace palace
diff --git a/palace/linalg/amg.hpp b/palace/linalg/amg.hpp
index 1806c7702..e02069b70 100644
--- a/palace/linalg/amg.hpp
+++ b/palace/linalg/amg.hpp
@@ -5,7 +5,6 @@
 #define PALACE_LINALG_AMG_HPP
 
 #include <mfem.hpp>
-#include "linalg/operator.hpp"
 #include "utils/iodata.hpp"
 
 namespace palace
@@ -23,8 +22,6 @@ class BoomerAmgSolver : public mfem::HypreBoomerAMG
                       iodata.solver.linear.mg_smooth_it, print)
   {
   }
-
-  void SetOperator(const Operator &op) override;
 };
 
 }  // namespace palace
diff --git a/palace/linalg/ams.cpp b/palace/linalg/ams.cpp
index bbd97f8cb..7a357afeb 100644
--- a/palace/linalg/ams.cpp
+++ b/palace/linalg/ams.cpp
@@ -208,16 +208,7 @@ void HypreAmsSolver::SetOperator(const Operator &op)
     HYPRE_AMSDestroy(ams);
     InitializeSolver();
   }
-
-  const auto *PtAP = dynamic_cast<const ParOperator *>(&op);
-  if (PtAP)
-  {
-    A = &PtAP->ParallelAssemble();
-  }
-  else
-  {
-    A = dynamic_cast<mfem::HypreParMatrix *>(const_cast<Operator *>(&op));
-  }
+  A = const_cast<mfem::HypreParMatrix *>(dynamic_cast<const mfem::HypreParMatrix *>(&op));
   MFEM_VERIFY(A, "HypreAmsSolver requires a HypreParMatrix operator!");
   height = A->Height();
   width = A->Width();
diff --git a/palace/linalg/chebyshev.cpp b/palace/linalg/chebyshev.cpp
index 63f90e167..e9f8e8cd0 100644
--- a/palace/linalg/chebyshev.cpp
+++ b/palace/linalg/chebyshev.cpp
@@ -4,7 +4,6 @@
 #include "chebyshev.hpp"
 
 #include <mfem/general/forall.hpp>
-#include "linalg/rap.hpp"
 
 namespace palace
 {
@@ -151,9 +150,10 @@ inline void ApplyOrderK(const double sd, const double sr, const ComplexVector &d
 }  // namespace
 
 template <typename OperType>
-ChebyshevSmoother<OperType>::ChebyshevSmoother(int smooth_it, int poly_order, double sf_max)
-  : Solver<OperType>(), pc_it(smooth_it), order(poly_order), A(nullptr), lambda_max(0.0),
-    sf_max(sf_max)
+ChebyshevSmoother<OperType>::ChebyshevSmoother(MPI_Comm comm, int smooth_it, int poly_order,
+                                               double sf_max)
+  : Solver<OperType>(), comm(comm), pc_it(smooth_it), order(poly_order), A(nullptr),
+    lambda_max(0.0), sf_max(sf_max)
 {
   MFEM_VERIFY(order > 0, "Polynomial order for Chebyshev smoothing must be positive!");
 }
@@ -161,24 +161,16 @@ ChebyshevSmoother<OperType>::ChebyshevSmoother(int smooth_it, int poly_order, do
 template <typename OperType>
 void ChebyshevSmoother<OperType>::SetOperator(const OperType &op)
 {
-  using ParOperType =
-      typename std::conditional<std::is_same<OperType, ComplexOperator>::value,
-                                ComplexParOperator, ParOperator>::type;
-
   A = &op;
   r.SetSize(op.Height());
   d.SetSize(op.Height());
-
-  const auto *PtAP = dynamic_cast<const ParOperType *>(&op);
-  MFEM_VERIFY(PtAP,
-              "ChebyshevSmoother requires a ParOperator or ComplexParOperator operator!");
   dinv.SetSize(op.Height());
-  PtAP->AssembleDiagonal(dinv);
+  op.AssembleDiagonal(dinv);
   dinv.Reciprocal();
 
   // Set up Chebyshev coefficients using the computed maximum eigenvalue estimate. See
   // mfem::OperatorChebyshevSmoother or Adams et al. (2003).
-  lambda_max = sf_max * GetLambdaMax(PtAP->GetComm(), *A, dinv);
+  lambda_max = sf_max * GetLambdaMax(comm, *A, dinv);
 
   this->height = op.Height();
   this->width = op.Width();
@@ -217,10 +209,11 @@ void ChebyshevSmoother<OperType>::Mult(const VecType &x, VecType &y) const
 }
 
 template <typename OperType>
-ChebyshevSmoother1stKind<OperType>::ChebyshevSmoother1stKind(int smooth_it, int poly_order,
-                                                             double sf_max, double sf_min)
-  : Solver<OperType>(), pc_it(smooth_it), order(poly_order), A(nullptr), theta(0.0),
-    sf_max(sf_max), sf_min(sf_min)
+ChebyshevSmoother1stKind<OperType>::ChebyshevSmoother1stKind(MPI_Comm comm, int smooth_it,
+                                                             int poly_order, double sf_max,
+                                                             double sf_min)
+  : Solver<OperType>(), comm(comm), pc_it(smooth_it), order(poly_order), A(nullptr),
+    theta(0.0), sf_max(sf_max), sf_min(sf_min)
 {
   MFEM_VERIFY(order > 0, "Polynomial order for Chebyshev smoothing must be positive!");
 }
@@ -228,20 +221,11 @@ ChebyshevSmoother1stKind<OperType>::ChebyshevSmoother1stKind(int smooth_it, int
 template <typename OperType>
 void ChebyshevSmoother1stKind<OperType>::SetOperator(const OperType &op)
 {
-  using ParOperType =
-      typename std::conditional<std::is_same<OperType, ComplexOperator>::value,
-                                ComplexParOperator, ParOperator>::type;
-
   A = &op;
   r.SetSize(op.Height());
   d.SetSize(op.Height());
-
-  const auto *PtAP = dynamic_cast<const ParOperType *>(&op);
-  MFEM_VERIFY(
-      PtAP,
-      "ChebyshevSmoother1stKind requires a ParOperator or ComplexParOperator operator!");
   dinv.SetSize(op.Height());
-  PtAP->AssembleDiagonal(dinv);
+  op.AssembleDiagonal(dinv);
   dinv.Reciprocal();
 
   // Set up Chebyshev coefficients using the computed maximum eigenvalue estimate. The
@@ -250,7 +234,7 @@ void ChebyshevSmoother1stKind<OperType>::SetOperator(const OperType &op)
   {
     sf_min = 1.69 / (std::pow(order, 1.68) + 2.11 * order + 1.98);
   }
-  const double lambda_max = sf_max * GetLambdaMax(PtAP->GetComm(), *A, dinv);
+  const double lambda_max = sf_max * GetLambdaMax(comm, *A, dinv);
   const double lambda_min = sf_min * lambda_max;
   theta = 0.5 * (lambda_max + lambda_min);
   delta = 0.5 * (lambda_max - lambda_min);
diff --git a/palace/linalg/chebyshev.hpp b/palace/linalg/chebyshev.hpp
index 56df15a44..e37da8fde 100644
--- a/palace/linalg/chebyshev.hpp
+++ b/palace/linalg/chebyshev.hpp
@@ -25,6 +25,9 @@ class ChebyshevSmoother : public Solver<OperType>
   using VecType = typename Solver<OperType>::VecType;
 
 private:
+  // MPI communicator associated with the solver operator and vectors.
+  MPI_Comm comm;
+
   // Number of smoother iterations and polynomial order.
   const int pc_it, order;
 
@@ -41,7 +44,7 @@ class ChebyshevSmoother : public Solver<OperType>
   mutable VecType r, d;
 
 public:
-  ChebyshevSmoother(int smooth_it, int poly_order, double sf_max);
+  ChebyshevSmoother(MPI_Comm comm, int smooth_it, int poly_order, double sf_max);
 
   void SetOperator(const OperType &op) override;
 
@@ -65,6 +68,9 @@ class ChebyshevSmoother1stKind : public Solver<OperType>
   using VecType = typename Solver<OperType>::VecType;
 
 private:
+  // MPI communicator associated with the solver operator and vectors.
+  MPI_Comm comm;
+
   // Number of smoother iterations and polynomial order.
   const int pc_it, order;
 
@@ -82,7 +88,8 @@ class ChebyshevSmoother1stKind : public Solver<OperType>
   mutable VecType r, d;
 
 public:
-  ChebyshevSmoother1stKind(int smooth_it, int poly_order, double sf_max, double sf_min);
+  ChebyshevSmoother1stKind(MPI_Comm comm, int smooth_it, int poly_order, double sf_max,
+                           double sf_min);
 
   void SetOperator(const OperType &op) override;
 
diff --git a/palace/linalg/distrelaxation.cpp b/palace/linalg/distrelaxation.cpp
index 7ef9d3663..8eaa55696 100644
--- a/palace/linalg/distrelaxation.cpp
+++ b/palace/linalg/distrelaxation.cpp
@@ -12,7 +12,7 @@ namespace palace
 
 template <typename OperType>
 DistRelaxationSmoother<OperType>::DistRelaxationSmoother(
-    const Operator &G, int smooth_it, int cheby_smooth_it, int cheby_order,
+    MPI_Comm comm, const Operator &G, int smooth_it, int cheby_smooth_it, int cheby_order,
     double cheby_sf_max, double cheby_sf_min, bool cheby_4th_kind)
   : Solver<OperType>(), pc_it(smooth_it), G(&G), A(nullptr), A_G(nullptr),
     dbc_tdof_list_G(nullptr)
@@ -20,17 +20,17 @@ DistRelaxationSmoother<OperType>::DistRelaxationSmoother(
   // Initialize smoothers.
   if (cheby_4th_kind)
   {
-    B = std::make_unique<ChebyshevSmoother<OperType>>(cheby_smooth_it, cheby_order,
+    B = std::make_unique<ChebyshevSmoother<OperType>>(comm, cheby_smooth_it, cheby_order,
                                                       cheby_sf_max);
-    B_G = std::make_unique<ChebyshevSmoother<OperType>>(cheby_smooth_it, cheby_order,
+    B_G = std::make_unique<ChebyshevSmoother<OperType>>(comm, cheby_smooth_it, cheby_order,
                                                         cheby_sf_max);
   }
   else
   {
-    B = std::make_unique<ChebyshevSmoother1stKind<OperType>>(cheby_smooth_it, cheby_order,
-                                                             cheby_sf_max, cheby_sf_min);
-    B_G = std::make_unique<ChebyshevSmoother1stKind<OperType>>(cheby_smooth_it, cheby_order,
-                                                               cheby_sf_max, cheby_sf_min);
+    B = std::make_unique<ChebyshevSmoother1stKind<OperType>>(
+        comm, cheby_smooth_it, cheby_order, cheby_sf_max, cheby_sf_min);
+    B_G = std::make_unique<ChebyshevSmoother1stKind<OperType>>(
+        comm, cheby_smooth_it, cheby_order, cheby_sf_max, cheby_sf_min);
   }
   B_G->SetInitialGuess(false);
 }
diff --git a/palace/linalg/distrelaxation.hpp b/palace/linalg/distrelaxation.hpp
index c447f7186..cee3373ec 100644
--- a/palace/linalg/distrelaxation.hpp
+++ b/palace/linalg/distrelaxation.hpp
@@ -50,9 +50,9 @@ class DistRelaxationSmoother : public Solver<OperType>
   mutable VecType r, x_G, y_G;
 
 public:
-  DistRelaxationSmoother(const Operator &G, int smooth_it, int cheby_smooth_it,
-                         int cheby_order, double cheby_sf_max, double cheby_sf_min,
-                         bool cheby_4th_kind);
+  DistRelaxationSmoother(MPI_Comm comm, const Operator &G, int smooth_it,
+                         int cheby_smooth_it, int cheby_order, double cheby_sf_max,
+                         double cheby_sf_min, bool cheby_4th_kind);
 
   void SetOperator(const OperType &op) override
   {
diff --git a/palace/linalg/divfree.cpp b/palace/linalg/divfree.cpp
index e3ddd82f6..9eca1d4c9 100644
--- a/palace/linalg/divfree.cpp
+++ b/palace/linalg/divfree.cpp
@@ -49,14 +49,16 @@ DivFreeSolver::DivFreeSolver(const MaterialOperator &mat_op, FiniteElementSpace
   bdr_tdof_list_M = &h1_bdr_tdof_lists.back();
 
   // The system matrix for the projection is real and SPD.
-  auto amg =
-      std::make_unique<WrapperSolver<Operator>>(std::make_unique<BoomerAmgSolver>(1, 1, 0));
+  auto amg = std::make_unique<MfemWrapperSolver<Operator>>(
+      std::make_unique<BoomerAmgSolver>(1, 1, 0));
   std::unique_ptr<Solver<Operator>> pc;
   if (h1_fespaces.GetNumLevels() > 1)
   {
     const int mg_smooth_order =
         std::max(h1_fespaces.GetFinestFESpace().GetMaxElementOrder(), 2);
     pc = std::make_unique<GeometricMultigridSolver<Operator>>(
+        h1_fespaces.GetFinestFESpace().GetComm(),
+
         std::move(amg), h1_fespaces.GetProlongationOperators(), nullptr, 1, 1,
         mg_smooth_order, 1.0, 0.0, true);
   }
diff --git a/palace/linalg/gmg.cpp b/palace/linalg/gmg.cpp
index c10e6f329..27bd1961d 100644
--- a/palace/linalg/gmg.cpp
+++ b/palace/linalg/gmg.cpp
@@ -14,7 +14,7 @@ namespace palace
 
 template <typename OperType>
 GeometricMultigridSolver<OperType>::GeometricMultigridSolver(
-    std::unique_ptr<Solver<OperType>> &&coarse_solver,
+    MPI_Comm comm, std::unique_ptr<Solver<OperType>> &&coarse_solver,
     const std::vector<const Operator *> &P, const std::vector<const Operator *> *G,
     int cycle_it, int smooth_it, int cheby_order, double cheby_sf_max, double cheby_sf_min,
     bool cheby_4th_kind)
@@ -42,21 +42,21 @@ GeometricMultigridSolver<OperType>::GeometricMultigridSolver(
     {
       const int cheby_smooth_it = 1;
       B[l] = std::make_unique<DistRelaxationSmoother<OperType>>(
-          *(*G)[l], smooth_it, cheby_smooth_it, cheby_order, cheby_sf_max, cheby_sf_min,
-          cheby_4th_kind);
+          comm, *(*G)[l], smooth_it, cheby_smooth_it, cheby_order, cheby_sf_max,
+          cheby_sf_min, cheby_4th_kind);
     }
     else
     {
       const int cheby_smooth_it = smooth_it;
       if (cheby_4th_kind)
       {
-        B[l] = std::make_unique<ChebyshevSmoother<OperType>>(cheby_smooth_it, cheby_order,
-                                                             cheby_sf_max);
+        B[l] = std::make_unique<ChebyshevSmoother<OperType>>(comm, cheby_smooth_it,
+                                                             cheby_order, cheby_sf_max);
       }
       else
       {
         B[l] = std::make_unique<ChebyshevSmoother1stKind<OperType>>(
-            cheby_smooth_it, cheby_order, cheby_sf_max, cheby_sf_min);
+            comm, cheby_smooth_it, cheby_order, cheby_sf_max, cheby_sf_min);
       }
     }
   }
@@ -114,6 +114,7 @@ void GeometricMultigridSolver<OperType>::SetOperator(const OperType &op)
     Y[l].SetSize(A[l]->Height());
     R[l].SetSize(A[l]->Height());
   }
+
   this->height = op.Height();
   this->width = op.Width();
 }
diff --git a/palace/linalg/gmg.hpp b/palace/linalg/gmg.hpp
index 1e4cd170b..8e55dadb3 100644
--- a/palace/linalg/gmg.hpp
+++ b/palace/linalg/gmg.hpp
@@ -57,17 +57,17 @@ class GeometricMultigridSolver : public Solver<OperType>
   void VCycle(int l, bool initial_guess) const;
 
 public:
-  GeometricMultigridSolver(std::unique_ptr<Solver<OperType>> &&coarse_solver,
+  GeometricMultigridSolver(MPI_Comm comm, std::unique_ptr<Solver<OperType>> &&coarse_solver,
                            const std::vector<const Operator *> &P,
                            const std::vector<const Operator *> *G, int cycle_it,
                            int smooth_it, int cheby_order, double cheby_sf_max,
                            double cheby_sf_min, bool cheby_4th_kind);
-  GeometricMultigridSolver(const IoData &iodata,
+  GeometricMultigridSolver(MPI_Comm comm, const IoData &iodata,
                            std::unique_ptr<Solver<OperType>> &&coarse_solver,
                            const std::vector<const Operator *> &P,
                            const std::vector<const Operator *> *G = nullptr)
     : GeometricMultigridSolver(
-          std::move(coarse_solver), P, G, iodata.solver.linear.mg_cycle_it,
+          comm, std::move(coarse_solver), P, G, iodata.solver.linear.mg_cycle_it,
           iodata.solver.linear.mg_smooth_it, iodata.solver.linear.mg_smooth_order,
           iodata.solver.linear.mg_smooth_sf_max, iodata.solver.linear.mg_smooth_sf_min,
           iodata.solver.linear.mg_smooth_cheby_4th)
diff --git a/palace/linalg/hcurl.cpp b/palace/linalg/hcurl.cpp
index e5639e41d..adb55b9fc 100644
--- a/palace/linalg/hcurl.cpp
+++ b/palace/linalg/hcurl.cpp
@@ -67,7 +67,7 @@ WeightedHCurlNormSolver::WeightedHCurlNormSolver(
 
   // The system matrix K + M is real and SPD. We use Hypre's AMS solver as the coarse-level
   // multigrid solve.
-  auto ams = std::make_unique<WrapperSolver<Operator>>(std::make_unique<HypreAmsSolver>(
+  auto ams = std::make_unique<MfemWrapperSolver<Operator>>(std::make_unique<HypreAmsSolver>(
       nd_fespaces.GetFESpaceAtLevel(0), h1_fespaces.GetFESpaceAtLevel(0), 1, 1, 1, false,
       false, 0));
   std::unique_ptr<Solver<Operator>> pc;
@@ -77,8 +77,8 @@ WeightedHCurlNormSolver::WeightedHCurlNormSolver(
     const int mg_smooth_order =
         std::max(nd_fespaces.GetFinestFESpace().GetMaxElementOrder(), 2);
     pc = std::make_unique<GeometricMultigridSolver<Operator>>(
-        std::move(ams), nd_fespaces.GetProlongationOperators(), &G, 1, 1, mg_smooth_order,
-        1.0, 0.0, true);
+        nd_fespaces.GetFinestFESpace().GetComm(), std::move(ams),
+        nd_fespaces.GetProlongationOperators(), &G, 1, 1, mg_smooth_order, 1.0, 0.0, true);
   }
   else
   {
diff --git a/palace/linalg/jacobi.cpp b/palace/linalg/jacobi.cpp
index 384884154..39546c694 100644
--- a/palace/linalg/jacobi.cpp
+++ b/palace/linalg/jacobi.cpp
@@ -4,7 +4,6 @@
 #include "jacobi.hpp"
 
 #include <mfem/general/forall.hpp>
-#include "linalg/rap.hpp"
 
 namespace palace
 {
@@ -57,15 +56,8 @@ inline void Apply(const ComplexVector &dinv, const ComplexVector &x, ComplexVect
 template <typename OperType>
 void JacobiSmoother<OperType>::SetOperator(const OperType &op)
 {
-  using ParOperType =
-      typename std::conditional<std::is_same<OperType, ComplexOperator>::value,
-                                ComplexParOperator, ParOperator>::type;
-
-  const auto *PtAP = dynamic_cast<const ParOperType *>(&op);
-  MFEM_VERIFY(PtAP,
-              "JacobiSmoother requires a ParOperator or ComplexParOperator operator!");
   dinv.SetSize(op.Height());
-  PtAP->AssembleDiagonal(dinv);
+  op.AssembleDiagonal(dinv);
   dinv.Reciprocal();
 
   this->height = op.Height();
diff --git a/palace/linalg/ksp.cpp b/palace/linalg/ksp.cpp
index 29b793d5b..a87ccd40a 100644
--- a/palace/linalg/ksp.cpp
+++ b/palace/linalg/ksp.cpp
@@ -115,8 +115,22 @@ std::unique_ptr<IterativeSolver<OperType>> ConfigureKrylovSolver(MPI_Comm comm,
 template <typename OperType, typename T, typename... U>
 auto MakeWrapperSolver(U &&...args)
 {
-  return std::make_unique<WrapperSolver<OperType>>(
-      std::make_unique<T>(std::forward<U>(args)...));
+  // Sparse direct solver types copy the input matrix, so there is no need to save the
+  // parallel assembled operator.
+  constexpr bool save_assembled = !(false ||
+#if defined(MFEM_USE_SUPERLU)
+                                    std::is_same<T, SuperLUSolver>::value ||
+#endif
+#if defined(MFEM_USE_STRUMPACK)
+                                    std::is_same<T, StrumpackSolver>::value ||
+                                    std::is_same<T, StrumpackMixedPrecisionSolver>::value ||
+#endif
+#if defined(MFEM_USE_MUMPS)
+                                    std::is_same<T, MumpsSolver>::value ||
+#endif
+                                    false);
+  return std::make_unique<MfemWrapperSolver<OperType>>(
+      std::make_unique<T>(std::forward<U>(args)...), save_assembled);
 }
 
 template <typename OperType>
@@ -199,12 +213,12 @@ ConfigurePreconditionerSolver(MPI_Comm comm, const IoData &iodata,
                                   "primary space and auxiliary spaces for construction!");
         const auto G = aux_fespaces->GetDiscreteInterpolators();
         return std::make_unique<GeometricMultigridSolver<OperType>>(
-            iodata, std::move(pc), fespaces.GetProlongationOperators(), &G);
+            comm, iodata, std::move(pc), fespaces.GetProlongationOperators(), &G);
       }
       else
       {
         return std::make_unique<GeometricMultigridSolver<OperType>>(
-            iodata, std::move(pc), fespaces.GetProlongationOperators());
+            comm, iodata, std::move(pc), fespaces.GetProlongationOperators());
       }
     }();
     gmg->EnableTimer();  // Enable timing for primary geometric multigrid solver
diff --git a/palace/linalg/mumps.cpp b/palace/linalg/mumps.cpp
index b3a5a2815..5be5fe7c8 100644
--- a/palace/linalg/mumps.cpp
+++ b/palace/linalg/mumps.cpp
@@ -5,8 +5,6 @@
 
 #if defined(MFEM_USE_MUMPS)
 
-#include "linalg/rap.hpp"
-
 namespace palace
 {
 
@@ -50,19 +48,6 @@ MumpsSolver::MumpsSolver(MPI_Comm comm, mfem::MUMPSSolver::MatType sym,
   }
 }
 
-void MumpsSolver::SetOperator(const Operator &op)
-{
-  const auto *PtAP = dynamic_cast<const ParOperator *>(&op);
-  if (PtAP)
-  {
-    mfem::MUMPSSolver::SetOperator(PtAP->ParallelAssemble());
-  }
-  else
-  {
-    mfem::MUMPSSolver::SetOperator(op);
-  }
-}
-
 }  // namespace palace
 
 #endif
diff --git a/palace/linalg/mumps.hpp b/palace/linalg/mumps.hpp
index f98bd0266..fa054e9b1 100644
--- a/palace/linalg/mumps.hpp
+++ b/palace/linalg/mumps.hpp
@@ -8,7 +8,6 @@
 
 #if defined(MFEM_USE_MUMPS)
 
-#include "linalg/operator.hpp"
 #include "utils/iodata.hpp"
 
 namespace palace
@@ -38,8 +37,6 @@ class MumpsSolver : public mfem::MUMPSSolver
                   print)
   {
   }
-
-  void SetOperator(const Operator &op) override;
 };
 
 }  // namespace palace
diff --git a/palace/linalg/operator.cpp b/palace/linalg/operator.cpp
index bf6a0c701..8f655a4dc 100644
--- a/palace/linalg/operator.cpp
+++ b/palace/linalg/operator.cpp
@@ -10,52 +10,21 @@
 namespace palace
 {
 
-bool ComplexOperator::IsReal() const
-{
-  MFEM_ABORT("IsReal() is not implemented for base class ComplexOperator!");
-  return false;
-}
-
-bool ComplexOperator::IsImag() const
-{
-  MFEM_ABORT("IsImag() is not implemented for base class ComplexOperator!");
-  return false;
-}
-
-bool ComplexOperator::HasReal() const
-{
-  MFEM_ABORT("HasReal() is not implemented for base class ComplexOperator!");
-  return false;
-}
-
-bool ComplexOperator::HasImag() const
-{
-  MFEM_ABORT("HasImag() is not implemented for base class ComplexOperator!");
-  return false;
-}
-
 const Operator *ComplexOperator::Real() const
 {
   MFEM_ABORT("Real() is not implemented for base class ComplexOperator!");
   return nullptr;
 }
 
-Operator *ComplexOperator::Real()
-{
-  MFEM_ABORT("Real() is not implemented for base class ComplexOperator!");
-  return nullptr;
-}
-
 const Operator *ComplexOperator::Imag() const
 {
   MFEM_ABORT("Imag() is not implemented for base class ComplexOperator!");
   return nullptr;
 }
 
-Operator *ComplexOperator::Imag()
+void ComplexOperator::AssembleDiagonal(ComplexVector &diag) const
 {
-  MFEM_ABORT("Imag() is not implemented for base class ComplexOperator!");
-  return nullptr;
+  MFEM_ABORT("Base class ComplexOperator does not implement AssembleDiagonal!");
 }
 
 void ComplexOperator::MultTranspose(const ComplexVector &x, ComplexVector &y) const
@@ -88,7 +57,7 @@ void ComplexOperator::AddMultHermitianTranspose(const ComplexVector &x, ComplexV
 
 ComplexWrapperOperator::ComplexWrapperOperator(std::unique_ptr<Operator> &&dAr,
                                                std::unique_ptr<Operator> &&dAi,
-                                               Operator *pAr, Operator *pAi)
+                                               const Operator *pAr, const Operator *pAi)
   : ComplexOperator(), data_Ar(std::move(dAr)), data_Ai(std::move(dAi)),
     Ar((data_Ar != nullptr) ? data_Ar.get() : pAr),
     Ai((data_Ai != nullptr) ? data_Ai.get() : pAi)
@@ -106,11 +75,24 @@ ComplexWrapperOperator::ComplexWrapperOperator(std::unique_ptr<Operator> &&Ar,
 {
 }
 
-ComplexWrapperOperator::ComplexWrapperOperator(Operator *Ar, Operator *Ai)
+ComplexWrapperOperator::ComplexWrapperOperator(const Operator *Ar, const Operator *Ai)
   : ComplexWrapperOperator(nullptr, nullptr, Ar, Ai)
 {
 }
 
+void ComplexWrapperOperator::AssembleDiagonal(ComplexVector &diag) const
+{
+  diag = 0.0;
+  if (Ar)
+  {
+    Ar->AssembleDiagonal(diag.Real());
+  }
+  if (Ai)
+  {
+    Ai->AssembleDiagonal(diag.Imag());
+  }
+}
+
 void ComplexWrapperOperator::Mult(const ComplexVector &x, ComplexVector &y) const
 {
   constexpr bool zero_real = false;
diff --git a/palace/linalg/operator.hpp b/palace/linalg/operator.hpp
index b02575425..78dc92534 100644
--- a/palace/linalg/operator.hpp
+++ b/palace/linalg/operator.hpp
@@ -37,18 +37,16 @@ class ComplexOperator
   int Width() const { return width; }
 
   // Test whether or not the operator is purely real or imaginary.
-  virtual bool IsReal() const;
-  virtual bool IsImag() const;
+  virtual bool IsReal() const { return !Imag(); }
+  virtual bool IsImag() const { return !Real(); }
 
-  // Test whether or not we can access the real and imaginary operator parts.
-  virtual bool HasReal() const;
-  virtual bool HasImag() const;
-
-  // Get access to the real and imaginary operator parts.
+  // Get access to the real and imaginary operator parts separately (may be empty if
+  // operator is purely real or imaginary).
   virtual const Operator *Real() const;
-  virtual Operator *Real();
   virtual const Operator *Imag() const;
-  virtual Operator *Imag();
+
+  // Diagonal assembly.
+  virtual void AssembleDiagonal(ComplexVector &diag) const;
 
   // Operator application.
   virtual void Mult(const ComplexVector &x, ComplexVector &y) const = 0;
@@ -75,13 +73,13 @@ class ComplexWrapperOperator : public ComplexOperator
 private:
   // Storage and access for real and imaginary parts of the operator.
   std::unique_ptr<Operator> data_Ar, data_Ai;
-  Operator *Ar, *Ai;
+  const Operator *Ar, *Ai;
 
   // Temporary storage for operator application.
   mutable ComplexVector tx, ty;
 
   ComplexWrapperOperator(std::unique_ptr<Operator> &&dAr, std::unique_ptr<Operator> &&dAi,
-                         Operator *pAr, Operator *pAi);
+                         const Operator *pAr, const Operator *pAi);
 
 public:
   // Construct a complex operator which inherits ownership of the input real and imaginary
@@ -89,16 +87,12 @@ class ComplexWrapperOperator : public ComplexOperator
   ComplexWrapperOperator(std::unique_ptr<Operator> &&Ar, std::unique_ptr<Operator> &&Ai);
 
   // Non-owning constructor.
-  ComplexWrapperOperator(Operator *Ar, Operator *Ai);
+  ComplexWrapperOperator(const Operator *Ar, const Operator *Ai);
 
-  bool IsReal() const override { return Ai == nullptr; }
-  bool IsImag() const override { return Ar == nullptr; }
-  bool HasReal() const override { return Ar != nullptr; }
-  bool HasImag() const override { return Ai != nullptr; }
   const Operator *Real() const override { return Ar; }
-  Operator *Real() override { return Ar; }
   const Operator *Imag() const override { return Ai; }
-  Operator *Imag() override { return Ai; }
+
+  void AssembleDiagonal(ComplexVector &diag) const override;
 
   void Mult(const ComplexVector &x, ComplexVector &y) const override;
 
diff --git a/palace/linalg/rap.cpp b/palace/linalg/rap.cpp
index 30ee39246..1fe426080 100644
--- a/palace/linalg/rap.cpp
+++ b/palace/linalg/rap.cpp
@@ -8,7 +8,7 @@
 namespace palace
 {
 
-ParOperator::ParOperator(std::unique_ptr<Operator> &&dA, Operator *pA,
+ParOperator::ParOperator(std::unique_ptr<Operator> &&dA, const Operator *pA,
                          const mfem::ParFiniteElementSpace &trial_fespace,
                          const mfem::ParFiniteElementSpace &test_fespace,
                          bool test_restrict)
@@ -31,25 +31,14 @@ ParOperator::ParOperator(std::unique_ptr<Operator> &&A,
 {
 }
 
-ParOperator::ParOperator(Operator &A, const mfem::ParFiniteElementSpace &trial_fespace,
+ParOperator::ParOperator(const Operator &A,
+                         const mfem::ParFiniteElementSpace &trial_fespace,
                          const mfem::ParFiniteElementSpace &test_fespace,
                          bool test_restrict)
   : ParOperator(nullptr, &A, trial_fespace, test_fespace, test_restrict)
 {
 }
 
-const Operator &ParOperator::LocalOperator() const
-{
-  MFEM_VERIFY(A, "No local matrix available for ParOperator::LocalOperator!");
-  return *A;
-}
-
-Operator &ParOperator::LocalOperator()
-{
-  MFEM_VERIFY(A, "No local matrix available for ParOperator::LocalOperator!");
-  return *A;
-}
-
 void ParOperator::SetEssentialTrueDofs(const mfem::Array<int> &tdof_list,
                                        DiagonalPolicy policy)
 {
@@ -62,47 +51,29 @@ void ParOperator::SetEssentialTrueDofs(const mfem::Array<int> &tdof_list,
   diag_policy = policy;
 }
 
-void ParOperator::AssembleDiagonal(Vector &diag) const
+void ParOperator::EliminateRHS(const Vector &x, Vector &b) const
 {
-  if (RAP)
+  if (!dbc_tdof_list)
   {
-    RAP->AssembleDiagonal(diag);
     return;
   }
 
-  // For an AMR mesh, a convergent diagonal is assembled with |P|ᵀ dₗ, where |P| has
-  // entry-wise absolute values of the conforming prolongation operator.
-  MFEM_VERIFY(&trial_fespace == &test_fespace,
-              "Diagonal assembly is only available for square ParOperator!");
-  if (const auto *sA = dynamic_cast<const mfem::SparseMatrix *>(A))
-  {
-    sA->GetDiag(ly);
-  }
-  else
-  {
-    A->AssembleDiagonal(ly);
-  }
+  MFEM_VERIFY(A, "No local matrix available for ParOperator::EliminateRHS!");
+  ty = 0.0;
+  linalg::SetSubVector(ty, *dbc_tdof_list, x);
+  trial_fespace.GetProlongationMatrix()->Mult(ty, lx);
 
-  // Parallel assemble and eliminate essential true dofs.
-  const Operator *P = test_fespace.GetProlongationMatrix();
-  if (const auto *hP = dynamic_cast<const mfem::HypreParMatrix *>(P))
-  {
-    hP->AbsMultTranspose(1.0, ly, 0.0, diag);
-  }
-  else
+  // Apply the unconstrained operator.
+  A->Mult(lx, ly);
+
+  RestrictionMatrixAddMult(ly, b, -1.0);
+  if (diag_policy == DiagonalPolicy::DIAG_ONE)
   {
-    P->MultTranspose(ly, diag);
+    linalg::SetSubVector(b, *dbc_tdof_list, x);
   }
-  if (dbc_tdof_list)
+  else if (diag_policy == DiagonalPolicy::DIAG_ZERO)
   {
-    if (diag_policy == DiagonalPolicy::DIAG_ONE)
-    {
-      linalg::SetSubVector(diag, *dbc_tdof_list, 1.0);
-    }
-    else if (diag_policy == DiagonalPolicy::DIAG_ZERO)
-    {
-      linalg::SetSubVector(diag, *dbc_tdof_list, 0.0);
-    }
+    linalg::SetSubVector(b, *dbc_tdof_list, 0.0);
   }
 }
 
@@ -114,22 +85,21 @@ mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const
   }
 
   // Build the square or rectangular assembled HypreParMatrix.
-  auto *sA = dynamic_cast<mfem::SparseMatrix *>(A);
+  const auto *sA = dynamic_cast<const mfem::SparseMatrix *>(A);
   std::unique_ptr<mfem::SparseMatrix> data_sA;
   if (!sA)
   {
-    auto *cA = dynamic_cast<ceed::Operator *>(A);
+    const auto *cA = dynamic_cast<const ceed::Operator *>(A);
     MFEM_VERIFY(cA, "ParOperator::ParallelAssemble requires A as an mfem::SparseMatrix or "
                     "ceed::Operator!");
-    data_sA = use_R ? DiscreteLinearOperator::FullAssemble(*cA, skip_zeros)
-                    : BilinearForm::FullAssemble(*cA, skip_zeros);
+    data_sA = BilinearForm::FullAssemble(*cA, skip_zeros, use_R);
     sA = data_sA.get();
   }
   if (&trial_fespace == &test_fespace)
   {
-    mfem::HypreParMatrix *hA =
-        new mfem::HypreParMatrix(trial_fespace.GetComm(), trial_fespace.GlobalVSize(),
-                                 trial_fespace.GetDofOffsets(), sA);
+    mfem::HypreParMatrix *hA = new mfem::HypreParMatrix(
+        trial_fespace.GetComm(), trial_fespace.GlobalVSize(), trial_fespace.GetDofOffsets(),
+        const_cast<mfem::SparseMatrix *>(sA));
     const mfem::HypreParMatrix *P = trial_fespace.Dof_TrueDof_Matrix();
     RAP = std::make_unique<mfem::HypreParMatrix>(hypre_ParCSRMatrixRAP(*P, *hA, *P), true);
     delete hA;
@@ -138,7 +108,8 @@ mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const
   {
     mfem::HypreParMatrix *hA = new mfem::HypreParMatrix(
         trial_fespace.GetComm(), test_fespace.GlobalVSize(), trial_fespace.GlobalVSize(),
-        test_fespace.GetDofOffsets(), trial_fespace.GetDofOffsets(), sA);
+        test_fespace.GetDofOffsets(), trial_fespace.GetDofOffsets(),
+        const_cast<mfem::SparseMatrix *>(sA));
     const mfem::HypreParMatrix *P = trial_fespace.Dof_TrueDof_Matrix();
     if (!use_R)
     {
@@ -160,12 +131,6 @@ mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const
     }
     delete hA;
   }
-  if (data_A)
-  {
-    // The local matrix is no longer needed now that we have the parallel-assembled one.
-    data_A.reset();
-    A = nullptr;
-  }
   hypre_ParCSRMatrixSetNumNonzeros(*RAP);
 
   // Eliminate boundary conditions on the assembled (square) matrix.
@@ -176,29 +141,47 @@ mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const
   return *RAP;
 }
 
-void ParOperator::EliminateRHS(const Vector &x, Vector &b) const
+void ParOperator::AssembleDiagonal(Vector &diag) const
 {
-  if (!dbc_tdof_list)
+  if (RAP)
   {
+    RAP->AssembleDiagonal(diag);
     return;
   }
 
-  MFEM_VERIFY(A, "No local matrix available for ParOperator::EliminateRHS!");
-  ty = 0.0;
-  linalg::SetSubVector(ty, *dbc_tdof_list, x);
-  trial_fespace.GetProlongationMatrix()->Mult(ty, lx);
-
-  // Apply the unconstrained operator.
-  A->Mult(lx, ly);
+  // For an AMR mesh, a convergent diagonal is assembled with |P|ᵀ dₗ, where |P| has
+  // entry-wise absolute values of the conforming prolongation operator.
+  MFEM_VERIFY(&trial_fespace == &test_fespace,
+              "Diagonal assembly is only available for square ParOperator!");
+  if (const auto *sA = dynamic_cast<const mfem::SparseMatrix *>(A))
+  {
+    sA->GetDiag(ly);
+  }
+  else
+  {
+    A->AssembleDiagonal(ly);
+  }
 
-  RestrictionMatrixAddMult(ly, b, -1.0);
-  if (diag_policy == DiagonalPolicy::DIAG_ONE)
+  // Parallel assemble and eliminate essential true dofs.
+  const Operator *P = test_fespace.GetProlongationMatrix();
+  if (const auto *hP = dynamic_cast<const mfem::HypreParMatrix *>(P))
   {
-    linalg::SetSubVector(b, *dbc_tdof_list, x);
+    hP->AbsMultTranspose(1.0, ly, 0.0, diag);
   }
-  else if (diag_policy == DiagonalPolicy::DIAG_ZERO)
+  else
   {
-    linalg::SetSubVector(b, *dbc_tdof_list, 0.0);
+    P->MultTranspose(ly, diag);
+  }
+  if (dbc_tdof_list)
+  {
+    if (diag_policy == DiagonalPolicy::DIAG_ONE)
+    {
+      linalg::SetSubVector(diag, *dbc_tdof_list, 1.0);
+    }
+    else if (diag_policy == DiagonalPolicy::DIAG_ZERO)
+    {
+      linalg::SetSubVector(diag, *dbc_tdof_list, 0.0);
+    }
   }
 }
 
@@ -402,8 +385,8 @@ void ParOperator::RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) c
 }
 
 ComplexParOperator::ComplexParOperator(std::unique_ptr<Operator> &&dAr,
-                                       std::unique_ptr<Operator> &&dAi, Operator *pAr,
-                                       Operator *pAi,
+                                       std::unique_ptr<Operator> &&dAi, const Operator *pAr,
+                                       const Operator *pAi,
                                        const mfem::ParFiniteElementSpace &trial_fespace,
                                        const mfem::ParFiniteElementSpace &test_fespace,
                                        bool test_restrict)
@@ -414,16 +397,16 @@ ComplexParOperator::ComplexParOperator(std::unique_ptr<Operator> &&dAr,
     A(data_A.get()), trial_fespace(trial_fespace), test_fespace(test_fespace),
     use_R(test_restrict), dbc_tdof_list(nullptr),
     diag_policy(Operator::DiagonalPolicy::DIAG_ONE),
-    RAPr(A->HasReal()
+    RAPr(A->Real()
              ? std::make_unique<ParOperator>(*A->Real(), trial_fespace, test_fespace, use_R)
              : nullptr),
-    RAPi(A->HasImag()
+    RAPi(A->Imag()
              ? std::make_unique<ParOperator>(*A->Imag(), trial_fespace, test_fespace, use_R)
              : nullptr)
 {
-  // We use the non-owning constructors for real and imaginary part ParOperators. We know A
-  // is a ComplexWrapperOperator which has separate access to the real and imaginary
-  // components.
+  // We use the non-owning constructors for real and imaginary part ParOperators, since we
+  // construct A as a ComplexWrapperOperator which has separate access to the real and
+  // imaginary components.
   lx.SetSize(A->Width());
   ly.SetSize(A->Height());
   ty.SetSize(width);
@@ -439,7 +422,7 @@ ComplexParOperator::ComplexParOperator(std::unique_ptr<Operator> &&Ar,
 {
 }
 
-ComplexParOperator::ComplexParOperator(Operator *Ar, Operator *Ai,
+ComplexParOperator::ComplexParOperator(const Operator *Ar, const Operator *Ai,
                                        const mfem::ParFiniteElementSpace &trial_fespace,
                                        const mfem::ParFiniteElementSpace &test_fespace,
                                        bool test_restrict)
@@ -447,18 +430,6 @@ ComplexParOperator::ComplexParOperator(Operator *Ar, Operator *Ai,
 {
 }
 
-const ComplexOperator &ComplexParOperator::LocalOperator() const
-{
-  MFEM_ASSERT(A, "No local matrix available for ComplexParOperator::LocalOperator!");
-  return *A;
-}
-
-ComplexOperator &ComplexParOperator::LocalOperator()
-{
-  MFEM_ASSERT(A, "No local matrix available for ComplexParOperator::LocalOperator!");
-  return *A;
-}
-
 void ComplexParOperator::SetEssentialTrueDofs(const mfem::Array<int> &tdof_list,
                                               Operator::DiagonalPolicy policy)
 {
diff --git a/palace/linalg/rap.hpp b/palace/linalg/rap.hpp
index 14c27332b..bbfa00edb 100644
--- a/palace/linalg/rap.hpp
+++ b/palace/linalg/rap.hpp
@@ -27,8 +27,8 @@ class ParOperator : public Operator
 {
 private:
   // Storage and access for the local operator.
-  mutable std::unique_ptr<Operator> data_A;
-  mutable Operator *A;
+  std::unique_ptr<Operator> data_A;
+  const Operator *A;
 
   // Finite element spaces for parallel prolongation and restriction.
   const mfem::ParFiniteElementSpace &trial_fespace, &test_fespace;
@@ -52,7 +52,7 @@ class ParOperator : public Operator
   void RestrictionMatrixAddMult(const Vector &ly, Vector &ty, const double a) const;
   void RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) const;
 
-  ParOperator(std::unique_ptr<Operator> &&dA, Operator *pA,
+  ParOperator(std::unique_ptr<Operator> &&dA, const Operator *pA,
               const mfem::ParFiniteElementSpace &trial_fespace,
               const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict);
 
@@ -67,16 +67,15 @@ class ParOperator : public Operator
   }
 
   // Non-owning constructors.
-  ParOperator(Operator &A, const mfem::ParFiniteElementSpace &trial_fespace,
+  ParOperator(const Operator &A, const mfem::ParFiniteElementSpace &trial_fespace,
               const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict);
-  ParOperator(Operator &A, const mfem::ParFiniteElementSpace &fespace)
+  ParOperator(const Operator &A, const mfem::ParFiniteElementSpace &fespace)
     : ParOperator(A, fespace, fespace, false)
   {
   }
 
   // Get access to the underlying local (L-vector) operator.
-  const Operator &LocalOperator() const;
-  Operator &LocalOperator();
+  const Operator &LocalOperator() const { return *A; }
 
   // Get the associated MPI communicator.
   MPI_Comm GetComm() const { return trial_fespace.GetComm(); }
@@ -88,8 +87,9 @@ class ParOperator : public Operator
   // nullptr.
   const mfem::Array<int> *GetEssentialTrueDofs() const { return dbc_tdof_list; }
 
-  // Assemble the diagonal for the parallel operator.
-  void AssembleDiagonal(Vector &diag) const override;
+  // Eliminate essential true dofs from the RHS vector b, using the essential boundary
+  // condition values in x.
+  void EliminateRHS(const Vector &x, Vector &b) const;
 
   // Assemble the operator as a parallel sparse matrix. The memory associated with the
   // local operator is free'd.
@@ -102,9 +102,7 @@ class ParOperator : public Operator
     return std::move(RAP);
   }
 
-  // Eliminate essential true dofs from the RHS vector b, using the essential boundary
-  // condition values in x.
-  void EliminateRHS(const Vector &x, Vector &b) const;
+  void AssembleDiagonal(Vector &diag) const override;
 
   void Mult(const Vector &x, Vector &y) const override;
 
@@ -121,7 +119,7 @@ class ComplexParOperator : public ComplexOperator
 private:
   // Storage and access for the local operator.
   std::unique_ptr<ComplexWrapperOperator> data_A;
-  ComplexWrapperOperator *A;
+  const ComplexWrapperOperator *A;
 
   // Finite element spaces for parallel prolongation and restriction.
   const mfem::ParFiniteElementSpace &trial_fespace, &test_fespace;
@@ -146,7 +144,7 @@ class ComplexParOperator : public ComplexOperator
   void RestrictionMatrixMultTranspose(const ComplexVector &ty, ComplexVector &ly) const;
 
   ComplexParOperator(std::unique_ptr<Operator> &&dAr, std::unique_ptr<Operator> &&dAi,
-                     Operator *pAr, Operator *pAi,
+                     const Operator *pAr, const Operator *pAi,
                      const mfem::ParFiniteElementSpace &trial_fespace,
                      const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict);
 
@@ -163,17 +161,20 @@ class ComplexParOperator : public ComplexOperator
   }
 
   // Non-owning constructors.
-  ComplexParOperator(Operator *Ar, Operator *Ai,
+  ComplexParOperator(const Operator *Ar, const Operator *Ai,
                      const mfem::ParFiniteElementSpace &trial_fespace,
                      const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict);
-  ComplexParOperator(Operator *Ar, Operator *Ai, const mfem::ParFiniteElementSpace &fespace)
+  ComplexParOperator(const Operator *Ar, const Operator *Ai,
+                     const mfem::ParFiniteElementSpace &fespace)
     : ComplexParOperator(Ar, Ai, fespace, fespace, false)
   {
   }
 
+  const Operator *Real() const override { return RAPr.get(); }
+  const Operator *Imag() const override { return RAPi.get(); }
+
   // Get access to the underlying local (L-vector) operator.
-  const ComplexOperator &LocalOperator() const;
-  ComplexOperator &LocalOperator();
+  const ComplexOperator &LocalOperator() const { return *A; }
 
   // Get the associated MPI communicator.
   MPI_Comm GetComm() const { return trial_fespace.GetComm(); }
@@ -186,17 +187,7 @@ class ComplexParOperator : public ComplexOperator
   // nullptr.
   const mfem::Array<int> *GetEssentialTrueDofs() const { return dbc_tdof_list; }
 
-  // Assemble the diagonal for the parallel operator.
-  void AssembleDiagonal(ComplexVector &diag) const;
-
-  bool IsReal() const override { return A->IsReal(); }
-  bool IsImag() const override { return A->IsImag(); }
-  bool HasReal() const override { return RAPr != nullptr; }
-  bool HasImag() const override { return RAPi != nullptr; }
-  const Operator *Real() const override { return RAPr.get(); }
-  Operator *Real() override { return RAPr.get(); }
-  const Operator *Imag() const override { return RAPi.get(); }
-  Operator *Imag() override { return RAPi.get(); }
+  void AssembleDiagonal(ComplexVector &diag) const override;
 
   void Mult(const ComplexVector &x, ComplexVector &y) const override;
 
diff --git a/palace/linalg/solver.cpp b/palace/linalg/solver.cpp
index fcecbe39b..29584eaee 100644
--- a/palace/linalg/solver.cpp
+++ b/palace/linalg/solver.cpp
@@ -3,36 +3,113 @@
 
 #include "solver.hpp"
 
+#include "linalg/rap.hpp"
+
 namespace palace
 {
 
 template <>
-void WrapperSolver<Operator>::SetOperator(const Operator &op)
+void MfemWrapperSolver<Operator>::SetOperator(const Operator &op)
 {
-  pc->SetOperator(op);
+  // Operator is always assembled as a HypreParMatrix.
+  if (const auto *hA = dynamic_cast<const mfem::HypreParMatrix *>(&op))
+  {
+    pc->SetOperator(*hA);
+  }
+  else
+  {
+    const auto *PtAP = dynamic_cast<const ParOperator *>(&op);
+    MFEM_VERIFY(PtAP,
+                "MfemWrapperSolver must be able to construct a HypreParMatrix operator!");
+    pc->SetOperator(PtAP->ParallelAssemble());
+    if (!save_assembled)
+    {
+      PtAP->StealParallelAssemble();
+    }
+  }
   this->height = op.Height();
   this->width = op.Width();
 }
 
 template <>
-void WrapperSolver<ComplexOperator>::SetOperator(const ComplexOperator &op)
+void MfemWrapperSolver<ComplexOperator>::SetOperator(const ComplexOperator &op)
 {
-  MFEM_VERIFY(op.IsReal() && op.HasReal(),
-              "WrapperSolver::SetOperator requires an operator which is purely real for "
-              "mfem::Solver!");
-  pc->SetOperator(*op.Real());
+  // XX TODO: Test complex matrix assembly if coarse solve supports it
+  // Assemble the real and imaginary parts, then add.
+  const mfem::HypreParMatrix *hAr = nullptr, *hAi = nullptr;
+  const ParOperator *PtAPr = nullptr, *PtAPi = nullptr;
+  if (op.Real())
+  {
+    hAr = dynamic_cast<const mfem::HypreParMatrix *>(op.Real());
+    if (!hAr)
+    {
+      PtAPr = dynamic_cast<const ParOperator *>(op.Real());
+      MFEM_VERIFY(PtAPr,
+                  "MfemWrapperSolver must be able to construct a HypreParMatrix operator!");
+      hAr = &PtAPr->ParallelAssemble();
+    }
+  }
+  if (op.Imag())
+  {
+    hAi = dynamic_cast<const mfem::HypreParMatrix *>(op.Imag());
+    if (!hAi)
+    {
+      PtAPi = dynamic_cast<const ParOperator *>(op.Imag());
+      MFEM_VERIFY(PtAPi,
+                  "MfemWrapperSolver must be able to construct a HypreParMatrix operator!");
+      hAi = &PtAPi->ParallelAssemble();
+    }
+  }
+  if (hAr && hAi)
+  {
+    A.reset(mfem::Add(1.0, *hAr, 1.0, *hAi));
+    if (PtAPr)
+    {
+      PtAPr->StealParallelAssemble();
+    }
+    if (PtAPi)
+    {
+      PtAPi->StealParallelAssemble();
+    }
+    pc->SetOperator(*A);
+    if (!save_assembled)
+    {
+      A.reset();
+    }
+  }
+  else if (hAr)
+  {
+    pc->SetOperator(*hAr);
+    if (PtAPr && !save_assembled)
+    {
+      PtAPr->StealParallelAssemble();
+    }
+  }
+  else if (hAi)
+  {
+    pc->SetOperator(*hAi);
+    if (PtAPi && !save_assembled)
+    {
+      PtAPi->StealParallelAssemble();
+    }
+  }
+  else
+  {
+    MFEM_ABORT("Empty ComplexOperator for MfemWrapperSolver!");
+  }
   this->height = op.Height();
   this->width = op.Width();
 }
 
 template <>
-void WrapperSolver<Operator>::Mult(const Vector &x, Vector &y) const
+void MfemWrapperSolver<Operator>::Mult(const Vector &x, Vector &y) const
 {
   pc->Mult(x, y);
 }
 
 template <>
-void WrapperSolver<ComplexOperator>::Mult(const ComplexVector &x, ComplexVector &y) const
+void MfemWrapperSolver<ComplexOperator>::Mult(const ComplexVector &x,
+                                              ComplexVector &y) const
 {
   mfem::Array<const Vector *> X(2);
   mfem::Array<Vector *> Y(2);
diff --git a/palace/linalg/solver.hpp b/palace/linalg/solver.hpp
index 82755a1f2..33f3c2ed5 100644
--- a/palace/linalg/solver.hpp
+++ b/palace/linalg/solver.hpp
@@ -50,18 +50,28 @@ class Solver : public OperType
 };
 
 // This solver wraps a real-valued mfem::Solver for application to complex-valued problems
-// as a preconditioner inside of a Solver<OperType>
+// as a preconditioner inside of a Solver<OperType> or for assembling the matrix-free
+// preconditioner operator as an mfem::HypreParMatrix.
 template <typename OperType>
-class WrapperSolver : public Solver<OperType>
+class MfemWrapperSolver : public Solver<OperType>
 {
   using VecType = typename Solver<OperType>::VecType;
 
-protected:
+private:
+  // The actual mfem::Solver.
   std::unique_ptr<mfem::Solver> pc;
 
+  // Real-valued system matrix A = Ar + Ai in parallel assembled form.
+  std::unique_ptr<mfem::HypreParMatrix> A;
+
+  // Whether or not to save the parallel assembled matrix after calling
+  // mfem::Solver::SetOperator (some solvers copy their input).
+  bool save_assembled;
+
 public:
-  WrapperSolver(std::unique_ptr<mfem::Solver> &&pc)
-    : Solver<OperType>(pc->iterative_mode), pc(std::move(pc))
+  MfemWrapperSolver(std::unique_ptr<mfem::Solver> &&pc, bool save_assembled = true)
+    : Solver<OperType>(pc->iterative_mode), pc(std::move(pc)),
+      save_assembled(save_assembled)
   {
   }
 
@@ -71,6 +81,9 @@ class WrapperSolver : public Solver<OperType>
     pc->iterative_mode = guess;
   }
 
+  // Configure whether or not to save the assembled operator.
+  void SetSaveAssembled(bool save) { save_assembled = save; }
+
   void SetOperator(const OperType &op) override;
 
   void Mult(const VecType &x, VecType &y) const override;
diff --git a/palace/linalg/strumpack.cpp b/palace/linalg/strumpack.cpp
index 33cc1508a..8d690ab1d 100644
--- a/palace/linalg/strumpack.cpp
+++ b/palace/linalg/strumpack.cpp
@@ -5,8 +5,6 @@
 
 #if defined(MFEM_USE_STRUMPACK)
 
-#include "linalg/rap.hpp"
-
 namespace palace
 {
 
@@ -113,37 +111,33 @@ template <typename StrumpackSolverType>
 void StrumpackSolverBase<StrumpackSolverType>::SetOperator(const Operator &op)
 {
   // Convert the input operator to a distributed STRUMPACK matrix (always assume a symmetric
-  // sparsity pattern). This is very similar to the MFEM STRUMPACKRowLocMatrix from a
+  // sparsity pattern). This is very similar to the MFEM's STRUMPACKRowLocMatrix from a
   // HypreParMatrix but avoids using the communicator from the Hypre matrix in the case that
   // the solver is constructed on a different communicator.
-  const mfem::HypreParMatrix *hypA;
-  const auto *PtAP = dynamic_cast<const ParOperator *>(&op);
-  if (PtAP)
-  {
-    hypA = &PtAP->ParallelAssemble();
-  }
-  else
+  const auto *hA = dynamic_cast<const mfem::HypreParMatrix *>(&op);
+  MFEM_VERIFY(hA && hA->GetGlobalNumRows() == hA->GetGlobalNumCols(),
+              "StrumpackSolver requires a square HypreParMatrix operator!");
+  auto *parcsr = (hypre_ParCSRMatrix *)const_cast<mfem::HypreParMatrix &>(*hA);
+  hypre_CSRMatrix *csr = hypre_MergeDiagAndOffd(parcsr);
+#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
+  if (hypre_GetActualMemLocation(hypre_CSRMatrixMemoryLocation(csr)) != hypre_MEMORY_HOST)
   {
-    hypA = dynamic_cast<const mfem::HypreParMatrix *>(&op);
-    MFEM_VERIFY(hypA, "StrumpackSolver requires a HypreParMatrix operator!");
+    hypre_CSRMatrixMigrate(csr, HYPRE_MEMORY_HOST);
   }
-  auto *parcsr = (hypre_ParCSRMatrix *)const_cast<mfem::HypreParMatrix &>(*hypA);
-  hypA->HostRead();
-  hypre_CSRMatrix *csr = hypre_MergeDiagAndOffd(parcsr);
-  hypA->HypreRead();
+#endif
 
   // Create the STRUMPACKRowLocMatrix by taking the internal data from a hypre_CSRMatrix.
-  HYPRE_Int n_loc = csr->num_rows;
-  HYPRE_BigInt first_row = parcsr->first_row_index;
-  HYPRE_Int *I = csr->i;
-  HYPRE_BigInt *J = csr->big_j;
-  double *data = csr->data;
+  HYPRE_BigInt glob_n = hypre_ParCSRMatrixGlobalNumRows(parcsr);
+  HYPRE_BigInt first_row = hypre_ParCSRMatrixFirstRowIndex(parcsr);
+  HYPRE_Int n_loc = hypre_CSRMatrixNumRows(csr);
+  HYPRE_Int *I = hypre_CSRMatrixI(csr);
+  HYPRE_BigInt *J = hypre_CSRMatrixBigJ(csr);
+  double *data = hypre_CSRMatrixData(csr);
 
   // Safe to delete the matrix since STRUMPACK copies it on input. Also clean up the Hypre
   // data structure once we are done with it.
 #if !defined(HYPRE_BIGINT)
-  mfem::STRUMPACKRowLocMatrix A(comm, n_loc, first_row, hypA->GetGlobalNumRows(),
-                                hypA->GetGlobalNumCols(), I, J, data, true);
+  mfem::STRUMPACKRowLocMatrix A(comm, n_loc, first_row, glob_n, glob_n, I, J, data, true);
 #else
   int n_loc_int = static_cast<int>(n_loc);
   MFEM_ASSERT(n_loc == (HYPRE_Int)n_loc_int,
@@ -154,8 +148,8 @@ void StrumpackSolverBase<StrumpackSolverType>::SetOperator(const Operator &op)
     II[i] = static_cast<int>(I[i]);
     MFEM_ASSERT(I[i] == (HYPRE_Int)II[i], "Overflow error for local sparse matrix index!");
   }
-  mfem::STRUMPACKRowLocMatrix A(comm, n_loc_int, first_row, hypA->GetGlobalNumRows(),
-                                hypA->GetGlobalNumCols(), II, J, data, true);
+  mfem::STRUMPACKRowLocMatrix A(comm, n_loc_int, first_row, glob_n, glob_n, II.HostRead(),
+                                J, data, true);
 #endif
   StrumpackSolverType::SetOperator(A);
   hypre_CSRMatrixDestroy(csr);
diff --git a/palace/linalg/superlu.cpp b/palace/linalg/superlu.cpp
index 63eb2e331..551974c79 100644
--- a/palace/linalg/superlu.cpp
+++ b/palace/linalg/superlu.cpp
@@ -5,7 +5,6 @@
 
 #if defined(MFEM_USE_SUPERLU)
 
-#include "linalg/rap.hpp"
 #include "utils/communication.hpp"
 
 namespace palace
@@ -82,43 +81,40 @@ SuperLUSolver::SuperLUSolver(MPI_Comm comm, config::LinearSolverData::SymFactTyp
 
 void SuperLUSolver::SetOperator(const Operator &op)
 {
-  // For repeated factorizations, always reuse the sparsity pattern. This is very similar to
-  // the MFEM SuperLURowLocMatrix from a HypreParMatrix but avoids using the communicator
-  // from the Hypre matrix in the case that the solver is constructed on a different
-  // communicator.
+  // For repeated factorizations, always reuse the sparsity pattern.
   if (A)
   {
     solver.SetFact(mfem::superlu::SamePattern_SameRowPerm);
   }
-  const mfem::HypreParMatrix *hypA;
-  const auto *PtAP = dynamic_cast<const ParOperator *>(&op);
-  if (PtAP)
-  {
-    hypA = &PtAP->ParallelAssemble();
-  }
-  else
+
+  // This is very similar to the MFEM SuperLURowLocMatrix from a HypreParMatrix but avoids
+  // using the communicator from the Hypre matrix in the case that the solver is
+  // constructed on a different communicator.
+  const auto *hA = dynamic_cast<const mfem::HypreParMatrix *>(&op);
+  MFEM_VERIFY(hA && hA->GetGlobalNumRows() == hA->GetGlobalNumCols(),
+              "SuperLUSolver requires a square HypreParMatrix operator!");
+  auto *parcsr = (hypre_ParCSRMatrix *)const_cast<mfem::HypreParMatrix &>(*hA);
+  hypre_CSRMatrix *csr = hypre_MergeDiagAndOffd(parcsr);
+#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL)
+  if (hypre_GetActualMemLocation(hypre_CSRMatrixMemoryLocation(csr)) != hypre_MEMORY_HOST)
   {
-    hypA = dynamic_cast<const mfem::HypreParMatrix *>(&op);
-    MFEM_VERIFY(hypA, "SuperLUSolver requires a HypreParMatrix operator!");
+    hypre_CSRMatrixMigrate(csr, HYPRE_MEMORY_HOST);
   }
-  auto *parcsr = (hypre_ParCSRMatrix *)const_cast<mfem::HypreParMatrix &>(*hypA);
-  hypA->HostRead();
-  hypre_CSRMatrix *csr = hypre_MergeDiagAndOffd(parcsr);
-  hypA->HypreRead();
+#endif
 
   // Create the SuperLURowLocMatrix by taking the internal data from a hypre_CSRMatrix.
-  HYPRE_Int n_loc = csr->num_rows;
-  HYPRE_BigInt first_row = parcsr->first_row_index;
-  HYPRE_Int *I = csr->i;
-  HYPRE_BigInt *J = csr->big_j;
-  double *data = csr->data;
+  HYPRE_BigInt glob_n = hypre_ParCSRMatrixGlobalNumRows(parcsr);
+  HYPRE_BigInt first_row = hypre_ParCSRMatrixFirstRowIndex(parcsr);
+  HYPRE_Int n_loc = hypre_CSRMatrixNumRows(csr);
+  HYPRE_Int *I = hypre_CSRMatrixI(csr);
+  HYPRE_BigInt *J = hypre_CSRMatrixBigJ(csr);
+  double *data = hypre_CSRMatrixData(csr);
 
   // We need to save A because SuperLU does not copy the input matrix. Also clean up the
   // Hypre data structure once we are done with it.
 #if !defined(HYPRE_BIGINT)
-  A = std::make_unique<mfem::SuperLURowLocMatrix>(comm, n_loc, first_row,
-                                                  hypA->GetGlobalNumRows(),
-                                                  hypA->GetGlobalNumCols(), I, J, data);
+  A = std::make_unique<mfem::SuperLURowLocMatrix>(comm, n_loc, first_row, glob_n, glob_n, I,
+                                                  J, data);
 #else
   int n_loc_int = static_cast<int>(n_loc);
   MFEM_ASSERT(n_loc == (HYPRE_Int)n_loc_int,
@@ -129,9 +125,8 @@ void SuperLUSolver::SetOperator(const Operator &op)
     II[i] = static_cast<int>(I[i]);
     MFEM_ASSERT(I[i] == (HYPRE_Int)II[i], "Overflow error for local sparse matrix index!");
   }
-  A = std::make_unique<mfem::SuperLURowLocMatrix>(comm, n_loc_int, first_row,
-                                                  hypA->GetGlobalNumRows(),
-                                                  hypA->GetGlobalNumCols(), II, J, data);
+  A = std::make_unique<mfem::SuperLURowLocMatrix>(comm, n_loc_int, first_row, glob_n,
+                                                  glob_n, II.HostRead(), J, data);
 #endif
   solver.SetOperator(*A);
   height = solver.Height();
diff --git a/palace/models/romoperator.cpp b/palace/models/romoperator.cpp
index 21c15fdc6..f022f46c2 100644
--- a/palace/models/romoperator.cpp
+++ b/palace/models/romoperator.cpp
@@ -33,20 +33,20 @@ inline void ProjectMatInternal(MPI_Comm comm, const std::vector<Vector> &V,
   {
     // Fill block of Vᴴ A V = [  | Vᴴ A vj ] . We can optimize the matrix-vector product
     // since the columns of V are real.
-    MFEM_VERIFY(A.HasReal() || A.HasImag(),
+    MFEM_VERIFY(A.Real() || A.Imag(),
                 "Invalid zero ComplexOperator for PROM matrix projection!");
-    if (A.HasReal())
+    if (A.Real())
     {
       A.Real()->Mult(V[j], r.Real());
     }
-    if (A.HasImag())
+    if (A.Imag())
     {
       A.Imag()->Mult(V[j], r.Imag());
     }
     for (int i = 0; i < n; i++)
     {
-      Ar(i, j).real(A.HasReal() ? V[i] * r.Real() : 0.0);  // Local inner product
-      Ar(i, j).imag(A.HasImag() ? V[i] * r.Imag() : 0.0);
+      Ar(i, j).real(A.Real() ? V[i] * r.Real() : 0.0);  // Local inner product
+      Ar(i, j).imag(A.Imag() ? V[i] * r.Imag() : 0.0);
     }
   }
   Mpi::GlobalSum((n - n0) * n, Ar.data() + n0 * n, comm);
diff --git a/palace/models/spaceoperator.cpp b/palace/models/spaceoperator.cpp
index 27f8218fa..6f20b3e96 100644
--- a/palace/models/spaceoperator.cpp
+++ b/palace/models/spaceoperator.cpp
@@ -451,22 +451,22 @@ auto BuildParSumOperator(int h, int w, std::complex<double> a0, std::complex<dou
   {
     if (a0.real() != 0.0)
     {
-      if (K->LocalOperator().HasReal())
+      if (K->LocalOperator().Real())
       {
         sumr->AddOperator(*K->LocalOperator().Real(), a0.real());
       }
-      if (K->LocalOperator().HasImag())
+      if (K->LocalOperator().Imag())
       {
         sumi->AddOperator(*K->LocalOperator().Imag(), a0.real());
       }
     }
     if (a0.imag() != 0.0)
     {
-      if (K->LocalOperator().HasImag())
+      if (K->LocalOperator().Imag())
       {
         sumr->AddOperator(*K->LocalOperator().Imag(), -a0.imag());
       }
-      if (K->LocalOperator().HasReal())
+      if (K->LocalOperator().Real())
       {
         sumi->AddOperator(*K->LocalOperator().Real(), a0.imag());
       }
@@ -476,22 +476,22 @@ auto BuildParSumOperator(int h, int w, std::complex<double> a0, std::complex<dou
   {
     if (a1.real() != 0.0)
     {
-      if (C->LocalOperator().HasReal())
+      if (C->LocalOperator().Real())
       {
         sumr->AddOperator(*C->LocalOperator().Real(), a1.real());
       }
-      if (C->LocalOperator().HasImag())
+      if (C->LocalOperator().Imag())
       {
         sumi->AddOperator(*C->LocalOperator().Imag(), a1.real());
       }
     }
     if (a1.imag() != 0.0)
     {
-      if (C->LocalOperator().HasImag())
+      if (C->LocalOperator().Imag())
       {
         sumr->AddOperator(*C->LocalOperator().Imag(), -a1.imag());
       }
-      if (C->LocalOperator().HasReal())
+      if (C->LocalOperator().Real())
       {
         sumi->AddOperator(*C->LocalOperator().Real(), a1.imag());
       }
@@ -501,22 +501,22 @@ auto BuildParSumOperator(int h, int w, std::complex<double> a0, std::complex<dou
   {
     if (a2.real() != 0.0)
     {
-      if (M->LocalOperator().HasReal())
+      if (M->LocalOperator().Real())
       {
         sumr->AddOperator(*M->LocalOperator().Real(), a2.real());
       }
-      if (M->LocalOperator().HasImag())
+      if (M->LocalOperator().Imag())
       {
         sumi->AddOperator(*M->LocalOperator().Imag(), a2.real());
       }
     }
     if (a2.imag() != 0.0)
     {
-      if (M->LocalOperator().HasImag())
+      if (M->LocalOperator().Imag())
       {
         sumr->AddOperator(*M->LocalOperator().Imag(), -a2.imag());
       }
-      if (M->LocalOperator().HasReal())
+      if (M->LocalOperator().Real())
       {
         sumi->AddOperator(*M->LocalOperator().Real(), a2.imag());
       }
@@ -524,11 +524,11 @@ auto BuildParSumOperator(int h, int w, std::complex<double> a0, std::complex<dou
   }
   if (A2)
   {
-    if (A2->LocalOperator().HasReal())
+    if (A2->LocalOperator().Real())
     {
       sumr->AddOperator(*A2->LocalOperator().Real(), 1.0);
     }
-    if (A2->LocalOperator().HasImag())
+    if (A2->LocalOperator().Imag())
     {
       sumi->AddOperator(*A2->LocalOperator().Imag(), 1.0);
     }
@@ -614,13 +614,13 @@ std::unique_ptr<Operator> SpaceOperator::GetInnerProductMatrix(double a0, double
   if (PtAP_K && a0 != 0.0)
   {
     MFEM_VERIFY(
-        PtAP_K->LocalOperator().HasReal(),
+        PtAP_K->LocalOperator().Real(),
         "Missing real part of stiffness matrix for inner product matrix construction!");
     sum->AddOperator(*PtAP_K->LocalOperator().Real(), a0);
   }
   if (PtAP_M && a2 != 0.0)
   {
-    MFEM_VERIFY(PtAP_M->LocalOperator().HasReal(),
+    MFEM_VERIFY(PtAP_M->LocalOperator().Real(),
                 "Missing real part of mass matrix for inner product matrix construction!");
     sum->AddOperator(*PtAP_M->LocalOperator().Real(), a2);
   }
@@ -676,18 +676,7 @@ std::unique_ptr<OperType> SpaceOperator::GetPreconditionerMatrix(double a0, doub
           fi(mat_op.MaxCeedAttribute()), dfbr(mat_op.MaxCeedBdrAttribute()),
           dfbi(mat_op.MaxCeedBdrAttribute()), fbr(mat_op.MaxCeedBdrAttribute()),
           fbi(mat_op.MaxCeedBdrAttribute());
-      if (!std::is_same<OperType, ComplexOperator>::value || pc_mat_real || l == 0)
-      {
-        // Real-valued system matrix (approximation) for preconditioning.
-        AddStiffnessCoefficients(a0, dfr, fr);
-        AddStiffnessBdrCoefficients(a0, fbr);
-        AddDampingCoefficients(a1, fr);
-        AddDampingBdrCoefficients(a1, fbr);
-        AddAbsMassCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fr);
-        AddRealMassBdrCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fbr);
-        AddExtraSystemBdrCoefficients(a3, dfbr, dfbr, fbr, fbr);
-      }
-      else
+      if (std::is_same<OperType, ComplexOperator>::value && !pc_mat_real)
       {
         // Build preconditioner based on the actual complex-valued system matrix.
         AddStiffnessCoefficients(a0, dfr, fr);
@@ -699,6 +688,17 @@ std::unique_ptr<OperType> SpaceOperator::GetPreconditionerMatrix(double a0, doub
         AddImagMassCoefficients(a2, fi);
         AddExtraSystemBdrCoefficients(a3, dfbr, dfbi, fbr, fbi);
       }
+      else
+      {
+        // Real-valued system matrix (approximation) for preconditioning.
+        AddStiffnessCoefficients(a0, dfr, fr);
+        AddStiffnessBdrCoefficients(a0, fbr);
+        AddDampingCoefficients(a1, fr);
+        AddDampingBdrCoefficients(a1, fbr);
+        AddAbsMassCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fr);
+        AddRealMassBdrCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fbr);
+        AddExtraSystemBdrCoefficients(a3, dfbr, dfbr, fbr, fbr);
+      }
       int empty[2] = {(dfr.empty() && fr.empty() && dfbr.empty() && fbr.empty()),
                       (dfi.empty() && fi.empty() && dfbi.empty() && fbi.empty())};
       Mpi::GlobalMin(2, empty, GetComm());
diff --git a/palace/models/waveportoperator.cpp b/palace/models/waveportoperator.cpp
index b60bd98de..a78052faf 100644
--- a/palace/models/waveportoperator.cpp
+++ b/palace/models/waveportoperator.cpp
@@ -653,7 +653,7 @@ WavePortData::WavePortData(const config::WavePortData &data,
 #error "Wave port solver requires building with SuperLU_DIST, STRUMPACK, or MUMPS!"
 #endif
     }
-    auto pc = std::make_unique<WrapperSolver<ComplexOperator>>(
+    auto pc = std::make_unique<MfemWrapperSolver<ComplexOperator>>(
         [&]() -> std::unique_ptr<mfem::Solver>
         {
           if (pc_type == config::LinearSolverData::Type::SUPERLU)
@@ -688,6 +688,7 @@ WavePortData::WavePortData(const config::WavePortData &data,
           }
           return {};
         }());
+    pc->SetSaveAssembled(false);
     ksp = std::make_unique<ComplexKspSolver>(std::move(gmres), std::move(pc));
 
     // Define the eigenvalue solver.

From 9c5ca6677bb6070585aa68a5f0df810e93a891ea Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Tue, 19 Dec 2023 16:26:58 -0800
Subject: [PATCH 2/5] Construct coarse level multigrid operators without
 duplicating quadrature data

---
 palace/linalg/divfree.cpp          |  14 +-
 palace/linalg/hcurl.cpp            |  20 ++-
 palace/models/curlcurloperator.cpp |  26 ++--
 palace/models/laplaceoperator.cpp  |  26 ++--
 palace/models/spaceoperator.cpp    | 217 ++++++++++++++++++-----------
 palace/models/spaceoperator.hpp    |   8 +-
 6 files changed, 183 insertions(+), 128 deletions(-)

diff --git a/palace/linalg/divfree.cpp b/palace/linalg/divfree.cpp
index 9eca1d4c9..516039a2c 100644
--- a/palace/linalg/divfree.cpp
+++ b/palace/linalg/divfree.cpp
@@ -22,27 +22,29 @@ DivFreeSolver::DivFreeSolver(const MaterialOperator &mat_op, FiniteElementSpace
                              const std::vector<mfem::Array<int>> &h1_bdr_tdof_lists,
                              double tol, int max_it, int print)
 {
-  constexpr bool skip_zeros = false;
   MaterialPropertyCoefficient epsilon_func(mat_op.GetAttributeToMaterial(),
                                            mat_op.GetPermittivityReal());
   {
+    constexpr bool skip_zeros = false;
+    BilinearForm m(h1_fespaces.GetFinestFESpace());
+    m.AddDomainIntegrator<DiffusionIntegrator>(epsilon_func);
+    // m.AssembleQuadratureData();
+    auto m_vec = m.Assemble(h1_fespaces, skip_zeros);
     auto M_mg = std::make_unique<MultigridOperator>(h1_fespaces.GetNumLevels());
     for (std::size_t l = 0; l < h1_fespaces.GetNumLevels(); l++)
     {
-      // Force coarse level operator to be fully assembled always.
       const auto &h1_fespace_l = h1_fespaces.GetFESpaceAtLevel(l);
-      BilinearForm m(h1_fespace_l);
-      m.AddDomainIntegrator<DiffusionIntegrator>(epsilon_func);
-      auto M_l = std::make_unique<ParOperator>(m.Assemble(skip_zeros), h1_fespace_l);
+      auto M_l = std::make_unique<ParOperator>(std::move(m_vec[l]), h1_fespace_l);
       M_l->SetEssentialTrueDofs(h1_bdr_tdof_lists[l], Operator::DiagonalPolicy::DIAG_ONE);
       M_mg->AddOperator(std::move(M_l));
     }
     M = std::move(M_mg);
   }
   {
+    // Weak divergence operator is always partially assembled.
     BilinearForm weakdiv(nd_fespace, h1_fespaces.GetFinestFESpace());
     weakdiv.AddDomainIntegrator<MixedVectorWeakDivergenceIntegrator>(epsilon_func);
-    WeakDiv = std::make_unique<ParOperator>(weakdiv.Assemble(skip_zeros), nd_fespace,
+    WeakDiv = std::make_unique<ParOperator>(weakdiv.PartialAssemble(), nd_fespace,
                                             h1_fespaces.GetFinestFESpace(), false);
   }
   Grad = &h1_fespaces.GetFinestFESpace().GetDiscreteInterpolator();
diff --git a/palace/linalg/hcurl.cpp b/palace/linalg/hcurl.cpp
index adb55b9fc..bdc064f12 100644
--- a/palace/linalg/hcurl.cpp
+++ b/palace/linalg/hcurl.cpp
@@ -32,25 +32,23 @@ WeightedHCurlNormSolver::WeightedHCurlNormSolver(
                                            mat_op.GetInvPermeability());
     MaterialPropertyCoefficient epsilon_func(mat_op.GetAttributeToMaterial(),
                                              mat_op.GetPermittivityReal());
+    BilinearForm a(nd_fespaces.GetFinestFESpace()), a_aux(h1_fespaces.GetFinestFESpace());
+    a.AddDomainIntegrator<CurlCurlMassIntegrator>(muinv_func, epsilon_func);
+    a_aux.AddDomainIntegrator<DiffusionIntegrator>(epsilon_func);
+    // a.AssembleQuadratureData();
+    // a_aux.AssembleQuadratureData();
+    auto a_vec = a.Assemble(nd_fespaces, skip_zeros);
+    auto a_aux_vec = a_aux.Assemble(h1_fespaces, skip_zeros);
     auto A_mg = std::make_unique<MultigridOperator>(n_levels);
     for (bool aux : {false, true})
     {
       for (std::size_t l = 0; l < n_levels; l++)
       {
-        // Force coarse level operator to be fully assembled always.
         const auto &fespace_l =
             aux ? h1_fespaces.GetFESpaceAtLevel(l) : nd_fespaces.GetFESpaceAtLevel(l);
         const auto &dbc_tdof_lists_l = aux ? h1_dbc_tdof_lists[l] : nd_dbc_tdof_lists[l];
-        BilinearForm a(fespace_l);
-        if (aux)
-        {
-          a.AddDomainIntegrator<DiffusionIntegrator>(epsilon_func);
-        }
-        else
-        {
-          a.AddDomainIntegrator<CurlCurlMassIntegrator>(muinv_func, epsilon_func);
-        }
-        auto A_l = std::make_unique<ParOperator>(a.Assemble(skip_zeros), fespace_l);
+        auto A_l = std::make_unique<ParOperator>(std::move(aux ? a_aux_vec[l] : a_vec[l]),
+                                                 fespace_l);
         A_l->SetEssentialTrueDofs(dbc_tdof_lists_l, Operator::DiagonalPolicy::DIAG_ONE);
         if (aux)
         {
diff --git a/palace/models/curlcurloperator.cpp b/palace/models/curlcurloperator.cpp
index e2234c353..8a30ec005 100644
--- a/palace/models/curlcurloperator.cpp
+++ b/palace/models/curlcurloperator.cpp
@@ -154,28 +154,26 @@ void PrintHeader(const mfem::ParFiniteElementSpace &h1_fespace,
 
 std::unique_ptr<Operator> CurlCurlOperator::GetStiffnessMatrix()
 {
+  // When partially assembled, the coarse operators can reuse the fine operator quadrature
+  // data if the spaces correspond to the same mesh.
   PrintHeader(GetH1Space(), GetNDSpace(), GetRTSpace(), print_hdr);
+
+  constexpr bool skip_zeros = false;
+  MaterialPropertyCoefficient muinv_func(mat_op.GetAttributeToMaterial(),
+                                         mat_op.GetInvPermeability());
+  BilinearForm k(GetNDSpace());
+  k.AddDomainIntegrator<CurlCurlIntegrator>(muinv_func);
+  k.AssembleQuadratureData();
+  auto k_vec = k.Assemble(GetNDSpaces(), skip_zeros);
   auto K = std::make_unique<MultigridOperator>(GetNDSpaces().GetNumLevels());
   for (std::size_t l = 0; l < GetNDSpaces().GetNumLevels(); l++)
   {
-    // Force coarse level operator to be fully assembled always.
     const auto &nd_fespace_l = GetNDSpaces().GetFESpaceAtLevel(l);
     if (print_hdr)
     {
       Mpi::Print(" Level {:d} (p = {:d}): {:d} unknowns", l,
                  nd_fespace_l.GetMaxElementOrder(), nd_fespace_l.GlobalTrueVSize());
-    }
-    constexpr bool skip_zeros = false;
-    MaterialPropertyCoefficient muinv_func(mat_op.GetAttributeToMaterial(),
-                                           mat_op.GetInvPermeability());
-    BilinearForm k(nd_fespace_l);
-    k.AddDomainIntegrator<CurlCurlIntegrator>(muinv_func);
-    auto K_l = std::make_unique<ParOperator>(
-        (l > 0) ? k.Assemble(skip_zeros) : k.FullAssemble(skip_zeros), nd_fespace_l);
-    if (print_hdr)
-    {
-      if (const auto *k_spm =
-              dynamic_cast<const mfem::SparseMatrix *>(&K_l->LocalOperator()))
+      if (const auto *k_spm = dynamic_cast<const mfem::SparseMatrix *>(k_vec[l].get()))
       {
         HYPRE_BigInt nnz = k_spm->NumNonZeroElems();
         Mpi::GlobalSum(1, &nnz, nd_fespace_l.GetComm());
@@ -186,9 +184,11 @@ std::unique_ptr<Operator> CurlCurlOperator::GetStiffnessMatrix()
         Mpi::Print("\n");
       }
     }
+    auto K_l = std::make_unique<ParOperator>(std::move(k_vec[l]), nd_fespace_l);
     K_l->SetEssentialTrueDofs(dbc_tdof_lists[l], Operator::DiagonalPolicy::DIAG_ONE);
     K->AddOperator(std::move(K_l));
   }
+
   print_hdr = false;
   return K;
 }
diff --git a/palace/models/laplaceoperator.cpp b/palace/models/laplaceoperator.cpp
index a3220d5e6..509067e63 100644
--- a/palace/models/laplaceoperator.cpp
+++ b/palace/models/laplaceoperator.cpp
@@ -175,28 +175,26 @@ void PrintHeader(const mfem::ParFiniteElementSpace &h1_fespace,
 
 std::unique_ptr<Operator> LaplaceOperator::GetStiffnessMatrix()
 {
+  // When partially assembled, the coarse operators can reuse the fine operator quadrature
+  // data if the spaces correspond to the same mesh.
   PrintHeader(GetH1Space(), GetNDSpace(), print_hdr);
+
+  constexpr bool skip_zeros = false;
+  MaterialPropertyCoefficient epsilon_func(mat_op.GetAttributeToMaterial(),
+                                           mat_op.GetPermittivityReal());
+  BilinearForm k(GetH1Space());
+  k.AddDomainIntegrator<DiffusionIntegrator>(epsilon_func);
+  k.AssembleQuadratureData();
+  auto k_vec = k.Assemble(GetH1Spaces(), skip_zeros);
   auto K = std::make_unique<MultigridOperator>(GetH1Spaces().GetNumLevels());
   for (std::size_t l = 0; l < GetH1Spaces().GetNumLevels(); l++)
   {
-    // Force coarse level operator to be fully assembled always.
     const auto &h1_fespace_l = GetH1Spaces().GetFESpaceAtLevel(l);
     if (print_hdr)
     {
       Mpi::Print(" Level {:d} (p = {:d}): {:d} unknowns", l,
                  h1_fespace_l.GetMaxElementOrder(), h1_fespace_l.GlobalTrueVSize());
-    }
-    constexpr bool skip_zeros = false;
-    MaterialPropertyCoefficient epsilon_func(mat_op.GetAttributeToMaterial(),
-                                             mat_op.GetPermittivityReal());
-    BilinearForm k(h1_fespace_l);
-    k.AddDomainIntegrator<DiffusionIntegrator>(epsilon_func);
-    auto K_l = std::make_unique<ParOperator>(
-        (l > 0) ? k.Assemble(skip_zeros) : k.FullAssemble(skip_zeros), h1_fespace_l);
-    if (print_hdr)
-    {
-      if (const auto *k_spm =
-              dynamic_cast<const mfem::SparseMatrix *>(&K_l->LocalOperator()))
+      if (const auto *k_spm = dynamic_cast<const mfem::SparseMatrix *>(k_vec[l].get()))
       {
         HYPRE_BigInt nnz = k_spm->NumNonZeroElems();
         Mpi::GlobalSum(1, &nnz, h1_fespace_l.GetComm());
@@ -207,9 +205,11 @@ std::unique_ptr<Operator> LaplaceOperator::GetStiffnessMatrix()
         Mpi::Print("\n");
       }
     }
+    auto K_l = std::make_unique<ParOperator>(std::move(k_vec[l]), h1_fespace_l);
     K_l->SetEssentialTrueDofs(dbc_tdof_lists[l], Operator::DiagonalPolicy::DIAG_ONE);
     K->AddOperator(std::move(K_l));
   }
+
   print_hdr = false;
   return K;
 }
diff --git a/palace/models/spaceoperator.cpp b/palace/models/spaceoperator.cpp
index 6f20b3e96..631c669f8 100644
--- a/palace/models/spaceoperator.cpp
+++ b/palace/models/spaceoperator.cpp
@@ -198,12 +198,11 @@ void PrintHeader(const mfem::ParFiniteElementSpace &h1_fespace,
   print_hdr = false;
 }
 
-std::unique_ptr<Operator>
-BuildOperator(const FiniteElementSpace &fespace, const MaterialPropertyCoefficient *df,
-              const MaterialPropertyCoefficient *f, const MaterialPropertyCoefficient *dfb,
-              const MaterialPropertyCoefficient *fb, std::size_t l, bool skip_zeros)
+void AddIntegrators(BilinearForm &a, const MaterialPropertyCoefficient *df,
+                    const MaterialPropertyCoefficient *f,
+                    const MaterialPropertyCoefficient *dfb,
+                    const MaterialPropertyCoefficient *fb, bool assemble_q_data = false)
 {
-  BilinearForm a(fespace);
   if (df && !df->empty() && f && !f->empty())
   {
     a.AddDomainIntegrator<CurlCurlMassIntegrator>(*df, *f);
@@ -234,23 +233,15 @@ BuildOperator(const FiniteElementSpace &fespace, const MaterialPropertyCoefficie
       a.AddBoundaryIntegrator<VectorFEMassIntegrator>(*fb);
     }
   }
-  return (l > 0) ? a.Assemble(skip_zeros) : a.FullAssemble(skip_zeros);
-}
-
-std::unique_ptr<Operator>
-BuildOperator(const FiniteElementSpace &fespace, const MaterialPropertyCoefficient *df,
-              const MaterialPropertyCoefficient *f, const MaterialPropertyCoefficient *dfb,
-              const MaterialPropertyCoefficient *fb, bool skip_zeros)
-{
-  return BuildOperator(fespace, df, f, dfb, fb, 1, skip_zeros);
+  if (assemble_q_data)
+  {
+    a.AssembleQuadratureData();
+  }
 }
 
-std::unique_ptr<Operator> BuildAuxOperator(const FiniteElementSpace &fespace,
-                                           const MaterialPropertyCoefficient *f,
-                                           const MaterialPropertyCoefficient *fb,
-                                           std::size_t l, bool skip_zeros)
+void AddAuxIntegrators(BilinearForm &a, const MaterialPropertyCoefficient *f,
+                       const MaterialPropertyCoefficient *fb, bool assemble_q_data = false)
 {
-  BilinearForm a(fespace);
   if (f && !f->empty())
   {
     a.AddDomainIntegrator<DiffusionIntegrator>(*f);
@@ -259,7 +250,44 @@ std::unique_ptr<Operator> BuildAuxOperator(const FiniteElementSpace &fespace,
   {
     a.AddBoundaryIntegrator<DiffusionIntegrator>(*fb);
   }
-  return (l > 0) ? a.Assemble(skip_zeros) : a.FullAssemble(skip_zeros);
+  if (assemble_q_data)
+  {
+    a.AssembleQuadratureData();
+  }
+}
+
+auto AssembleOperator(const FiniteElementSpace &fespace,
+                      const MaterialPropertyCoefficient *df,
+                      const MaterialPropertyCoefficient *f,
+                      const MaterialPropertyCoefficient *dfb,
+                      const MaterialPropertyCoefficient *fb, bool skip_zeros = false,
+                      bool assemble_q_data = false)
+{
+  BilinearForm a(fespace);
+  AddIntegrators(a, df, f, dfb, fb, assemble_q_data);
+  return a.Assemble(skip_zeros);
+}
+
+auto AssembleOperators(const FiniteElementSpaceHierarchy &fespaces,
+                       const MaterialPropertyCoefficient *df,
+                       const MaterialPropertyCoefficient *f,
+                       const MaterialPropertyCoefficient *dfb,
+                       const MaterialPropertyCoefficient *fb, bool skip_zeros = false,
+                       bool assemble_q_data = false, std::size_t l0 = 0)
+{
+  BilinearForm a(fespaces.GetFinestFESpace());
+  AddIntegrators(a, df, f, dfb, fb, assemble_q_data);
+  return a.Assemble(fespaces, skip_zeros, l0);
+}
+
+auto AssembleAuxOperators(const AuxiliaryFiniteElementSpaceHierarchy &fespaces,
+                          const MaterialPropertyCoefficient *f,
+                          const MaterialPropertyCoefficient *fb, bool skip_zeros = false,
+                          bool assemble_q_data = false, std::size_t l0 = 0)
+{
+  BilinearForm a(fespaces.GetFinestFESpace());
+  AddAuxIntegrators(a, f, fb, assemble_q_data);
+  return a.Assemble(fespaces, skip_zeros, l0);
 }
 
 }  // namespace
@@ -280,7 +308,7 @@ SpaceOperator::GetStiffnessMatrix(Operator::DiagonalPolicy diag_policy)
     return {};
   }
   constexpr bool skip_zeros = false;
-  auto k = BuildOperator(GetNDSpace(), &df, &f, nullptr, &fb, skip_zeros);
+  auto k = AssembleOperator(GetNDSpace(), &df, &f, nullptr, &fb, skip_zeros);
   if constexpr (std::is_same<OperType, ComplexOperator>::value)
   {
     auto K = std::make_unique<ComplexParOperator>(std::move(k), nullptr, GetNDSpace());
@@ -311,7 +339,7 @@ SpaceOperator::GetDampingMatrix(Operator::DiagonalPolicy diag_policy)
     return {};
   }
   constexpr bool skip_zeros = false;
-  auto c = BuildOperator(GetNDSpace(), nullptr, &f, nullptr, &fb, skip_zeros);
+  auto c = AssembleOperator(GetNDSpace(), nullptr, &f, nullptr, &fb, skip_zeros);
   if constexpr (std::is_same<OperType, ComplexOperator>::value)
   {
     auto C = std::make_unique<ComplexParOperator>(std::move(c), nullptr, GetNDSpace());
@@ -348,11 +376,11 @@ std::unique_ptr<OperType> SpaceOperator::GetMassMatrix(Operator::DiagonalPolicy
   std::unique_ptr<Operator> mr, mi;
   if (!empty[0])
   {
-    mr = BuildOperator(GetNDSpace(), nullptr, &fr, nullptr, &fbr, skip_zeros);
+    mr = AssembleOperator(GetNDSpace(), nullptr, &fr, nullptr, &fbr, skip_zeros);
   }
   if (!empty[1])
   {
-    mi = BuildOperator(GetNDSpace(), nullptr, &fi, nullptr, &fbi, skip_zeros);
+    mi = AssembleOperator(GetNDSpace(), nullptr, &fi, nullptr, &fbi, skip_zeros);
   }
   if constexpr (std::is_same<OperType, ComplexOperator>::value)
   {
@@ -388,11 +416,11 @@ SpaceOperator::GetExtraSystemMatrix(double omega, Operator::DiagonalPolicy diag_
   std::unique_ptr<Operator> ar, ai;
   if (!empty[0])
   {
-    ar = BuildOperator(GetNDSpace(), nullptr, nullptr, &dfbr, &fbr, skip_zeros);
+    ar = AssembleOperator(GetNDSpace(), nullptr, nullptr, &dfbr, &fbr, skip_zeros);
   }
   if (!empty[1])
   {
-    ai = BuildOperator(GetNDSpace(), nullptr, nullptr, &dfbi, &fbi, skip_zeros);
+    ai = AssembleOperator(GetNDSpace(), nullptr, nullptr, &dfbi, &fbi, skip_zeros);
   }
   if constexpr (std::is_same<OperType, ComplexOperator>::value)
   {
@@ -630,14 +658,16 @@ std::unique_ptr<Operator> SpaceOperator::GetInnerProductMatrix(double a0, double
 namespace
 {
 
-auto BuildLevelOperator(const MultigridOperator &B, std::unique_ptr<Operator> &&br,
-                        std::unique_ptr<Operator> &&bi, const FiniteElementSpace &fespace)
+auto BuildLevelParOperator(const MultigridOperator &B, std::unique_ptr<Operator> &&br,
+                           std::unique_ptr<Operator> &&bi,
+                           const FiniteElementSpace &fespace)
 {
   return std::make_unique<ParOperator>(std::move(br), fespace);
 }
 
-auto BuildLevelOperator(const ComplexMultigridOperator &B, std::unique_ptr<Operator> &&br,
-                        std::unique_ptr<Operator> &&bi, const FiniteElementSpace &fespace)
+auto BuildLevelParOperator(const ComplexMultigridOperator &B,
+                           std::unique_ptr<Operator> &&br, std::unique_ptr<Operator> &&bi,
+                           const FiniteElementSpace &fespace)
 {
   return std::make_unique<ComplexParOperator>(std::move(br), std::move(bi), fespace);
 }
@@ -648,77 +678,101 @@ template <typename OperType>
 std::unique_ptr<OperType> SpaceOperator::GetPreconditionerMatrix(double a0, double a1,
                                                                  double a2, double a3)
 {
-  // XX TODO: Test complex PC matrix assembly for l == 0 if coarse solve supports it
   // XX TODO: Handle complex coeff a0/a1/a2/a3 (like GetSystemMatrix)
+
+  // When partially assembled, the coarse operators can reuse the fine operator quadrature
+  // data if the spaces correspond to the same mesh. When appropriate, we build the
+  // preconditioner on all levels based on the actual complex-valued system matrix. The
+  // coarse operator is always fully assembled.
   if (print_prec_hdr)
   {
     Mpi::Print("\nAssembling multigrid hierarchy:\n");
   }
   MFEM_VERIFY(GetH1Spaces().GetNumLevels() == GetNDSpaces().GetNumLevels(),
               "Multigrid hierarchy mismatch for auxiliary space preconditioning!");
+
   const auto n_levels = GetNDSpaces().GetNumLevels();
+  std::vector<std::unique_ptr<Operator>> br_vec(n_levels), bi_vec(n_levels),
+      br_aux_vec(n_levels), bi_aux_vec(n_levels);
+  constexpr bool skip_zeros = false, assemble_q_data = true;
+  if (std::is_same<OperType, ComplexOperator>::value && !pc_mat_real)
+  {
+    MaterialPropertyCoefficient dfr(mat_op.MaxCeedAttribute()),
+        dfi(mat_op.MaxCeedAttribute()), fr(mat_op.MaxCeedAttribute()),
+        fi(mat_op.MaxCeedAttribute()), dfbr(mat_op.MaxCeedBdrAttribute()),
+        dfbi(mat_op.MaxCeedBdrAttribute()), fbr(mat_op.MaxCeedBdrAttribute()),
+        fbi(mat_op.MaxCeedBdrAttribute());
+    AddStiffnessCoefficients(a0, dfr, fr);
+    AddStiffnessBdrCoefficients(a0, fbr);
+    AddDampingCoefficients(a1, fi);
+    AddDampingBdrCoefficients(a1, fbi);
+    AddRealMassCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fr);
+    AddRealMassBdrCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fbr);
+    AddImagMassCoefficients(a2, fi);
+    AddExtraSystemBdrCoefficients(a3, dfbr, dfbi, fbr, fbi);
+    int empty[2] = {(dfr.empty() && fr.empty() && dfbr.empty() && fbr.empty()),
+                    (dfi.empty() && fi.empty() && dfbi.empty() && fbi.empty())};
+    Mpi::GlobalMin(2, empty, GetComm());
+    if (!empty[0])
+    {
+      br_vec = AssembleOperators(GetNDSpaces(), &dfr, &fr, &dfbr, &fbr, skip_zeros,
+                                 assemble_q_data);
+      br_aux_vec =
+          AssembleAuxOperators(GetH1Spaces(), &fr, &fbr, skip_zeros, assemble_q_data);
+    }
+    if (!empty[1])
+    {
+      bi_vec = AssembleOperators(GetNDSpaces(), &dfi, &fi, &dfbi, &fbi, skip_zeros,
+                                 assemble_q_data);
+      bi_aux_vec =
+          AssembleAuxOperators(GetH1Spaces(), &fi, &fbi, skip_zeros, assemble_q_data);
+    }
+  }
+  else
+  {
+    MaterialPropertyCoefficient dfr(mat_op.MaxCeedAttribute()),
+        fr(mat_op.MaxCeedAttribute()), dfbr(mat_op.MaxCeedBdrAttribute()),
+        fbr(mat_op.MaxCeedBdrAttribute());
+    AddStiffnessCoefficients(a0, dfr, fr);
+    AddStiffnessBdrCoefficients(a0, fbr);
+    AddDampingCoefficients(a1, fr);
+    AddDampingBdrCoefficients(a1, fbr);
+    AddAbsMassCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fr);
+    AddRealMassBdrCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fbr);
+    AddExtraSystemBdrCoefficients(a3, dfbr, dfbr, fbr, fbr);
+    int empty = (dfr.empty() && fr.empty() && dfbr.empty() && fbr.empty());
+    Mpi::GlobalMin(1, &empty, GetComm());
+    if (!empty)
+    {
+      br_vec = AssembleOperators(GetNDSpaces(), &dfr, &fr, &dfbr, &fbr, skip_zeros,
+                                 assemble_q_data);
+      br_aux_vec =
+          AssembleAuxOperators(GetH1Spaces(), &fr, &fbr, skip_zeros, assemble_q_data);
+    }
+  }
+
   auto B = std::make_unique<BaseMultigridOperator<OperType>>(n_levels);
   for (bool aux : {false, true})
   {
     for (std::size_t l = 0; l < n_levels; l++)
     {
-      // Force coarse level operator to be fully assembled always.
       const auto &fespace_l =
           aux ? GetH1Spaces().GetFESpaceAtLevel(l) : GetNDSpaces().GetFESpaceAtLevel(l);
       const auto &dbc_tdof_lists_l = aux ? h1_dbc_tdof_lists[l] : nd_dbc_tdof_lists[l];
+      auto &br_l = aux ? br_aux_vec[l] : br_vec[l];
+      auto &bi_l = aux ? bi_aux_vec[l] : bi_vec[l];
       if (print_prec_hdr)
       {
         Mpi::Print(" Level {:d}{} (p = {:d}): {:d} unknowns", l, aux ? " (auxiliary)" : "",
                    fespace_l.GetMaxElementOrder(), fespace_l.GlobalTrueVSize());
-      }
-      MaterialPropertyCoefficient dfr(mat_op.MaxCeedAttribute()),
-          dfi(mat_op.MaxCeedAttribute()), fr(mat_op.MaxCeedAttribute()),
-          fi(mat_op.MaxCeedAttribute()), dfbr(mat_op.MaxCeedBdrAttribute()),
-          dfbi(mat_op.MaxCeedBdrAttribute()), fbr(mat_op.MaxCeedBdrAttribute()),
-          fbi(mat_op.MaxCeedBdrAttribute());
-      if (std::is_same<OperType, ComplexOperator>::value && !pc_mat_real)
-      {
-        // Build preconditioner based on the actual complex-valued system matrix.
-        AddStiffnessCoefficients(a0, dfr, fr);
-        AddStiffnessBdrCoefficients(a0, fbr);
-        AddDampingCoefficients(a1, fi);
-        AddDampingBdrCoefficients(a1, fbi);
-        AddRealMassCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fr);
-        AddRealMassBdrCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fbr);
-        AddImagMassCoefficients(a2, fi);
-        AddExtraSystemBdrCoefficients(a3, dfbr, dfbi, fbr, fbi);
-      }
-      else
-      {
-        // Real-valued system matrix (approximation) for preconditioning.
-        AddStiffnessCoefficients(a0, dfr, fr);
-        AddStiffnessBdrCoefficients(a0, fbr);
-        AddDampingCoefficients(a1, fr);
-        AddDampingBdrCoefficients(a1, fbr);
-        AddAbsMassCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fr);
-        AddRealMassBdrCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fbr);
-        AddExtraSystemBdrCoefficients(a3, dfbr, dfbr, fbr, fbr);
-      }
-      int empty[2] = {(dfr.empty() && fr.empty() && dfbr.empty() && fbr.empty()),
-                      (dfi.empty() && fi.empty() && dfbi.empty() && fbi.empty())};
-      Mpi::GlobalMin(2, empty, GetComm());
-      constexpr bool skip_zeros = false;
-      std::unique_ptr<Operator> br, bi;
-      if (!empty[0])
-      {
-        br = aux ? BuildAuxOperator(fespace_l, &fr, &fbr, l, skip_zeros)
-                 : BuildOperator(fespace_l, &dfr, &fr, &dfbr, &fbr, l, skip_zeros);
-      }
-      if (!empty[1])
-      {
-        bi = aux ? BuildAuxOperator(fespace_l, &fi, &fbi, l, skip_zeros)
-                 : BuildOperator(fespace_l, &dfi, &fi, &dfbi, &fbi, l, skip_zeros);
-      }
-      if (print_prec_hdr)
-      {
-        if (const auto *br_spm = dynamic_cast<const mfem::SparseMatrix *>(br.get()))
+        const auto *b_spm = dynamic_cast<const mfem::SparseMatrix *>(br_l.get());
+        if (!b_spm)
         {
-          HYPRE_BigInt nnz = br_spm->NumNonZeroElems();
+          b_spm = dynamic_cast<const mfem::SparseMatrix *>(bi_l.get());
+        }
+        if (b_spm)
+        {
+          HYPRE_BigInt nnz = b_spm->NumNonZeroElems();
           Mpi::GlobalSum(1, &nnz, fespace_l.GetComm());
           Mpi::Print(", {:d} NNZ\n", nnz);
         }
@@ -727,7 +781,7 @@ std::unique_ptr<OperType> SpaceOperator::GetPreconditionerMatrix(double a0, doub
           Mpi::Print("\n");
         }
       }
-      auto B_l = BuildLevelOperator(*B, std::move(br), std::move(bi), fespace_l);
+      auto B_l = BuildLevelParOperator(*B, std::move(br_l), std::move(bi_l), fespace_l);
       B_l->SetEssentialTrueDofs(dbc_tdof_lists_l, Operator::DiagonalPolicy::DIAG_ONE);
       if (aux)
       {
@@ -739,6 +793,7 @@ std::unique_ptr<OperType> SpaceOperator::GetPreconditionerMatrix(double a0, doub
       }
     }
   }
+
   print_prec_hdr = false;
   return B;
 }
diff --git a/palace/models/spaceoperator.hpp b/palace/models/spaceoperator.hpp
index 1b3c07f32..d65379489 100644
--- a/palace/models/spaceoperator.hpp
+++ b/palace/models/spaceoperator.hpp
@@ -169,10 +169,10 @@ class SpaceOperator
                                                   const ComplexOperator *K,
                                                   const ComplexOperator *M);
 
-  // Construct the real, optionally SPD matrix for frequency or time domain linear system
-  // preconditioning (Mr > 0, Mi < 0, |Mr + i Mi| is done on the material property
-  // coefficient, not the matrix entries themselves):
-  //             B = a0 K + a1 C -/+ a2 |Mr + i Mi| + A2r(a3) + A2i(a3) .
+  // Construct the matrix for frequency or time domain linear system preconditioning. If it
+  // is real-valued (Mr > 0, Mi < 0, |Mr + Mi| is done on the material property coefficient,
+  // not the matrix entries themselves):
+  //             B = a0 K + a1 C -/+ a2 |Mr + Mi| + A2r(a3) + A2i(a3) .
   template <typename OperType>
   std::unique_ptr<OperType> GetPreconditionerMatrix(double a0, double a1, double a2,
                                                     double a3);

From 4611552b46ad7d81fe81529f9633793bceda4fde Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Tue, 2 Jan 2024 09:31:43 -0800
Subject: [PATCH 3/5] Fix operator coarsening with OpenMP

---
 palace/fem/libceed/operator.cpp | 79 ++++++++++++++++++---------------
 palace/fem/libceed/operator.hpp |  4 +-
 2 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/palace/fem/libceed/operator.cpp b/palace/fem/libceed/operator.cpp
index 7333cdf4a..13534fdfe 100644
--- a/palace/fem/libceed/operator.cpp
+++ b/palace/fem/libceed/operator.cpp
@@ -59,7 +59,6 @@ void Operator::AssembleDiagonal(Vector &diag) const
   Ceed ceed;
   CeedMemType mem;
   CeedScalar *data;
-
   MFEM_VERIFY(diag.Size() == height, "Invalid size for diagonal vector!");
   diag = 0.0;
   PalaceCeedCallBackend(CeedOperatorGetCeed(ops[0], &ceed));
@@ -74,14 +73,15 @@ void Operator::AssembleDiagonal(Vector &diag) const
     mem = CEED_MEM_HOST;
   }
 
-  PalacePragmaOmp(parallel for private(ceed) schedule(static))
+  PalacePragmaOmp(parallel for schedule(static))
   for (std::size_t i = 0; i < ops.size(); i++)
   {
-    PalaceCeedCallBackend(CeedOperatorGetCeed(ops[i], &ceed));
-    PalaceCeedCall(ceed, CeedVectorSetArray(v[i], mem, CEED_USE_POINTER, data));
-    PalaceCeedCall(
-        ceed, CeedOperatorLinearAssembleAddDiagonal(ops[i], v[i], CEED_REQUEST_IMMEDIATE));
-    PalaceCeedCall(ceed, CeedVectorTakeArray(v[i], mem, nullptr));
+    Ceed ceed_i;
+    PalaceCeedCallBackend(CeedOperatorGetCeed(ops[i], &ceed_i));
+    PalaceCeedCall(ceed_i, CeedVectorSetArray(v[i], mem, CEED_USE_POINTER, data));
+    PalaceCeedCall(ceed_i, CeedOperatorLinearAssembleAddDiagonal(ops[i], v[i],
+                                                                 CEED_REQUEST_IMMEDIATE));
+    PalaceCeedCall(ceed_i, CeedVectorTakeArray(v[i], mem, nullptr));
   }
 }
 
@@ -96,7 +96,6 @@ inline void CeedAddMult(const std::vector<CeedOperator> &ops,
   CeedMemType mem;
   const CeedScalar *x_data;
   CeedScalar *y_data;
-
   PalaceCeedCallBackend(CeedOperatorGetCeed(ops[0], &ceed));
   PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem));
   if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
@@ -111,19 +110,20 @@ inline void CeedAddMult(const std::vector<CeedOperator> &ops,
     mem = CEED_MEM_HOST;
   }
 
-  PalacePragmaOmp(parallel for private(ceed) schedule(static))
+  PalacePragmaOmp(parallel for schedule(static))
   for (std::size_t i = 0; i < ops.size(); i++)
   {
     if (ops[i])  // No-op for an empty operator
     {
-      PalaceCeedCallBackend(CeedOperatorGetCeed(ops[i], &ceed));
-      PalaceCeedCall(ceed, CeedVectorSetArray(u[i], mem, CEED_USE_POINTER,
-                                              const_cast<CeedScalar *>(x_data)));
-      PalaceCeedCall(ceed, CeedVectorSetArray(v[i], mem, CEED_USE_POINTER, y_data));
-      PalaceCeedCall(ceed,
+      Ceed ceed_i;
+      PalaceCeedCallBackend(CeedOperatorGetCeed(ops[i], &ceed_i));
+      PalaceCeedCall(ceed_i, CeedVectorSetArray(u[i], mem, CEED_USE_POINTER,
+                                                const_cast<CeedScalar *>(x_data)));
+      PalaceCeedCall(ceed_i, CeedVectorSetArray(v[i], mem, CEED_USE_POINTER, y_data));
+      PalaceCeedCall(ceed_i,
                      CeedOperatorApplyAdd(ops[i], u[i], v[i], CEED_REQUEST_IMMEDIATE));
-      PalaceCeedCall(ceed, CeedVectorTakeArray(u[i], mem, nullptr));
-      PalaceCeedCall(ceed, CeedVectorTakeArray(v[i], mem, nullptr));
+      PalaceCeedCall(ceed_i, CeedVectorTakeArray(u[i], mem, nullptr));
+      PalaceCeedCall(ceed_i, CeedVectorTakeArray(v[i], mem, nullptr));
     }
   }
 }
@@ -283,17 +283,19 @@ void CeedOperatorAssembleCOO(const Operator &op, bool skip_zeros, CeedSize *nnz,
     *mem = CEED_MEM_HOST;
   }
 
-  PalacePragmaOmp(parallel for private(ceed) schedule(static))
+  PalacePragmaOmp(parallel for schedule(static))
   for (std::size_t i = 0; i < op.Size(); i++)
   {
+    Ceed ceed_i;
+    PalaceCeedCallBackend(CeedOperatorGetCeed(op[i], &ceed_i));
+
     // Assemble sparsity pattern (rows, cols are always host pointers).
-    PalaceCeedCallBackend(CeedOperatorGetCeed(op[i], &ceed));
-    PalaceCeedCall(ceed, CeedOperatorLinearAssembleSymbolic(op[i], &loc_nnz[i],
-                                                            &loc_rows[i], &loc_cols[i]));
+    PalaceCeedCall(ceed_i, CeedOperatorLinearAssembleSymbolic(op[i], &loc_nnz[i],
+                                                              &loc_rows[i], &loc_cols[i]));
 
     // Assemble values.
-    PalaceCeedCall(ceed, CeedVectorCreate(ceed, loc_nnz[i], &loc_vals[i]));
-    PalaceCeedCall(ceed, CeedOperatorLinearAssemble(op[i], loc_vals[i]));
+    PalaceCeedCall(ceed_i, CeedVectorCreate(ceed_i, loc_nnz[i], &loc_vals[i]));
+    PalaceCeedCall(ceed_i, CeedOperatorLinearAssemble(op[i], loc_vals[i]));
   }
 
   loc_offsets[0] = 0;
@@ -314,7 +316,7 @@ void CeedOperatorAssembleCOO(const Operator &op, bool skip_zeros, CeedSize *nnz,
     PalaceCeedCall(ceed, CeedVectorCreate(ceed, *nnz, vals));
     PalaceCeedCall(ceed, CeedVectorGetArrayWrite(*vals, *mem, &vals_array));
 
-    PalacePragmaOmp(parallel for private(ceed) schedule(static))
+    PalacePragmaOmp(parallel for schedule(static))
     for (std::size_t i = 0; i < op.Size(); i++)
     {
       const auto start = loc_offsets[i];
@@ -326,9 +328,10 @@ void CeedOperatorAssembleCOO(const Operator &op, bool skip_zeros, CeedSize *nnz,
       }
 
       // The CeedVector is on only on device when MFEM is also using the device.
+      Ceed ceed_i;
       const CeedScalar *loc_vals_array;
-      PalaceCeedCallBackend(CeedVectorGetCeed(loc_vals[i], &ceed));
-      PalaceCeedCall(ceed, CeedVectorGetArrayRead(loc_vals[i], *mem, &loc_vals_array));
+      PalaceCeedCallBackend(CeedVectorGetCeed(loc_vals[i], &ceed_i));
+      PalaceCeedCall(ceed_i, CeedVectorGetArrayRead(loc_vals[i], *mem, &loc_vals_array));
       if (*mem != CEED_MEM_HOST)
       {
         mfem::forall(end - start, [=] MFEM_HOST_DEVICE(int k)
@@ -341,10 +344,10 @@ void CeedOperatorAssembleCOO(const Operator &op, bool skip_zeros, CeedSize *nnz,
           vals_array[k] = loc_vals_array[k - start];
         }
       }
-      PalaceCeedCall(ceed, CeedVectorRestoreArrayRead(loc_vals[i], &loc_vals_array));
-      PalaceCeedCall(ceed, CeedInternalFree(&loc_rows[i]));
-      PalaceCeedCall(ceed, CeedInternalFree(&loc_cols[i]));
-      PalaceCeedCall(ceed, CeedVectorDestroy(&loc_vals[i]));
+      PalaceCeedCall(ceed_i, CeedVectorRestoreArrayRead(loc_vals[i], &loc_vals_array));
+      PalaceCeedCall(ceed_i, CeedInternalFree(&loc_rows[i]));
+      PalaceCeedCall(ceed_i, CeedInternalFree(&loc_cols[i]));
+      PalaceCeedCall(ceed_i, CeedVectorDestroy(&loc_vals[i]));
     }
 
     PalaceCeedCall(ceed, CeedVectorRestoreArray(*vals, &vals_array));
@@ -508,17 +511,23 @@ std::unique_ptr<Operator> CeedOperatorCoarsen(const Operator &op_fine,
                                                        fespace_coarse.GetVSize());
 
   // Assemble the coarse operator by coarsening each sub-operator (over threads, geometry
-  // types, integrators) of the original fine operator. We loop over Ceed contexts because
-  // extracting the Ceed context from a CeedOperator returns a different pointer (created
-  // with CeedReferenceCopy) and we need the original ones to access the FiniteElementSpace
-  // and Mesh object caches.
+  // types, integrators) of the original fine operator.
   MFEM_VERIFY(internal::GetCeedObjects().size() == op_fine.Size(),
               "Unexpected size mismatch in multithreaded Ceed contexts!");
-  const std::size_t nt = internal::GetCeedObjects().size();
+  const std::size_t nt = op_fine.Size();
   PalacePragmaOmp(parallel for schedule(static))
   for (std::size_t i = 0; i < nt; i++)
   {
-    Ceed ceed = internal::GetCeedObjects()[i];
+    Ceed ceed;
+    PalaceCeedCallBackend(CeedOperatorGetCeed(op_fine[i], &ceed));
+    {
+      Ceed ceed_parent;
+      PalaceCeedCall(ceed, CeedGetParent(ceed, &ceed_parent));
+      if (ceed_parent)
+      {
+        ceed = ceed_parent;
+      }
+    }
 
     // Initialize the composite operator on each thread.
     CeedOperator loc_op;
diff --git a/palace/fem/libceed/operator.hpp b/palace/fem/libceed/operator.hpp
index 4937329e2..478bd8330 100644
--- a/palace/fem/libceed/operator.hpp
+++ b/palace/fem/libceed/operator.hpp
@@ -69,8 +69,8 @@ std::unique_ptr<mfem::SparseMatrix> CeedOperatorFullAssemble(const Operator &op,
                                                              bool skip_zeros, bool set);
 
 // Construct a coarse-level ceed::Operator, reusing the quadrature data and quadrature
-// function from the fine-level operator. Only available for square operators (same input
-// and output spaces).
+// function from the fine-level operator. Only available for square, symmetric operators
+// (same input and output spaces).
 std::unique_ptr<Operator> CeedOperatorCoarsen(const Operator &op_fine,
                                               const FiniteElementSpace &fespace_coarse);
 

From 8ec1ad40ae3b2cf2dcb25070932b4680dd78adc2 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Tue, 9 Jan 2024 13:10:38 -0800
Subject: [PATCH 4/5] Optimize temporary vector allocation as intermediate
 storage in Operator::Mult methods

---
 palace/fem/fespace.hpp           |  50 +++++
 palace/fem/libceed/operator.cpp  |  52 ++---
 palace/fem/libceed/operator.hpp  |   4 +-
 palace/linalg/ams.cpp            |   6 +-
 palace/linalg/chebyshev.cpp      |   7 +-
 palace/linalg/chebyshev.hpp      |  40 +++-
 palace/linalg/distrelaxation.cpp |  15 +-
 palace/linalg/distrelaxation.hpp |  19 +-
 palace/linalg/errorestimator.cpp |  54 ++---
 palace/linalg/errorestimator.hpp |   5 +-
 palace/linalg/gmg.cpp            |   4 +-
 palace/linalg/operator.cpp       | 122 +++++-------
 palace/linalg/operator.hpp       |   1 +
 palace/linalg/rap.cpp            | 332 ++++++++++++++++++-------------
 palace/linalg/rap.hpp            |  57 +++---
 palace/linalg/solver.hpp         |  15 ++
 palace/linalg/vector.cpp         |   4 +
 17 files changed, 442 insertions(+), 345 deletions(-)

diff --git a/palace/fem/fespace.hpp b/palace/fem/fespace.hpp
index 470cf7d10..f7fbd3033 100644
--- a/palace/fem/fespace.hpp
+++ b/palace/fem/fespace.hpp
@@ -10,6 +10,7 @@
 #include "fem/libceed/ceed.hpp"
 #include "fem/mesh.hpp"
 #include "linalg/operator.hpp"
+#include "linalg/vector.hpp"
 
 namespace palace
 {
@@ -30,6 +31,9 @@ class FiniteElementSpace
   mutable ceed::CeedObjectMap<CeedBasis> basis;
   mutable ceed::CeedObjectMap<CeedElemRestriction> restr, interp_restr, interp_range_restr;
 
+  // Temporary storage for operator applications.
+  mutable ComplexVector tx, lx, ly;
+
   bool HasUniqueInterpRestriction(const mfem::FiniteElement &fe) const
   {
     // For interpolation operators and tensor-product elements, we need native (not
@@ -75,12 +79,16 @@ class FiniteElementSpace
 
   auto GetVDim() const { return Get().GetVDim(); }
   auto GetVSize() const { return Get().GetVSize(); }
+  auto GlobalVSize() const { return Get().GlobalVSize(); }
   auto GetTrueVSize() const { return Get().GetTrueVSize(); }
   auto GlobalTrueVSize() const { return Get().GlobalTrueVSize(); }
   auto Dimension() const { return mesh.Get().Dimension(); }
   auto SpaceDimension() const { return mesh.Get().SpaceDimension(); }
   auto GetMaxElementOrder() const { return Get().GetMaxElementOrder(); }
 
+  const auto *GetProlongationMatrix() const { return Get().GetProlongationMatrix(); }
+  const auto *GetRestrictionMatrix() const { return Get().GetRestrictionMatrix(); }
+
   // Return the basis object for elements of the given element geometry type.
   const CeedBasis GetCeedBasis(Ceed ceed, mfem::Geometry::Type geom) const;
 
@@ -115,6 +123,48 @@ class FiniteElementSpace
                            mfem::Geometry::Type geom, const std::vector<int> &indices,
                            bool is_interp = false, bool is_interp_range = false);
 
+  template <typename VecType>
+  auto &GetTVector() const
+  {
+    tx.SetSize(GetTrueVSize());
+    if constexpr (std::is_same<VecType, ComplexVector>::value)
+    {
+      return tx;
+    }
+    else
+    {
+      return tx.Real();
+    }
+  }
+
+  template <typename VecType>
+  auto &GetLVector() const
+  {
+    lx.SetSize(GetVSize());
+    if constexpr (std::is_same<VecType, ComplexVector>::value)
+    {
+      return lx;
+    }
+    else
+    {
+      return lx.Real();
+    }
+  }
+
+  template <typename VecType>
+  auto &GetLVector2() const
+  {
+    ly.SetSize(GetVSize());
+    if constexpr (std::is_same<VecType, ComplexVector>::value)
+    {
+      return ly;
+    }
+    else
+    {
+      return ly.Real();
+    }
+  }
+
   // Get the associated MPI communicator.
   MPI_Comm GetComm() const { return fespace.GetComm(); }
 };
diff --git a/palace/fem/libceed/operator.cpp b/palace/fem/libceed/operator.cpp
index 13534fdfe..5d66c2c06 100644
--- a/palace/fem/libceed/operator.cpp
+++ b/palace/fem/libceed/operator.cpp
@@ -142,67 +142,43 @@ void Operator::Mult(const Vector &x, Vector &y) const
 
 void Operator::AddMult(const Vector &x, Vector &y, const double a) const
 {
-  if (a == 1.0 && dof_multiplicity.Size() == 0)
-  {
-    CeedAddMult(ops, u, v, x, y);
-  }
-  else
+  MFEM_VERIFY(a == 1.0, "ceed::Operator::AddMult only supports coefficient = 1.0!");
+  if (dof_multiplicity.Size() > 0)
   {
-    Vector &temp = (height == width) ? temp_v : temp_u;
     temp.SetSize(height);
-    temp.UseDevice(true);
     temp = 0.0;
     CeedAddMult(ops, u, v, x, temp);
     if (dof_multiplicity.Size() > 0)
     {
       temp *= dof_multiplicity;
     }
-    y.Add(a, temp);
+    y += temp;
+  }
+  else
+  {
+    CeedAddMult(ops, u, v, x, y);
   }
 }
 
 void Operator::MultTranspose(const Vector &x, Vector &y) const
 {
   y = 0.0;
-  if (dof_multiplicity.Size() > 0)
-  {
-    temp_v = x;
-    temp_v *= dof_multiplicity;
-    CeedAddMult(ops_t, v, u, temp_v, y);
-  }
-  else
-  {
-    CeedAddMult(ops_t, v, u, x, y);
-  }
+  AddMultTranspose(x, y);
 }
 
 void Operator::AddMultTranspose(const Vector &x, Vector &y, const double a) const
 {
-  auto AddMultTransposeImpl = [this](const Vector &x_, Vector &y_, const double a_)
-  {
-    if (a_ == 1.0)
-    {
-      CeedAddMult(ops_t, v, u, x_, y_);
-    }
-    else
-    {
-      Vector &temp = (height == width && dof_multiplicity.Size() == 0) ? temp_v : temp_u;
-      temp.SetSize(width);
-      temp.UseDevice(true);
-      temp = 0.0;
-      CeedAddMult(ops_t, v, u, x_, temp);
-      y_.Add(a_, temp);
-    }
-  };
+  MFEM_VERIFY(a == 1.0,
+              "ceed::Operator::AddMultTranspose only supports coefficient = 1.0!");
   if (dof_multiplicity.Size() > 0)
   {
-    temp_v = x;
-    temp_v *= dof_multiplicity;
-    AddMultTransposeImpl(temp_v, y, a);
+    temp = x;
+    temp *= dof_multiplicity;
+    CeedAddMult(ops_t, v, u, temp, y);
   }
   else
   {
-    AddMultTransposeImpl(x, y, a);
+    CeedAddMult(ops_t, v, u, x, y);
   }
 }
 
diff --git a/palace/fem/libceed/operator.hpp b/palace/fem/libceed/operator.hpp
index 478bd8330..687870c44 100644
--- a/palace/fem/libceed/operator.hpp
+++ b/palace/fem/libceed/operator.hpp
@@ -26,10 +26,10 @@ class Operator : public palace::Operator
   std::vector<CeedOperator> ops, ops_t;
   std::vector<CeedVector> u, v;
   Vector dof_multiplicity;
-  mutable Vector temp_u, temp_v;
+  mutable Vector temp;
 
 public:
-  Operator(int h, int w) : palace::Operator(h, w) {}
+  Operator(int h, int w) : palace::Operator(h, w) { temp.UseDevice(true); }
   ~Operator() override;
 
   CeedOperator operator[](std::size_t i) const { return ops[i]; }
diff --git a/palace/linalg/ams.cpp b/palace/linalg/ams.cpp
index 7a357afeb..4b8814dbf 100644
--- a/palace/linalg/ams.cpp
+++ b/palace/linalg/ams.cpp
@@ -115,9 +115,9 @@ void HypreAmsSolver::ConstructAuxiliaryMatrices(FiniteElementSpace &nd_fespace,
   else
   {
     // Fall back to MFEM legacy assembly for identity interpolator.
-    mfem::ParFiniteElementSpace h1d_fespace(&mesh, &h1_fespace.GetFEColl(), space_dim,
-                                            mfem::Ordering::byVDIM);
-    mfem::DiscreteLinearOperator pi(&h1d_fespace, &nd_fespace.Get());
+    FiniteElementSpace h1d_fespace(h1_fespace.GetMesh(), &h1_fespace.GetFEColl(), space_dim,
+                                   mfem::Ordering::byVDIM);
+    mfem::DiscreteLinearOperator pi(&h1d_fespace.Get(), &nd_fespace.Get());
     pi.AddDomainInterpolator(new mfem::IdentityInterpolator);
     pi.SetAssemblyLevel(mfem::AssemblyLevel::LEGACY);
     pi.Assemble();
diff --git a/palace/linalg/chebyshev.cpp b/palace/linalg/chebyshev.cpp
index e9f8e8cd0..82f6e9255 100644
--- a/palace/linalg/chebyshev.cpp
+++ b/palace/linalg/chebyshev.cpp
@@ -162,7 +162,6 @@ template <typename OperType>
 void ChebyshevSmoother<OperType>::SetOperator(const OperType &op)
 {
   A = &op;
-  r.SetSize(op.Height());
   d.SetSize(op.Height());
   dinv.SetSize(op.Height());
   op.AssembleDiagonal(dinv);
@@ -177,7 +176,7 @@ void ChebyshevSmoother<OperType>::SetOperator(const OperType &op)
 }
 
 template <typename OperType>
-void ChebyshevSmoother<OperType>::Mult(const VecType &x, VecType &y) const
+void ChebyshevSmoother<OperType>::Mult2(const VecType &x, VecType &y, VecType &r) const
 {
   // Apply smoother: y = y + p(A) (x - A y) .
   for (int it = 0; it < pc_it; it++)
@@ -222,7 +221,6 @@ template <typename OperType>
 void ChebyshevSmoother1stKind<OperType>::SetOperator(const OperType &op)
 {
   A = &op;
-  r.SetSize(op.Height());
   d.SetSize(op.Height());
   dinv.SetSize(op.Height());
   op.AssembleDiagonal(dinv);
@@ -244,7 +242,8 @@ void ChebyshevSmoother1stKind<OperType>::SetOperator(const OperType &op)
 }
 
 template <typename OperType>
-void ChebyshevSmoother1stKind<OperType>::Mult(const VecType &x, VecType &y) const
+void ChebyshevSmoother1stKind<OperType>::Mult2(const VecType &x, VecType &y,
+                                               VecType &r) const
 {
   // Apply smoother: y = y + p(A) (x - A y) .
   for (int it = 0; it < pc_it; it++)
diff --git a/palace/linalg/chebyshev.hpp b/palace/linalg/chebyshev.hpp
index e37da8fde..a2de1e340 100644
--- a/palace/linalg/chebyshev.hpp
+++ b/palace/linalg/chebyshev.hpp
@@ -40,19 +40,31 @@ class ChebyshevSmoother : public Solver<OperType>
   // Maximum operator eigenvalue for Chebyshev polynomial smoothing.
   double lambda_max, sf_max;
 
-  // Temporary vectors for smoother application.
-  mutable VecType r, d;
+  // Temporary vector for smoother application.
+  mutable VecType d;
 
 public:
   ChebyshevSmoother(MPI_Comm comm, int smooth_it, int poly_order, double sf_max);
 
   void SetOperator(const OperType &op) override;
 
-  void Mult(const VecType &x, VecType &y) const override;
+  void Mult(const VecType &x, VecType &y) const override
+  {
+    MFEM_ABORT("ChebyshevSmoother implements Mult2 using an additional preallocated "
+               "temporary vector!");
+  }
 
   void MultTranspose(const VecType &x, VecType &y) const override
   {
-    Mult(x, y);  // Assumes operator symmetry
+    MFEM_ABORT("ChebyshevSmoother implements MultTranspose2 using an additional "
+               "preallocated temporary vector!");
+  }
+
+  void Mult2(const VecType &x, VecType &y, VecType &r) const override;
+
+  void MultTranspose2(const VecType &x, VecType &y, VecType &r) const override
+  {
+    Mult2(x, y, r);  // Assumes operator symmetry
   }
 };
 
@@ -84,8 +96,8 @@ class ChebyshevSmoother1stKind : public Solver<OperType>
   // polynomial smoothing.
   double theta, delta, sf_max, sf_min;
 
-  // Temporary vectors for smoother application.
-  mutable VecType r, d;
+  // Temporary vector for smoother application.
+  mutable VecType d;
 
 public:
   ChebyshevSmoother1stKind(MPI_Comm comm, int smooth_it, int poly_order, double sf_max,
@@ -93,11 +105,23 @@ class ChebyshevSmoother1stKind : public Solver<OperType>
 
   void SetOperator(const OperType &op) override;
 
-  void Mult(const VecType &x, VecType &y) const override;
+  void Mult(const VecType &x, VecType &y) const override
+  {
+    MFEM_ABORT("ChebyshevSmoother1stKind implements Mult2 using an additional preallocated "
+               "temporary vector!");
+  }
 
   void MultTranspose(const VecType &x, VecType &y) const override
   {
-    Mult(x, y);  // Assumes operator symmetry
+    MFEM_ABORT("ChebyshevSmoother1stKind implements MultTranspose2 using an additional "
+               "preallocated temporary vector!");
+  }
+
+  void Mult2(const VecType &x, VecType &y, VecType &r) const override;
+
+  void MultTranspose2(const VecType &x, VecType &y, VecType &r) const override
+  {
+    Mult2(x, y, r);  // Assumes operator symmetry
   }
 };
 
diff --git a/palace/linalg/distrelaxation.cpp b/palace/linalg/distrelaxation.cpp
index 8eaa55696..0958a6865 100644
--- a/palace/linalg/distrelaxation.cpp
+++ b/palace/linalg/distrelaxation.cpp
@@ -48,9 +48,9 @@ void DistRelaxationSmoother<OperType>::SetOperators(const OperType &op,
               "Invalid operator sizes for DistRelaxationSmoother!");
   A = &op;
   A_G = &op_G;
-  r.SetSize(op.Height());
   x_G.SetSize(op_G.Height());
   y_G.SetSize(op_G.Height());
+  r_G.SetSize(op_G.Height());
 
   const auto *PtAP_G = dynamic_cast<const ParOperType *>(&op_G);
   MFEM_VERIFY(PtAP_G,
@@ -93,14 +93,14 @@ inline void RealMultTranspose(const Operator &op, const ComplexVector &x, Comple
 }  // namespace
 
 template <typename OperType>
-void DistRelaxationSmoother<OperType>::Mult(const VecType &x, VecType &y) const
+void DistRelaxationSmoother<OperType>::Mult2(const VecType &x, VecType &y, VecType &r) const
 {
   // Apply smoother.
   for (int it = 0; it < pc_it; it++)
   {
     // y = y + B (x - A y)
     B->SetInitialGuess(this->initial_guess || it > 0);
-    B->Mult(x, y);
+    B->Mult2(x, y, r);
 
     // y = y + G B_G Gᵀ (x - A y)
     A->Mult(y, r);
@@ -110,13 +110,14 @@ void DistRelaxationSmoother<OperType>::Mult(const VecType &x, VecType &y) const
     {
       linalg::SetSubVector(x_G, *dbc_tdof_list_G, 0.0);
     }
-    B_G->Mult(x_G, y_G);
+    B_G->Mult2(x_G, y_G, r_G);
     RealAddMult(*G, y_G, y);
   }
 }
 
 template <typename OperType>
-void DistRelaxationSmoother<OperType>::MultTranspose(const VecType &x, VecType &y) const
+void DistRelaxationSmoother<OperType>::MultTranspose2(const VecType &x, VecType &y,
+                                                      VecType &r) const
 {
   // Apply transpose.
   B->SetInitialGuess(true);
@@ -138,11 +139,11 @@ void DistRelaxationSmoother<OperType>::MultTranspose(const VecType &x, VecType &
     {
       linalg::SetSubVector(x_G, *dbc_tdof_list_G, 0.0);
     }
-    B_G->MultTranspose(x_G, y_G);
+    B_G->MultTranspose2(x_G, y_G, r_G);
     RealAddMult(*G, y_G, y);
 
     // y = y + Bᵀ (x - A y)
-    B->MultTranspose(x, y);
+    B->MultTranspose2(x, y, r);
   }
 }
 
diff --git a/palace/linalg/distrelaxation.hpp b/palace/linalg/distrelaxation.hpp
index cee3373ec..77969ffe4 100644
--- a/palace/linalg/distrelaxation.hpp
+++ b/palace/linalg/distrelaxation.hpp
@@ -47,7 +47,7 @@ class DistRelaxationSmoother : public Solver<OperType>
   std::unique_ptr<Solver<OperType>> B_G;
 
   // Temporary vectors for smoother application.
-  mutable VecType r, x_G, y_G;
+  mutable VecType x_G, y_G, r_G;
 
 public:
   DistRelaxationSmoother(MPI_Comm comm, const Operator &G, int smooth_it,
@@ -59,11 +59,24 @@ class DistRelaxationSmoother : public Solver<OperType>
     MFEM_ABORT("SetOperator with a single operator is not implemented for "
                "DistRelaxationSmoother, use the two argument signature instead!");
   }
+
   void SetOperators(const OperType &op, const OperType &op_G);
 
-  void Mult(const VecType &x, VecType &y) const override;
+  void Mult(const VecType &x, VecType &y) const override
+  {
+    MFEM_ABORT("DistRelaxationSmoother implements Mult2 using an additional preallocated "
+               "temporary vector!");
+  }
+
+  void MultTranspose(const VecType &x, VecType &y) const override
+  {
+    MFEM_ABORT("DistRelaxationSmoother implements MultTranspose2 using an additional "
+               "preallocated temporary vector!");
+  }
+
+  void Mult2(const VecType &x, VecType &y, VecType &r) const override;
 
-  void MultTranspose(const VecType &x, VecType &y) const override;
+  void MultTranspose2(const VecType &x, VecType &y, VecType &r) const override;
 };
 
 }  // namespace palace
diff --git a/palace/linalg/errorestimator.cpp b/palace/linalg/errorestimator.cpp
index b438570b5..914bd0a08 100644
--- a/palace/linalg/errorestimator.cpp
+++ b/palace/linalg/errorestimator.cpp
@@ -97,44 +97,31 @@ FluxProjector::FluxProjector(const MaterialOperator &mat_op,
   rhs.SetSize(h1d_fespace.GetTrueVSize());
 }
 
-template <typename VecType>
-void FluxProjector::Mult(const VecType &x, VecType &y) const
+void FluxProjector::Mult(const Vector &x, Vector &y) const
 {
   BlockTimer bt(Timer::SOLVEESTIMATOR);
   MFEM_ASSERT(y.Size() == rhs.Size(), "Invalid vector dimensions for FluxProjector::Mult!");
   MFEM_ASSERT(
       y.Size() % x.Size() == 0,
       "Invalid vector dimension for FluxProjector::Mult, does not yield even blocking!");
-  auto MultImpl = [this](const Vector &x_, Vector &y_)
+  const int vdim = y.Size() / x.Size();
+  Flux->Mult(x, rhs);
+  if (vdim == 1)
   {
-    const int vdim = y_.Size() / x_.Size();
-    Flux->Mult(x_, rhs);
-    if (vdim == 1)
-    {
-      // Mpi::Print(" Computing smooth flux projection for error estimation\n");
-      ksp->Mult(rhs, y_);
-    }
-    else
-    {
-      for (int i = 0; i < vdim; i++)
-      {
-        // Mpi::Print(" Computing smooth flux projection of flux component {:d}/{:d} for "
-        //            "error estimation\n",
-        //            i + 1, vdim);
-        const Vector rhsb(rhs, i * x_.Size(), x_.Size());
-        Vector yb(y_, i * x_.Size(), x_.Size());
-        ksp->Mult(rhsb, yb);
-      }
-    }
-  };
-  if constexpr (std::is_same<VecType, ComplexVector>::value)
-  {
-    MultImpl(x.Real(), y.Real());
-    MultImpl(x.Imag(), y.Imag());
+    // Mpi::Print(" Computing smooth flux projection for error estimation\n");
+    ksp->Mult(rhs, y);
   }
   else
   {
-    MultImpl(x, y);
+    for (int i = 0; i < vdim; i++)
+    {
+      // Mpi::Print(" Computing smooth flux projection of flux component {:d}/{:d} for "
+      //            "error estimation\n",
+      //            i + 1, vdim);
+      const Vector rhsb(rhs, i * x.Size(), x.Size());
+      Vector yb(y, i * x.Size(), x.Size());
+      ksp->Mult(rhsb, yb);
+    }
   }
 }
 
@@ -154,16 +141,18 @@ ErrorIndicator CurlFluxErrorEstimator<VecType>::ComputeIndicators(const VecType
   // Compute the projection of the discontinuous flux onto the smooth finite element space
   // and populate the corresponding grid functions.
   BlockTimer bt(Timer::ESTIMATION);
-  projector.Mult(U, F);
   if constexpr (std::is_same<VecType, ComplexVector>::value)
   {
-    F_gf.real().SetFromTrueDofs(F.Real());
-    F_gf.imag().SetFromTrueDofs(F.Imag());
+    projector.Mult(U.Real(), F);
+    F_gf.real().SetFromTrueDofs(F);
+    projector.Mult(U.Imag(), F);
+    F_gf.imag().SetFromTrueDofs(F);
     U_gf.real().SetFromTrueDofs(U.Real());
     U_gf.imag().SetFromTrueDofs(U.Imag());
   }
   else
   {
+    projector.Mult(U, F);
     F_gf.SetFromTrueDofs(F);
     U_gf.SetFromTrueDofs(U);
   }
@@ -346,9 +335,6 @@ ErrorIndicator GradFluxErrorEstimator::ComputeIndicators(const Vector &U) const
   return ErrorIndicator(std::move(estimates));
 }
 
-template void FluxProjector::Mult(const Vector &, Vector &) const;
-template void FluxProjector::Mult(const ComplexVector &, ComplexVector &) const;
-
 template class CurlFluxErrorEstimator<Vector>;
 template class CurlFluxErrorEstimator<ComplexVector>;
 
diff --git a/palace/linalg/errorestimator.hpp b/palace/linalg/errorestimator.hpp
index 852829420..60e3aa5f8 100644
--- a/palace/linalg/errorestimator.hpp
+++ b/palace/linalg/errorestimator.hpp
@@ -43,8 +43,7 @@ class FluxProjector
   FluxProjector(const MaterialOperator &mat_op, const FiniteElementSpace &h1_fespace,
                 const FiniteElementSpace &h1d_fespace, double tol, int max_it, int print);
 
-  template <typename VecType>
-  void Mult(const VecType &x, VecType &y) const;
+  void Mult(const Vector &x, Vector &y) const;
 };
 
 // Class used for computing curl flux error estimate, i.e. || μ⁻¹ ∇ × Uₕ - F ||_K where F
@@ -66,7 +65,7 @@ class CurlFluxErrorEstimator
   FluxProjector projector;
 
   // Temporary vectors for error estimation.
-  mutable VecType F;
+  mutable Vector F;
   mutable GridFunctionType F_gf, U_gf;
 
 public:
diff --git a/palace/linalg/gmg.cpp b/palace/linalg/gmg.cpp
index 27bd1961d..2b02f2b28 100644
--- a/palace/linalg/gmg.cpp
+++ b/palace/linalg/gmg.cpp
@@ -178,7 +178,7 @@ void GeometricMultigridSolver<OperType>::VCycle(int l, bool initial_guess) const
     B[l]->Mult(X[l], Y[l]);
     return;
   }
-  B[l]->Mult(X[l], Y[l]);
+  B[l]->Mult2(X[l], Y[l], R[l]);
 
   // Compute residual.
   A[l]->Mult(Y[l], R[l]);
@@ -198,7 +198,7 @@ void GeometricMultigridSolver<OperType>::VCycle(int l, bool initial_guess) const
 
   // Post-smooth, with nonzero initial guess.
   B[l]->SetInitialGuess(true);
-  B[l]->MultTranspose(X[l], Y[l]);
+  B[l]->MultTranspose2(X[l], Y[l], R[l]);
 }
 
 template class GeometricMultigridSolver<Operator>;
diff --git a/palace/linalg/operator.cpp b/palace/linalg/operator.cpp
index 8f655a4dc..8723ade41 100644
--- a/palace/linalg/operator.cpp
+++ b/palace/linalg/operator.cpp
@@ -101,15 +101,16 @@ void ComplexWrapperOperator::Mult(const ComplexVector &x, ComplexVector &y) cons
   const Vector &xi = x.Imag();
   Vector &yr = y.Real();
   Vector &yi = y.Imag();
-  if (Ar)
+  if (Ai)
   {
-    if (!zero_real)
+    if (!zero_imag)
     {
-      Ar->Mult(xr, yr);
+      Ai->Mult(xi, yr);
+      yr *= -1.0;
     }
-    if (!zero_imag)
+    if (!zero_real)
     {
-      Ar->Mult(xi, yi);
+      Ai->Mult(xr, yi);
     }
   }
   else
@@ -117,15 +118,15 @@ void ComplexWrapperOperator::Mult(const ComplexVector &x, ComplexVector &y) cons
     yr = 0.0;
     yi = 0.0;
   }
-  if (Ai)
+  if (Ar)
   {
-    if (!zero_imag)
+    if (!zero_real)
     {
-      Ai->AddMult(xi, yr, -1.0);
+      Ar->AddMult(xr, yr);
     }
-    if (!zero_real)
+    if (!zero_imag)
     {
-      Ai->AddMult(xr, yi, 1.0);
+      Ar->AddMult(xi, yi);
     }
   }
 }
@@ -138,15 +139,16 @@ void ComplexWrapperOperator::MultTranspose(const ComplexVector &x, ComplexVector
   const Vector &xi = x.Imag();
   Vector &yr = y.Real();
   Vector &yi = y.Imag();
-  if (Ar)
+  if (Ai)
   {
-    if (!zero_real)
+    if (!zero_imag)
     {
-      Ar->MultTranspose(xr, yr);
+      Ai->MultTranspose(xi, yr);
+      yr *= -1.0;
     }
-    if (!zero_imag)
+    if (!zero_real)
     {
-      Ar->MultTranspose(xi, yi);
+      Ai->MultTranspose(xr, yi);
     }
   }
   else
@@ -154,15 +156,15 @@ void ComplexWrapperOperator::MultTranspose(const ComplexVector &x, ComplexVector
     yr = 0.0;
     yi = 0.0;
   }
-  if (Ai)
+  if (Ar)
   {
-    if (!zero_imag)
+    if (!zero_real)
     {
-      Ai->AddMultTranspose(xi, yr, -1.0);
+      Ar->AddMultTranspose(xr, yr);
     }
-    if (!zero_real)
+    if (!zero_imag)
     {
-      Ai->AddMultTranspose(xr, yi, 1.0);
+      Ar->AddMultTranspose(xi, yi);
     }
   }
 }
@@ -176,15 +178,16 @@ void ComplexWrapperOperator::MultHermitianTranspose(const ComplexVector &x,
   const Vector &xi = x.Imag();
   Vector &yr = y.Real();
   Vector &yi = y.Imag();
-  if (Ar)
+  if (Ai)
   {
-    if (!zero_real)
+    if (!zero_imag)
     {
-      Ar->MultTranspose(xr, yr);
+      Ai->MultTranspose(xi, yr);
     }
-    if (!zero_imag)
+    if (!zero_real)
     {
-      Ar->MultTranspose(xi, yi);
+      Ai->MultTranspose(xr, yi);
+      yi *= -1.0;
     }
   }
   else
@@ -192,15 +195,15 @@ void ComplexWrapperOperator::MultHermitianTranspose(const ComplexVector &x,
     yr = 0.0;
     yi = 0.0;
   }
-  if (Ai)
+  if (Ar)
   {
-    if (!zero_imag)
+    if (!zero_real)
     {
-      Ai->AddMultTranspose(xi, yr, 1.0);
+      Ar->AddMultTranspose(xr, yr);
     }
-    if (!zero_real)
+    if (!zero_imag)
     {
-      Ai->AddMultTranspose(xr, yi, -1.0);
+      Ar->AddMultTranspose(xi, yi);
     }
   }
 }
@@ -214,23 +217,14 @@ void ComplexWrapperOperator::AddMult(const ComplexVector &x, ComplexVector &y,
   const Vector &xi = x.Imag();
   Vector &yr = y.Real();
   Vector &yi = y.Imag();
+  // MFEM_VERIFY(a.real() == 0.0 || a.imag() == 0.0,
+  //             "ComplexWrapperOperator::AddMult does not support a general complex-valued
+  //             " "coefficient!");
   if (a.real() != 0.0 && a.imag() != 0.0)
   {
     ty.SetSize(height);
     Mult(x, ty);
-    const int N = height;
-    const double ar = a.real();
-    const double ai = a.imag();
-    const auto *TYR = ty.Real().Read();
-    const auto *TYI = ty.Imag().Read();
-    auto *YR = yr.ReadWrite();
-    auto *YI = yi.ReadWrite();
-    mfem::forall(N,
-                 [=] MFEM_HOST_DEVICE(int i)
-                 {
-                   YR[i] += ar * TYR[i] - ai * TYI[i];
-                   YI[i] += ai * TYR[i] + ar * TYI[i];
-                 });
+    y.AXPY(a, ty);
   }
   else if (a.real() != 0.0)
   {
@@ -293,23 +287,14 @@ void ComplexWrapperOperator::AddMultTranspose(const ComplexVector &x, ComplexVec
   const Vector &xi = x.Imag();
   Vector &yr = y.Real();
   Vector &yi = y.Imag();
+  // MFEM_VERIFY(a.real() == 0.0 || a.imag() == 0.0,
+  //             "ComplexWrapperOperator::AddMultTranspose does not support a general "
+  //             "complex-valued coefficient!");
   if (a.real() != 0.0 && a.imag() != 0.0)
   {
     tx.SetSize(width);
     MultTranspose(x, tx);
-    const int N = width;
-    const double ar = a.real();
-    const double ai = a.imag();
-    const auto *TXR = tx.Real().Read();
-    const auto *TXI = tx.Imag().Read();
-    auto *YR = yr.ReadWrite();
-    auto *YI = yi.ReadWrite();
-    mfem::forall(N,
-                 [=] MFEM_HOST_DEVICE(int i)
-                 {
-                   YR[i] += ar * TXR[i] - ai * TXI[i];
-                   YI[i] += ai * TXR[i] + ar * TXI[i];
-                 });
+    y.AXPY(a, tx);
   }
   else if (a.real() != 0.0)
   {
@@ -373,23 +358,14 @@ void ComplexWrapperOperator::AddMultHermitianTranspose(const ComplexVector &x,
   const Vector &xi = x.Imag();
   Vector &yr = y.Real();
   Vector &yi = y.Imag();
+  // MFEM_VERIFY(a.real() == 0.0 || a.imag() == 0.0,
+  //             "ComplexWrapperOperator::AddMultHermitianTranspose does not support a "
+  //             "general complex-valued coefficient!");
   if (a.real() != 0.0 && a.imag() != 0.0)
   {
     tx.SetSize(width);
     MultHermitianTranspose(x, tx);
-    const int N = width;
-    const double ar = a.real();
-    const double ai = a.imag();
-    const auto *TXR = tx.Real().Read();
-    const auto *TXI = tx.Imag().Read();
-    auto *YR = yr.ReadWrite();
-    auto *YI = yi.ReadWrite();
-    mfem::forall(N,
-                 [=] MFEM_HOST_DEVICE(int i)
-                 {
-                   YR[i] += ar * TXR[i] - ai * TXI[i];
-                   YI[i] += ai * TXR[i] + ar * TXI[i];
-                 });
+    y.AXPY(a, tx);
   }
   else if (a.real() != 0.0)
   {
@@ -487,17 +463,21 @@ void SumOperator::MultTranspose(const Vector &x, Vector &y) const
 
 void SumOperator::AddMult(const Vector &x, Vector &y, const double a) const
 {
+  z.SetSize(y.Size());
   for (const auto &[op, c] : ops)
   {
-    op->AddMult(x, y, a * c);
+    op->Mult(x, z);
+    y.Add(a * c, z);
   }
 }
 
 void SumOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) const
 {
+  z.SetSize(y.Size());
   for (const auto &[op, c] : ops)
   {
-    op->AddMultTranspose(x, y, a * c);
+    op->MultTranspose(x, z);
+    y.Add(a * c, z);
   }
 }
 
diff --git a/palace/linalg/operator.hpp b/palace/linalg/operator.hpp
index 78dc92534..ab204c076 100644
--- a/palace/linalg/operator.hpp
+++ b/palace/linalg/operator.hpp
@@ -115,6 +115,7 @@ class SumOperator : public Operator
 {
 private:
   std::vector<std::pair<const Operator *, double>> ops;
+  mutable Vector z;
 
 public:
   SumOperator(int s) : Operator(s) {}
diff --git a/palace/linalg/rap.cpp b/palace/linalg/rap.cpp
index 1fe426080..457f22511 100644
--- a/palace/linalg/rap.cpp
+++ b/palace/linalg/rap.cpp
@@ -9,32 +9,25 @@ namespace palace
 {
 
 ParOperator::ParOperator(std::unique_ptr<Operator> &&dA, const Operator *pA,
-                         const mfem::ParFiniteElementSpace &trial_fespace,
-                         const mfem::ParFiniteElementSpace &test_fespace,
-                         bool test_restrict)
+                         const FiniteElementSpace &trial_fespace,
+                         const FiniteElementSpace &test_fespace, bool test_restrict)
   : Operator(test_fespace.GetTrueVSize(), trial_fespace.GetTrueVSize()),
     data_A(std::move(dA)), A((data_A != nullptr) ? data_A.get() : pA),
     trial_fespace(trial_fespace), test_fespace(test_fespace), use_R(test_restrict),
     dbc_tdof_list(nullptr), diag_policy(DiagonalPolicy::DIAG_ONE), RAP(nullptr)
 {
   MFEM_VERIFY(A, "Cannot construct ParOperator from an empty matrix!");
-  lx.SetSize(A->Width());
-  ly.SetSize(A->Height());
-  ty.SetSize(width);
 }
 
 ParOperator::ParOperator(std::unique_ptr<Operator> &&A,
-                         const mfem::ParFiniteElementSpace &trial_fespace,
-                         const mfem::ParFiniteElementSpace &test_fespace,
-                         bool test_restrict)
+                         const FiniteElementSpace &trial_fespace,
+                         const FiniteElementSpace &test_fespace, bool test_restrict)
   : ParOperator(std::move(A), nullptr, trial_fespace, test_fespace, test_restrict)
 {
 }
 
-ParOperator::ParOperator(const Operator &A,
-                         const mfem::ParFiniteElementSpace &trial_fespace,
-                         const mfem::ParFiniteElementSpace &test_fespace,
-                         bool test_restrict)
+ParOperator::ParOperator(const Operator &A, const FiniteElementSpace &trial_fespace,
+                         const FiniteElementSpace &test_fespace, bool test_restrict)
   : ParOperator(nullptr, &A, trial_fespace, test_fespace, test_restrict)
 {
 }
@@ -59,14 +52,18 @@ void ParOperator::EliminateRHS(const Vector &x, Vector &b) const
   }
 
   MFEM_VERIFY(A, "No local matrix available for ParOperator::EliminateRHS!");
-  ty = 0.0;
-  linalg::SetSubVector(ty, *dbc_tdof_list, x);
-  trial_fespace.GetProlongationMatrix()->Mult(ty, lx);
+  auto &tx = trial_fespace.GetTVector<Vector>();
+  auto &lx = trial_fespace.GetLVector<Vector>();
+  auto &ly = GetTestLVector();
+  tx = 0.0;
+  linalg::SetSubVector(tx, *dbc_tdof_list, x);
+  trial_fespace.GetProlongationMatrix()->Mult(tx, lx);
 
   // Apply the unconstrained operator.
   A->Mult(lx, ly);
+  ly *= -1.0;
 
-  RestrictionMatrixAddMult(ly, b, -1.0);
+  RestrictionMatrixAddMult(ly, b);
   if (diag_policy == DiagonalPolicy::DIAG_ONE)
   {
     linalg::SetSubVector(b, *dbc_tdof_list, x);
@@ -98,9 +95,9 @@ mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const
   if (&trial_fespace == &test_fespace)
   {
     mfem::HypreParMatrix *hA = new mfem::HypreParMatrix(
-        trial_fespace.GetComm(), trial_fespace.GlobalVSize(), trial_fespace.GetDofOffsets(),
-        const_cast<mfem::SparseMatrix *>(sA));
-    const mfem::HypreParMatrix *P = trial_fespace.Dof_TrueDof_Matrix();
+        trial_fespace.GetComm(), trial_fespace.GlobalVSize(),
+        trial_fespace.Get().GetDofOffsets(), const_cast<mfem::SparseMatrix *>(sA));
+    const mfem::HypreParMatrix *P = trial_fespace.Get().Dof_TrueDof_Matrix();
     RAP = std::make_unique<mfem::HypreParMatrix>(hypre_ParCSRMatrixRAP(*P, *hA, *P), true);
     delete hA;
   }
@@ -108,12 +105,12 @@ mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const
   {
     mfem::HypreParMatrix *hA = new mfem::HypreParMatrix(
         trial_fespace.GetComm(), test_fespace.GlobalVSize(), trial_fespace.GlobalVSize(),
-        test_fespace.GetDofOffsets(), trial_fespace.GetDofOffsets(),
+        test_fespace.Get().GetDofOffsets(), trial_fespace.Get().GetDofOffsets(),
         const_cast<mfem::SparseMatrix *>(sA));
-    const mfem::HypreParMatrix *P = trial_fespace.Dof_TrueDof_Matrix();
+    const mfem::HypreParMatrix *P = trial_fespace.Get().Dof_TrueDof_Matrix();
     if (!use_R)
     {
-      const mfem::HypreParMatrix *Rt = test_fespace.Dof_TrueDof_Matrix();
+      const mfem::HypreParMatrix *Rt = test_fespace.Get().Dof_TrueDof_Matrix();
       RAP =
           std::make_unique<mfem::HypreParMatrix>(hypre_ParCSRMatrixRAP(*Rt, *hA, *P), true);
     }
@@ -122,8 +119,8 @@ mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const
       mfem::SparseMatrix *sRt = mfem::Transpose(*test_fespace.GetRestrictionMatrix());
       mfem::HypreParMatrix *hRt = new mfem::HypreParMatrix(
           test_fespace.GetComm(), test_fespace.GlobalVSize(),
-          test_fespace.GlobalTrueVSize(), test_fespace.GetDofOffsets(),
-          test_fespace.GetTrueDofOffsets(), sRt);
+          test_fespace.GlobalTrueVSize(), test_fespace.Get().GetDofOffsets(),
+          test_fespace.Get().GetTrueDofOffsets(), sRt);
       RAP = std::make_unique<mfem::HypreParMatrix>(hypre_ParCSRMatrixRAP(*hRt, *hA, *P),
                                                    true);
       delete sRt;
@@ -153,24 +150,25 @@ void ParOperator::AssembleDiagonal(Vector &diag) const
   // entry-wise absolute values of the conforming prolongation operator.
   MFEM_VERIFY(&trial_fespace == &test_fespace,
               "Diagonal assembly is only available for square ParOperator!");
+  auto &lx = trial_fespace.GetLVector<Vector>();
   if (const auto *sA = dynamic_cast<const mfem::SparseMatrix *>(A))
   {
-    sA->GetDiag(ly);
+    sA->GetDiag(lx);
   }
   else
   {
-    A->AssembleDiagonal(ly);
+    A->AssembleDiagonal(lx);
   }
 
   // Parallel assemble and eliminate essential true dofs.
   const Operator *P = test_fespace.GetProlongationMatrix();
   if (const auto *hP = dynamic_cast<const mfem::HypreParMatrix *>(P))
   {
-    hP->AbsMultTranspose(1.0, ly, 0.0, diag);
+    hP->AbsMultTranspose(1.0, lx, 0.0, diag);
   }
   else
   {
-    P->MultTranspose(ly, diag);
+    P->MultTranspose(lx, diag);
   }
   if (dbc_tdof_list)
   {
@@ -195,11 +193,14 @@ void ParOperator::Mult(const Vector &x, Vector &y) const
     return;
   }
 
+  auto &lx = trial_fespace.GetLVector<Vector>();
+  auto &ly = GetTestLVector();
   if (dbc_tdof_list)
   {
-    ty = x;
-    linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
-    trial_fespace.GetProlongationMatrix()->Mult(ty, lx);
+    auto &tx = trial_fespace.GetTVector<Vector>();
+    tx = x;
+    linalg::SetSubVector(tx, *dbc_tdof_list, 0.0);
+    trial_fespace.GetProlongationMatrix()->Mult(tx, lx);
   }
   else
   {
@@ -223,84 +224,95 @@ void ParOperator::Mult(const Vector &x, Vector &y) const
   }
 }
 
-void ParOperator::AddMult(const Vector &x, Vector &y, const double a) const
+void ParOperator::MultTranspose(const Vector &x, Vector &y) const
 {
-  MFEM_ASSERT(x.Size() == width && y.Size() == height,
-              "Incompatible dimensions for ParOperator::AddMult!");
+  MFEM_ASSERT(x.Size() == height && y.Size() == width,
+              "Incompatible dimensions for ParOperator::MultTranspose!");
   if (RAP)
   {
-    RAP->AddMult(x, y, a);
+    RAP->MultTranspose(x, y);
     return;
   }
 
+  auto &lx = trial_fespace.GetLVector<Vector>();
+  auto &ly = GetTestLVector();
   if (dbc_tdof_list)
   {
+    auto &ty = test_fespace.GetTVector<Vector>();
     ty = x;
     linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
-    trial_fespace.GetProlongationMatrix()->Mult(ty, lx);
+    RestrictionMatrixMultTranspose(ty, ly);
   }
   else
   {
-    trial_fespace.GetProlongationMatrix()->Mult(x, lx);
+    RestrictionMatrixMultTranspose(x, ly);
   }
 
   // Apply the operator on the L-vector.
-  A->Mult(lx, ly);
+  A->MultTranspose(ly, lx);
 
+  trial_fespace.GetProlongationMatrix()->MultTranspose(lx, y);
   if (dbc_tdof_list)
   {
-    RestrictionMatrixMult(ly, ty);
     if (diag_policy == DiagonalPolicy::DIAG_ONE)
     {
-      linalg::SetSubVector(ty, *dbc_tdof_list, x);
+      linalg::SetSubVector(y, *dbc_tdof_list, x);
     }
     else if (diag_policy == DiagonalPolicy::DIAG_ZERO)
     {
-      linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
+      linalg::SetSubVector(y, *dbc_tdof_list, 0.0);
     }
-    y.Add(a, ty);
-  }
-  else
-  {
-    RestrictionMatrixAddMult(ly, y, a);
   }
 }
 
-void ParOperator::MultTranspose(const Vector &x, Vector &y) const
+void ParOperator::AddMult(const Vector &x, Vector &y, const double a) const
 {
-  MFEM_ASSERT(x.Size() == height && y.Size() == width,
-              "Incompatible dimensions for ParOperator::MultTranspose!");
+  MFEM_ASSERT(x.Size() == width && y.Size() == height,
+              "Incompatible dimensions for ParOperator::AddMult!");
   if (RAP)
   {
-    RAP->MultTranspose(x, y);
+    RAP->AddMult(x, y, a);
     return;
   }
 
+  auto &lx = trial_fespace.GetLVector<Vector>();
+  auto &ly = GetTestLVector();
   if (dbc_tdof_list)
   {
-    ty = x;
-    linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
-    RestrictionMatrixMultTranspose(ty, ly);
+    auto &tx = trial_fespace.GetTVector<Vector>();
+    tx = x;
+    linalg::SetSubVector(tx, *dbc_tdof_list, 0.0);
+    trial_fespace.GetProlongationMatrix()->Mult(tx, lx);
   }
   else
   {
-    RestrictionMatrixMultTranspose(x, ly);
+    trial_fespace.GetProlongationMatrix()->Mult(x, lx);
   }
 
   // Apply the operator on the L-vector.
-  A->MultTranspose(ly, lx);
+  A->Mult(lx, ly);
 
-  trial_fespace.GetProlongationMatrix()->MultTranspose(lx, y);
   if (dbc_tdof_list)
   {
+    auto &ty = test_fespace.GetTVector<Vector>();
+    RestrictionMatrixMult(ly, ty);
     if (diag_policy == DiagonalPolicy::DIAG_ONE)
     {
-      linalg::SetSubVector(y, *dbc_tdof_list, x);
+      linalg::SetSubVector(ty, *dbc_tdof_list, x);
     }
     else if (diag_policy == DiagonalPolicy::DIAG_ZERO)
     {
-      linalg::SetSubVector(y, *dbc_tdof_list, 0.0);
+      linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
+    }
+    y.Add(a, ty);
+  }
+  else
+  {
+    if (a != 1.0)
+    {
+      ly *= a;
     }
+    RestrictionMatrixAddMult(ly, y);
   }
 }
 
@@ -314,8 +326,11 @@ void ParOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) c
     return;
   }
 
+  auto &lx = trial_fespace.GetLVector<Vector>();
+  auto &ly = GetTestLVector();
   if (dbc_tdof_list)
   {
+    auto &ty = test_fespace.GetTVector<Vector>();
     ty = x;
     linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
     RestrictionMatrixMultTranspose(ty, ly);
@@ -330,20 +345,25 @@ void ParOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) c
 
   if (dbc_tdof_list)
   {
-    trial_fespace.GetProlongationMatrix()->MultTranspose(lx, ty);
+    auto &tx = trial_fespace.GetTVector<Vector>();
+    trial_fespace.GetProlongationMatrix()->MultTranspose(lx, tx);
     if (diag_policy == DiagonalPolicy::DIAG_ONE)
     {
-      linalg::SetSubVector(ty, *dbc_tdof_list, x);
+      linalg::SetSubVector(tx, *dbc_tdof_list, x);
     }
     else if (diag_policy == DiagonalPolicy::DIAG_ZERO)
     {
-      linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
+      linalg::SetSubVector(tx, *dbc_tdof_list, 0.0);
     }
-    y.Add(a, ty);
+    y.Add(a, tx);
   }
   else
   {
-    trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx, y, a);
+    if (a != 1.0)
+    {
+      lx *= a;
+    }
+    trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx, y);
   }
 }
 
@@ -359,16 +379,15 @@ void ParOperator::RestrictionMatrixMult(const Vector &ly, Vector &ty) const
   }
 }
 
-void ParOperator::RestrictionMatrixAddMult(const Vector &ly, Vector &ty,
-                                           const double a) const
+void ParOperator::RestrictionMatrixAddMult(const Vector &ly, Vector &ty) const
 {
   if (!use_R)
   {
-    test_fespace.GetProlongationMatrix()->AddMultTranspose(ly, ty, a);
+    test_fespace.GetProlongationMatrix()->AddMultTranspose(ly, ty);
   }
   else
   {
-    test_fespace.GetRestrictionMatrix()->AddMult(ly, ty, a);
+    test_fespace.GetRestrictionMatrix()->AddMult(ly, ty);
   }
 }
 
@@ -384,11 +403,17 @@ void ParOperator::RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) c
   }
 }
 
+Vector &ParOperator::GetTestLVector() const
+{
+  return (&trial_fespace == &test_fespace) ? trial_fespace.GetLVector2<Vector>()
+                                           : test_fespace.GetLVector<Vector>();
+}
+
 ComplexParOperator::ComplexParOperator(std::unique_ptr<Operator> &&dAr,
                                        std::unique_ptr<Operator> &&dAi, const Operator *pAr,
                                        const Operator *pAi,
-                                       const mfem::ParFiniteElementSpace &trial_fespace,
-                                       const mfem::ParFiniteElementSpace &test_fespace,
+                                       const FiniteElementSpace &trial_fespace,
+                                       const FiniteElementSpace &test_fespace,
                                        bool test_restrict)
   : ComplexOperator(test_fespace.GetTrueVSize(), trial_fespace.GetTrueVSize()),
     data_A((dAr != nullptr || dAi != nullptr)
@@ -407,15 +432,12 @@ ComplexParOperator::ComplexParOperator(std::unique_ptr<Operator> &&dAr,
   // We use the non-owning constructors for real and imaginary part ParOperators, since we
   // construct A as a ComplexWrapperOperator which has separate access to the real and
   // imaginary components.
-  lx.SetSize(A->Width());
-  ly.SetSize(A->Height());
-  ty.SetSize(width);
 }
 
 ComplexParOperator::ComplexParOperator(std::unique_ptr<Operator> &&Ar,
                                        std::unique_ptr<Operator> &&Ai,
-                                       const mfem::ParFiniteElementSpace &trial_fespace,
-                                       const mfem::ParFiniteElementSpace &test_fespace,
+                                       const FiniteElementSpace &trial_fespace,
+                                       const FiniteElementSpace &test_fespace,
                                        bool test_restrict)
   : ComplexParOperator(std::move(Ar), std::move(Ai), nullptr, nullptr, trial_fespace,
                        test_fespace, test_restrict)
@@ -423,8 +445,8 @@ ComplexParOperator::ComplexParOperator(std::unique_ptr<Operator> &&Ar,
 }
 
 ComplexParOperator::ComplexParOperator(const Operator *Ar, const Operator *Ai,
-                                       const mfem::ParFiniteElementSpace &trial_fespace,
-                                       const mfem::ParFiniteElementSpace &test_fespace,
+                                       const FiniteElementSpace &trial_fespace,
+                                       const FiniteElementSpace &test_fespace,
                                        bool test_restrict)
   : ComplexParOperator(nullptr, nullptr, Ar, Ai, trial_fespace, test_fespace, test_restrict)
 {
@@ -471,12 +493,15 @@ void ComplexParOperator::Mult(const ComplexVector &x, ComplexVector &y) const
 {
   MFEM_ASSERT(x.Size() == width && y.Size() == height,
               "Incompatible dimensions for ComplexParOperator::Mult!");
+  auto &lx = trial_fespace.GetLVector<ComplexVector>();
+  auto &ly = GetTestLVector();
   if (dbc_tdof_list)
   {
-    ty = x;
-    linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
-    trial_fespace.GetProlongationMatrix()->Mult(ty.Real(), lx.Real());
-    trial_fespace.GetProlongationMatrix()->Mult(ty.Imag(), lx.Imag());
+    auto &tx = trial_fespace.GetTVector<ComplexVector>();
+    tx = x;
+    linalg::SetSubVector(tx, *dbc_tdof_list, 0.0);
+    trial_fespace.GetProlongationMatrix()->Mult(tx.Real(), lx.Real());
+    trial_fespace.GetProlongationMatrix()->Mult(tx.Imag(), lx.Imag());
   }
   else
   {
@@ -501,53 +526,52 @@ void ComplexParOperator::Mult(const ComplexVector &x, ComplexVector &y) const
   }
 }
 
-void ComplexParOperator::AddMult(const ComplexVector &x, ComplexVector &y,
-                                 const std::complex<double> a) const
+void ComplexParOperator::MultTranspose(const ComplexVector &x, ComplexVector &y) const
 {
-  MFEM_ASSERT(x.Size() == width && y.Size() == height,
-              "Incompatible dimensions for ComplexParOperator::AddMult!");
+  MFEM_ASSERT(x.Size() == height && y.Size() == width,
+              "Incompatible dimensions for ComplexParOperator::MultTranspose!");
+  auto &lx = trial_fespace.GetLVector<ComplexVector>();
+  auto &ly = GetTestLVector();
   if (dbc_tdof_list)
   {
+    auto &ty = test_fespace.GetTVector<ComplexVector>();
     ty = x;
     linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
-    trial_fespace.GetProlongationMatrix()->Mult(ty.Real(), lx.Real());
-    trial_fespace.GetProlongationMatrix()->Mult(ty.Imag(), lx.Imag());
+    RestrictionMatrixMultTranspose(ty, ly);
   }
   else
   {
-    trial_fespace.GetProlongationMatrix()->Mult(x.Real(), lx.Real());
-    trial_fespace.GetProlongationMatrix()->Mult(x.Imag(), lx.Imag());
+    RestrictionMatrixMultTranspose(x, ly);
   }
 
   // Apply the operator on the L-vector.
-  ly = 0.0;
-  A->AddMult(lx, ly, a);
+  A->MultTranspose(ly, lx);
 
+  trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), y.Real());
+  trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), y.Imag());
   if (dbc_tdof_list)
   {
-    RestrictionMatrixMult(ly, ty);
     if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE)
     {
-      linalg::SetSubVector(ty, *dbc_tdof_list, x);
+      linalg::SetSubVector(y, *dbc_tdof_list, x);
     }
     else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO)
     {
-      linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
+      linalg::SetSubVector(y, *dbc_tdof_list, 0.0);
     }
-    y += ty;
-  }
-  else
-  {
-    RestrictionMatrixAddMult(ly, y, 1.0);
   }
 }
 
-void ComplexParOperator::MultTranspose(const ComplexVector &x, ComplexVector &y) const
+void ComplexParOperator::MultHermitianTranspose(const ComplexVector &x,
+                                                ComplexVector &y) const
 {
   MFEM_ASSERT(x.Size() == height && y.Size() == width,
-              "Incompatible dimensions for ComplexParOperator::MultTranspose!");
+              "Incompatible dimensions for ComplexParOperator::MultHermitianTranspose!");
+  auto &lx = trial_fespace.GetLVector<ComplexVector>();
+  auto &ly = GetTestLVector();
   if (dbc_tdof_list)
   {
+    auto &ty = test_fespace.GetTVector<ComplexVector>();
     ty = x;
     linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
     RestrictionMatrixMultTranspose(ty, ly);
@@ -558,7 +582,7 @@ void ComplexParOperator::MultTranspose(const ComplexVector &x, ComplexVector &y)
   }
 
   // Apply the operator on the L-vector.
-  A->MultTranspose(ly, lx);
+  A->MultHermitianTranspose(ly, lx);
 
   trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), y.Real());
   trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), y.Imag());
@@ -575,30 +599,34 @@ void ComplexParOperator::MultTranspose(const ComplexVector &x, ComplexVector &y)
   }
 }
 
-void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector &y,
-                                          const std::complex<double> a) const
+void ComplexParOperator::AddMult(const ComplexVector &x, ComplexVector &y,
+                                 const std::complex<double> a) const
 {
-  MFEM_ASSERT(x.Size() == height && y.Size() == width,
-              "Incompatible dimensions for ComplexParOperator::AddMultTranspose!");
+  MFEM_ASSERT(x.Size() == width && y.Size() == height,
+              "Incompatible dimensions for ComplexParOperator::AddMult!");
+  auto &lx = trial_fespace.GetLVector<ComplexVector>();
+  auto &ly = GetTestLVector();
   if (dbc_tdof_list)
   {
-    ty = x;
-    linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
-    RestrictionMatrixMultTranspose(ty, ly);
+    auto &tx = trial_fespace.GetTVector<ComplexVector>();
+    tx = x;
+    linalg::SetSubVector(tx, *dbc_tdof_list, 0.0);
+    trial_fespace.GetProlongationMatrix()->Mult(tx.Real(), lx.Real());
+    trial_fespace.GetProlongationMatrix()->Mult(tx.Imag(), lx.Imag());
   }
   else
   {
-    RestrictionMatrixMultTranspose(x, ly);
+    trial_fespace.GetProlongationMatrix()->Mult(x.Real(), lx.Real());
+    trial_fespace.GetProlongationMatrix()->Mult(x.Imag(), lx.Imag());
   }
 
   // Apply the operator on the L-vector.
-  lx = 0.0;
-  A->AddMultTranspose(ly, lx, a);
+  A->Mult(lx, ly);
 
   if (dbc_tdof_list)
   {
-    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), ty.Real());
-    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), ty.Imag());
+    auto &ty = test_fespace.GetTVector<ComplexVector>();
+    RestrictionMatrixMult(ly, ty);
     if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE)
     {
       linalg::SetSubVector(ty, *dbc_tdof_list, x);
@@ -607,22 +635,28 @@ void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector
     {
       linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
     }
-    y += ty;
+    y.AXPY(a, ty);
   }
   else
   {
-    trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Real(), y.Real());
-    trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Imag(), y.Imag());
+    if (a != 1.0)
+    {
+      ly *= a;
+    }
+    RestrictionMatrixAddMult(ly, y);
   }
 }
 
-void ComplexParOperator::MultHermitianTranspose(const ComplexVector &x,
-                                                ComplexVector &y) const
+void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector &y,
+                                          const std::complex<double> a) const
 {
   MFEM_ASSERT(x.Size() == height && y.Size() == width,
-              "Incompatible dimensions for ComplexParOperator::MultHermitianTranspose!");
+              "Incompatible dimensions for ComplexParOperator::AddMultTranspose!");
+  auto &lx = trial_fespace.GetLVector<ComplexVector>();
+  auto &ly = GetTestLVector();
   if (dbc_tdof_list)
   {
+    auto &ty = test_fespace.GetTVector<ComplexVector>();
     ty = x;
     linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
     RestrictionMatrixMultTranspose(ty, ly);
@@ -633,20 +667,31 @@ void ComplexParOperator::MultHermitianTranspose(const ComplexVector &x,
   }
 
   // Apply the operator on the L-vector.
-  A->MultHermitianTranspose(ly, lx);
+  A->MultTranspose(ly, lx);
 
-  trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), y.Real());
-  trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), y.Imag());
   if (dbc_tdof_list)
   {
+    auto &tx = trial_fespace.GetTVector<ComplexVector>();
+    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real());
+    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag());
     if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE)
     {
-      linalg::SetSubVector(y, *dbc_tdof_list, x);
+      linalg::SetSubVector(tx, *dbc_tdof_list, x);
     }
     else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO)
     {
-      linalg::SetSubVector(y, *dbc_tdof_list, 0.0);
+      linalg::SetSubVector(tx, *dbc_tdof_list, 0.0);
     }
+    y.AXPY(a, tx);
+  }
+  else
+  {
+    if (a != 1.0)
+    {
+      lx *= a;
+    }
+    trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Real(), y.Real());
+    trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Imag(), y.Imag());
   }
 }
 
@@ -655,8 +700,11 @@ void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, Compl
 {
   MFEM_ASSERT(x.Size() == height && y.Size() == width,
               "Incompatible dimensions for ComplexParOperator::AddMultHermitianTranspose!");
+  auto &lx = trial_fespace.GetLVector<ComplexVector>();
+  auto &ly = GetTestLVector();
   if (dbc_tdof_list)
   {
+    auto &ty = test_fespace.GetTVector<ComplexVector>();
     ty = x;
     linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
     RestrictionMatrixMultTranspose(ty, ly);
@@ -667,25 +715,29 @@ void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, Compl
   }
 
   // Apply the operator on the L-vector.
-  lx = 0.0;
-  A->AddMultHermitianTranspose(ly, lx, a);
+  A->MultHermitianTranspose(ly, lx);
 
   if (dbc_tdof_list)
   {
-    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), ty.Real());
-    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), ty.Imag());
+    auto &tx = trial_fespace.GetTVector<ComplexVector>();
+    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real());
+    trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag());
     if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE)
     {
-      linalg::SetSubVector(ty, *dbc_tdof_list, x);
+      linalg::SetSubVector(tx, *dbc_tdof_list, x);
     }
     else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO)
     {
-      linalg::SetSubVector(ty, *dbc_tdof_list, 0.0);
+      linalg::SetSubVector(tx, *dbc_tdof_list, 0.0);
     }
-    y += ty;
+    y.AXPY(a, tx);
   }
   else
   {
+    if (a != 1.0)
+    {
+      lx *= a;
+    }
     trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Real(), y.Real());
     trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Imag(), y.Imag());
   }
@@ -707,17 +759,17 @@ void ComplexParOperator::RestrictionMatrixMult(const ComplexVector &ly,
 }
 
 void ComplexParOperator::RestrictionMatrixAddMult(const ComplexVector &ly,
-                                                  ComplexVector &ty, const double a) const
+                                                  ComplexVector &ty) const
 {
   if (!use_R)
   {
-    test_fespace.GetProlongationMatrix()->AddMultTranspose(ly.Real(), ty.Real(), a);
-    test_fespace.GetProlongationMatrix()->AddMultTranspose(ly.Imag(), ty.Imag(), a);
+    test_fespace.GetProlongationMatrix()->AddMultTranspose(ly.Real(), ty.Real());
+    test_fespace.GetProlongationMatrix()->AddMultTranspose(ly.Imag(), ty.Imag());
   }
   else
   {
-    test_fespace.GetRestrictionMatrix()->AddMult(ly.Real(), ty.Real(), a);
-    test_fespace.GetRestrictionMatrix()->AddMult(ly.Imag(), ty.Imag(), a);
+    test_fespace.GetRestrictionMatrix()->AddMult(ly.Real(), ty.Real());
+    test_fespace.GetRestrictionMatrix()->AddMult(ly.Imag(), ty.Imag());
   }
 }
 
@@ -736,4 +788,10 @@ void ComplexParOperator::RestrictionMatrixMultTranspose(const ComplexVector &ty,
   }
 }
 
+ComplexVector &ComplexParOperator::GetTestLVector() const
+{
+  return (&trial_fespace == &test_fespace) ? trial_fespace.GetLVector2<ComplexVector>()
+                                           : test_fespace.GetLVector<ComplexVector>();
+}
+
 }  // namespace palace
diff --git a/palace/linalg/rap.hpp b/palace/linalg/rap.hpp
index bbfa00edb..2ee99d3e0 100644
--- a/palace/linalg/rap.hpp
+++ b/palace/linalg/rap.hpp
@@ -6,13 +6,10 @@
 
 #include <memory>
 #include <mfem.hpp>
+#include "fem/fespace.hpp"
 #include "linalg/operator.hpp"
 #include "linalg/vector.hpp"
 
-// XX TODO: Many ParOperator and ComplexParOperator objects could share the same local
-//          temporary vectors used in parallel matrix-vector products (lx, ly, ty) for
-//          improved memory usage.
-
 namespace palace
 {
 
@@ -31,7 +28,7 @@ class ParOperator : public Operator
   const Operator *A;
 
   // Finite element spaces for parallel prolongation and restriction.
-  const mfem::ParFiniteElementSpace &trial_fespace, &test_fespace;
+  const FiniteElementSpace &trial_fespace, &test_fespace;
   const bool use_R;
 
   // Lists of constrained essential boundary true dofs for elimination.
@@ -44,32 +41,29 @@ class ParOperator : public Operator
   // deleted.
   mutable std::unique_ptr<mfem::HypreParMatrix> RAP;
 
-  // Temporary storage for operator application.
-  mutable Vector lx, ly, ty;
-
   // Helper methods for operator application.
   void RestrictionMatrixMult(const Vector &ly, Vector &ty) const;
-  void RestrictionMatrixAddMult(const Vector &ly, Vector &ty, const double a) const;
+  void RestrictionMatrixAddMult(const Vector &ly, Vector &ty) const;
   void RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) const;
+  Vector &GetTestLVector() const;
 
   ParOperator(std::unique_ptr<Operator> &&dA, const Operator *pA,
-              const mfem::ParFiniteElementSpace &trial_fespace,
-              const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict);
+              const FiniteElementSpace &trial_fespace,
+              const FiniteElementSpace &test_fespace, bool test_restrict);
 
 public:
   // Construct the parallel operator, inheriting ownership of the local operator.
-  ParOperator(std::unique_ptr<Operator> &&A,
-              const mfem::ParFiniteElementSpace &trial_fespace,
-              const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict);
-  ParOperator(std::unique_ptr<Operator> &&A, const mfem::ParFiniteElementSpace &fespace)
+  ParOperator(std::unique_ptr<Operator> &&A, const FiniteElementSpace &trial_fespace,
+              const FiniteElementSpace &test_fespace, bool test_restrict);
+  ParOperator(std::unique_ptr<Operator> &&A, const FiniteElementSpace &fespace)
     : ParOperator(std::move(A), fespace, fespace, false)
   {
   }
 
   // Non-owning constructors.
-  ParOperator(const Operator &A, const mfem::ParFiniteElementSpace &trial_fespace,
-              const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict);
-  ParOperator(const Operator &A, const mfem::ParFiniteElementSpace &fespace)
+  ParOperator(const Operator &A, const FiniteElementSpace &trial_fespace,
+              const FiniteElementSpace &test_fespace, bool test_restrict);
+  ParOperator(const Operator &A, const FiniteElementSpace &fespace)
     : ParOperator(A, fespace, fespace, false)
   {
   }
@@ -122,11 +116,11 @@ class ComplexParOperator : public ComplexOperator
   const ComplexWrapperOperator *A;
 
   // Finite element spaces for parallel prolongation and restriction.
-  const mfem::ParFiniteElementSpace &trial_fespace, &test_fespace;
+  const FiniteElementSpace &trial_fespace, &test_fespace;
   const bool use_R;
 
   // Lists of constrained essential boundary true dofs for elimination.
-  mutable const mfem::Array<int> *dbc_tdof_list;
+  const mfem::Array<int> *dbc_tdof_list;
 
   // Diagonal policy for constrained true dofs.
   Operator::DiagonalPolicy diag_policy;
@@ -134,38 +128,35 @@ class ComplexParOperator : public ComplexOperator
   // Real and imaginary parts of the operator as non-owning ParOperator objects.
   std::unique_ptr<ParOperator> RAPr, RAPi;
 
-  // Temporary storage for operator application.
-  mutable ComplexVector lx, ly, ty;
-
   // Helper methods for operator application.
   void RestrictionMatrixMult(const ComplexVector &ly, ComplexVector &ty) const;
-  void RestrictionMatrixAddMult(const ComplexVector &ly, ComplexVector &ty,
-                                const double a) const;
+  void RestrictionMatrixAddMult(const ComplexVector &ly, ComplexVector &ty) const;
   void RestrictionMatrixMultTranspose(const ComplexVector &ty, ComplexVector &ly) const;
+  ComplexVector &GetTestLVector() const;
 
   ComplexParOperator(std::unique_ptr<Operator> &&dAr, std::unique_ptr<Operator> &&dAi,
                      const Operator *pAr, const Operator *pAi,
-                     const mfem::ParFiniteElementSpace &trial_fespace,
-                     const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict);
+                     const FiniteElementSpace &trial_fespace,
+                     const FiniteElementSpace &test_fespace, bool test_restrict);
 
 public:
   // Construct the complex-valued parallel operator from the separate real and imaginary
   // parts, inheriting ownership of the local operator.
   ComplexParOperator(std::unique_ptr<Operator> &&Ar, std::unique_ptr<Operator> &&Ai,
-                     const mfem::ParFiniteElementSpace &trial_fespace,
-                     const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict);
+                     const FiniteElementSpace &trial_fespace,
+                     const FiniteElementSpace &test_fespace, bool test_restrict);
   ComplexParOperator(std::unique_ptr<Operator> &&Ar, std::unique_ptr<Operator> &&Ai,
-                     const mfem::ParFiniteElementSpace &fespace)
+                     const FiniteElementSpace &fespace)
     : ComplexParOperator(std::move(Ar), std::move(Ai), fespace, fespace, false)
   {
   }
 
   // Non-owning constructors.
   ComplexParOperator(const Operator *Ar, const Operator *Ai,
-                     const mfem::ParFiniteElementSpace &trial_fespace,
-                     const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict);
+                     const FiniteElementSpace &trial_fespace,
+                     const FiniteElementSpace &test_fespace, bool test_restrict);
   ComplexParOperator(const Operator *Ar, const Operator *Ai,
-                     const mfem::ParFiniteElementSpace &fespace)
+                     const FiniteElementSpace &fespace)
     : ComplexParOperator(Ar, Ai, fespace, fespace, false)
   {
   }
diff --git a/palace/linalg/solver.hpp b/palace/linalg/solver.hpp
index 33f3c2ed5..1a47a64bc 100644
--- a/palace/linalg/solver.hpp
+++ b/palace/linalg/solver.hpp
@@ -47,6 +47,21 @@ class Solver : public OperType
   {
     MFEM_ABORT("MultTranspose() is not implemented for base class Solver<OperType>!");
   }
+
+  // Apply the solver with a preallocated temporary storage vector.
+  virtual void Mult2(const VecType &x, VecType &y, VecType &r) const
+  {
+    MFEM_ABORT("Mult2() with temporary storage vector is not implemented for base class "
+               "Solver<OperType>!");
+  }
+
+  // Apply the solver for the transpose problem with a preallocated temporary storage
+  //  vector.
+  virtual void MultTranspose2(const VecType &x, VecType &y, VecType &r) const
+  {
+    MFEM_ABORT("MultTranspose2() with temporary storage vector is not implemented for base "
+               "class Solver<OperType>!");
+  }
 };
 
 // This solver wraps a real-valued mfem::Solver for application to complex-valued problems
diff --git a/palace/linalg/vector.cpp b/palace/linalg/vector.cpp
index 84642553a..b0e118a44 100644
--- a/palace/linalg/vector.cpp
+++ b/palace/linalg/vector.cpp
@@ -31,6 +31,10 @@ ComplexVector::ComplexVector(const std::complex<double> *py, int n) : ComplexVec
 
 void ComplexVector::SetSize(int n)
 {
+  if (x.Size() == 2 * n)
+  {
+    return;
+  }
   x.SetSize(2 * n);
   xr.MakeRef(x, 0, n);
   xi.MakeRef(x, n, n);

From 1161062613b8ea6d6a63ae0b135e97fb3790cb02 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Mon, 29 Jan 2024 08:38:56 -0800
Subject: [PATCH 5/5] Address PR feedback

---
 palace/linalg/operator.cpp |  9 ---------
 palace/linalg/solver.cpp   | 40 ++++++++++++++------------------------
 2 files changed, 15 insertions(+), 34 deletions(-)

diff --git a/palace/linalg/operator.cpp b/palace/linalg/operator.cpp
index 8723ade41..461f4b2f6 100644
--- a/palace/linalg/operator.cpp
+++ b/palace/linalg/operator.cpp
@@ -217,9 +217,6 @@ void ComplexWrapperOperator::AddMult(const ComplexVector &x, ComplexVector &y,
   const Vector &xi = x.Imag();
   Vector &yr = y.Real();
   Vector &yi = y.Imag();
-  // MFEM_VERIFY(a.real() == 0.0 || a.imag() == 0.0,
-  //             "ComplexWrapperOperator::AddMult does not support a general complex-valued
-  //             " "coefficient!");
   if (a.real() != 0.0 && a.imag() != 0.0)
   {
     ty.SetSize(height);
@@ -287,9 +284,6 @@ void ComplexWrapperOperator::AddMultTranspose(const ComplexVector &x, ComplexVec
   const Vector &xi = x.Imag();
   Vector &yr = y.Real();
   Vector &yi = y.Imag();
-  // MFEM_VERIFY(a.real() == 0.0 || a.imag() == 0.0,
-  //             "ComplexWrapperOperator::AddMultTranspose does not support a general "
-  //             "complex-valued coefficient!");
   if (a.real() != 0.0 && a.imag() != 0.0)
   {
     tx.SetSize(width);
@@ -358,9 +352,6 @@ void ComplexWrapperOperator::AddMultHermitianTranspose(const ComplexVector &x,
   const Vector &xi = x.Imag();
   Vector &yr = y.Real();
   Vector &yi = y.Imag();
-  // MFEM_VERIFY(a.real() == 0.0 || a.imag() == 0.0,
-  //             "ComplexWrapperOperator::AddMultHermitianTranspose does not support a "
-  //             "general complex-valued coefficient!");
   if (a.real() != 0.0 && a.imag() != 0.0)
   {
     tx.SetSize(width);
diff --git a/palace/linalg/solver.cpp b/palace/linalg/solver.cpp
index 29584eaee..7ea33e07b 100644
--- a/palace/linalg/solver.cpp
+++ b/palace/linalg/solver.cpp
@@ -21,11 +21,8 @@ void MfemWrapperSolver<Operator>::SetOperator(const Operator &op)
     const auto *PtAP = dynamic_cast<const ParOperator *>(&op);
     MFEM_VERIFY(PtAP,
                 "MfemWrapperSolver must be able to construct a HypreParMatrix operator!");
-    pc->SetOperator(PtAP->ParallelAssemble());
-    if (!save_assembled)
-    {
-      PtAP->StealParallelAssemble();
-    }
+    pc->SetOperator(!save_assembled ? *PtAP->StealParallelAssemble()
+                                    : PtAP->ParallelAssemble());
   }
   this->height = op.Height();
   this->width = op.Width();
@@ -34,31 +31,24 @@ void MfemWrapperSolver<Operator>::SetOperator(const Operator &op)
 template <>
 void MfemWrapperSolver<ComplexOperator>::SetOperator(const ComplexOperator &op)
 {
-  // XX TODO: Test complex matrix assembly if coarse solve supports it
   // Assemble the real and imaginary parts, then add.
-  const mfem::HypreParMatrix *hAr = nullptr, *hAi = nullptr;
+  // XX TODO: Test complex matrix assembly if coarse solve supports it
+  const mfem::HypreParMatrix *hAr = dynamic_cast<const mfem::HypreParMatrix *>(op.Real());
+  const mfem::HypreParMatrix *hAi = dynamic_cast<const mfem::HypreParMatrix *>(op.Imag());
   const ParOperator *PtAPr = nullptr, *PtAPi = nullptr;
-  if (op.Real())
+  if (op.Real() && !hAr)
   {
-    hAr = dynamic_cast<const mfem::HypreParMatrix *>(op.Real());
-    if (!hAr)
-    {
-      PtAPr = dynamic_cast<const ParOperator *>(op.Real());
-      MFEM_VERIFY(PtAPr,
-                  "MfemWrapperSolver must be able to construct a HypreParMatrix operator!");
-      hAr = &PtAPr->ParallelAssemble();
-    }
+    PtAPr = dynamic_cast<const ParOperator *>(op.Real());
+    MFEM_VERIFY(PtAPr,
+                "MfemWrapperSolver must be able to construct a HypreParMatrix operator!");
+    hAr = &PtAPr->ParallelAssemble();
   }
-  if (op.Imag())
+  if (op.Imag() && !hAi)
   {
-    hAi = dynamic_cast<const mfem::HypreParMatrix *>(op.Imag());
-    if (!hAi)
-    {
-      PtAPi = dynamic_cast<const ParOperator *>(op.Imag());
-      MFEM_VERIFY(PtAPi,
-                  "MfemWrapperSolver must be able to construct a HypreParMatrix operator!");
-      hAi = &PtAPi->ParallelAssemble();
-    }
+    PtAPi = dynamic_cast<const ParOperator *>(op.Imag());
+    MFEM_VERIFY(PtAPi,
+                "MfemWrapperSolver must be able to construct a HypreParMatrix operator!");
+    hAi = &PtAPi->ParallelAssemble();
   }
   if (hAr && hAi)
   {