From 7ec5379aaa24fbc9e337cd36a6542769abf2b5d5 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Tue, 4 Oct 2022 14:27:32 -0700 Subject: [PATCH 1/8] Runge-Kutta support for AMR This adds RK2, RK3 and RK4 in a new namespace RungeKutta. Together with the enhanced FillPatcher class, these functions can be used for RK time stepping in AMR simulations. A new function AmrLevel::RK is added for AmrLevel based codes. See CNS::advance in Tests/GPU/CNS/CNS_advance.cpp for an example of using the new AmrLevel::RK function. The main motivation for this PR is that ghost cell filling for high order (> 2) RK methods at coarse/fine boundary is non-trivial when there is subcycling. --- Src/Amr/AMReX_AmrLevel.H | 113 ++++++- Src/Amr/AMReX_AmrLevel.cpp | 27 ++ Src/AmrCore/AMReX_FillPatcher.H | 275 ++++++++++++++++- Src/Base/AMReX_RungeKutta.H | 287 ++++++++++++++++++ Src/Base/CMakeLists.txt | 1 + Src/Base/Make.package | 2 +- Tests/GPU/CNS/Source/CNS.H | 2 + Tests/GPU/CNS/Source/CNS.cpp | 5 + Tests/GPU/CNS/Source/CNS_advance.cpp | 33 +- .../CNS/Source/diffusion/CNS_diffusion_K.H | 20 +- 10 files changed, 707 insertions(+), 58 deletions(-) create mode 100644 Src/Base/AMReX_RungeKutta.H diff --git a/Src/Amr/AMReX_AmrLevel.H b/Src/Amr/AMReX_AmrLevel.H index cca2e9776cd..5034df1b5e5 100644 --- a/Src/Amr/AMReX_AmrLevel.H +++ b/Src/Amr/AMReX_AmrLevel.H @@ -15,6 +15,7 @@ #include #include #include +#include #include #ifdef AMREX_USE_EB #include @@ -153,11 +154,10 @@ public: int ncycle) = 0; /** - * \brief Contains operations to be done after a timestep. This is a - * pure virtual function and hence MUST be implemented by derived - * classes. + * \brief Contains operations to be done after a timestep. If this + * function is overridden, don't forget to reset FillPatcher. */ - virtual void post_timestep (int iteration) = 0; + virtual void post_timestep (int iteration); /** * \brief Contains operations to be done only after a full coarse * timestep. The default implementation does nothing. @@ -397,8 +397,33 @@ public: Real time, int index, int scomp, - int ncomp, - int dcomp=0); + int ncomp, + int dcomp=0); + + /** + * \brief Evolve one step with Runge-Kutta (2, 3, or 4) + * + * To use RK, the StateData must have all the ghost cells needed. See + * namespace RungeKutta for expected function signatures of the callable + * parameters. + * + * \param order order of RK + * \param state_type index of StateData + * \param time time at the beginning of the step. + * \param dt time step + * \param iteration iteration number on fine level during a coarse time + * step. For an AMR simulation with subcycling and a + * refinement ratio of 2, the number is either 1 or 2, + * denoting the first and second substep, respectively. + * \param ncycle number of subcyling steps. It's usually 2 or 4. + * Without subcycling, this will be 1. + * \param f computing right-hand side for evolving the StateData. + * One can also register data for flux registers in this. 
+     * \param p optionally post-processing RK stage results
+     */
+    template <typename F, typename P = RungeKutta::PostStageNoOp>
+    void RK (int order, int state_type, Real time, Real dt, int iteration,
+             int ncycle, F&& f, P&& p = RungeKutta::PostStageNoOp());

 #ifdef AMREX_USE_EB
     static void SetEBMaxGrowCells (int nbasic, int nvolume, int nfull) noexcept {
@@ -457,6 +482,14 @@ protected:

 private:

+    template <std::size_t order>
+    void storeRKCoarseData (int state_type, Real time, Real dt,
+                            MultiFab const& S_old,
+                            Array<MultiFab,order> const& rkk);
+
+    void FillRKPatch (int state_index, MultiFab& S, Real time,
+                      int stage, int iteration, int ncycle);
+
     mutable BoxArray edge_grids[AMREX_SPACEDIM];  // face-centered grids
     mutable BoxArray nodal_grids;                 // all nodal grids
 };
@@ -577,6 +610,74 @@ private:
     std::map< int,Vector< Vector< Vector<FillBoxId> > > > m_fbid; // [grid][level][fillablesubbox][oldnew]
 };

+template <typename F, typename P>
+void AmrLevel::RK (int order, int state_type, Real time, Real dt, int iteration,
+                   int ncycle, F&& f, P&& p)
+{
+    BL_PROFILE("AmrLevel::RK()");
+
+    AMREX_ASSERT(AmrLevel::desc_lst[state_type].nExtra() > 0); // Need ghost cells in StateData
+
+    MultiFab& S_old = get_old_data(state_type);
+    MultiFab& S_new = get_new_data(state_type);
+    const Real t_old = state[state_type].prevTime();
+    const Real t_new = state[state_type].curTime();
+    AMREX_ALWAYS_ASSERT(amrex::almostEqual(time,t_old) && amrex::almostEqual(time+dt,t_new));
+
+    if (order == 2) {
+        RungeKutta::RK2(S_old, S_new, time, dt, std::forward<F>(f),
+                        [&] (int /*stage*/, MultiFab& mf, Real t) {
+                            FillPatcherFill(mf, 0, mf.nComp(), mf.nGrow(), t,
+                                            state_type, 0); },
+                        std::forward<P>(p));
+    } else if (order == 3) {
+        RungeKutta::RK3(S_old, S_new, time, dt, std::forward<F>(f),
+                        [&] (int stage, MultiFab& mf, Real t) {
+                            FillRKPatch(state_type, mf, t, stage, iteration, ncycle);
+                        },
+                        [&] (Array<MultiFab,3> const& rkk) {
+                            if (level < parent->finestLevel()) {
+                                storeRKCoarseData(state_type, time, dt, S_old, rkk);
+                            }
+                        },
+                        std::forward<P>(p));
+    } else if (order == 4) {
+        RungeKutta::RK4(S_old, S_new, time, dt, std::forward<F>(f),
+                        [&] (int stage, MultiFab& mf, Real t) {
+                            FillRKPatch(state_type, mf, t, stage, iteration, ncycle);
+                        },
+                        [&] (Array<MultiFab,4> const& rkk) {
+                            if (level < parent->finestLevel()) {
+                                storeRKCoarseData(state_type, time, dt, S_old, rkk);
+                            }
+                        },
+                        std::forward<P>(p));
+    } else {
+        amrex::Abort("AmrLevel::RK: order = "+std::to_string(order)+" is not supported");
+    }
+}
+
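+// A minimal sketch of how AmrLevel::RK is meant to be driven from a derived
+// class's advance() (Derived and compute_dSdt are hypothetical names; see
+// Tests/GPU/CNS/Source/CNS_advance.cpp in this PR for the real example):
+//
+//     Real Derived::advance (Real time, Real dt, int iteration, int ncycle)
+//     {
+//         for (int i = 0; i < num_state_data_types; ++i) {
+//             state[i].swapTimeLevels(dt);
+//         }
+//         RK(rk_order, State_Type, time, dt, iteration, ncycle,
+//            [&] (int /*stage*/, MultiFab& dSdt, MultiFab const& S, Real dtsub) {
+//                compute_dSdt(S, dSdt, dtsub);
+//            });
+//         return dt;
+//     }
+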
+template <std::size_t order>
+void AmrLevel::storeRKCoarseData (int state_type, Real time, Real dt,
+                                  MultiFab const& S_old,
+                                  Array<MultiFab,order> const& rkk)
+{
+    if (level == parent->finestLevel()) { return; }
+
+    const StateDescriptor& desc = AmrLevel::desc_lst[state_type];
+
+    auto& fillpatcher = parent->getLevel(level+1).m_fillpatcher[state_type];
+    fillpatcher = std::make_unique<FillPatcher<MultiFab>>
+        (parent->boxArray(level+1), parent->DistributionMap(level+1),
+         parent->Geom(level+1),
+         parent->boxArray(level), parent->DistributionMap(level),
+         parent->Geom(level),
+         IntVect(desc.nExtra()), desc.nComp(), desc.interp(0));
+
+    fillpatcher->storeRKCoarseData(time, dt, S_old, rkk);
+}
+
+
 }
 #endif /*_AmrLevel_H_*/
diff --git a/Src/Amr/AMReX_AmrLevel.cpp b/Src/Amr/AMReX_AmrLevel.cpp
index fbeba917255..c10a1e6277b 100644
--- a/Src/Amr/AMReX_AmrLevel.cpp
+++ b/Src/Amr/AMReX_AmrLevel.cpp
@@ -31,6 +31,14 @@ EBSupport AmrLevel::m_eb_support_level = EBSupport::volume;
 DescriptorList AmrLevel::desc_lst;
 DeriveList     AmrLevel::derive_lst;

+void
+AmrLevel::post_timestep (int /*iteration*/)
+{
+    if (level < parent->finestLevel()) {
+        parent->getLevel(level+1).resetFillPatcher();
+    }
+}
+
 void
 AmrLevel::postCoarseTimeStep (Real time)
 {
@@ -2223,4 +2231,23 @@ AmrLevel::CreateLevelDirectory (const std::string &dir)
     levelDirectoryCreated = true;
 }

+void
+AmrLevel::FillRKPatch (int state_index, MultiFab& S, Real time,
+                       int stage, int iteration, int ncycle)
+{
+    StateDataPhysBCFunct physbcf(state[state_index], 0, geom);
+
+    if (level == 0) {
+        S.FillBoundary(geom.periodicity());
+        physbcf(S, 0, S.nComp(), S.nGrowVect(), time, 0);
+    } else {
+        auto& crse_level = parent->getLevel(level-1);
+        StateDataPhysBCFunct physbcf_crse(crse_level.state[state_index], 0,
+                                          crse_level.geom);
+        auto& fillpatcher = m_fillpatcher[state_index];
+        fillpatcher->fillRK(stage, iteration, ncycle, S, time, physbcf_crse,
+                            physbcf, AmrLevel::desc_lst[state_index].getBCs());
+    }
+}
+
 }
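The pieces above cooperate across AMR levels roughly as follows (an
illustrative sketch of one coarse step with subcycling, ncycle = 2; this
summary is not part of the patch itself):

    crse.RK(order, ...);          // RK3/RK4 hand the stage data rkk to
                                  // storeRKCoarseData, which creates the fine
                                  // level's FillPatcher and stores S_old and
                                  // the stages
    fine.RK(order, ..., /*iteration=*/1, /*ncycle=*/2, ...);
    fine.RK(order, ..., /*iteration=*/2, /*ncycle=*/2, ...);
                                  // each fine stage calls FillRKPatch, which
                                  // uses FillPatcher::fillRK to interpolate
                                  // the stored coarse data in time and space
                                  // at the coarse/fine boundary
    crse.post_timestep(...);      // the default implementation resets the
                                  // fine level's FillPatcher
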
diff --git a/Src/AmrCore/AMReX_FillPatcher.H b/Src/AmrCore/AMReX_FillPatcher.H
index 41ed75318c6..4fe28e5ec8a 100644
--- a/Src/AmrCore/AMReX_FillPatcher.H
+++ b/Src/AmrCore/AMReX_FillPatcher.H
@@ -55,6 +55,17 @@ namespace amrex {
  * allowed to fill only some of the components with the fill function.
  *
  * (5) This only works for cell-centered and nodal data.
+ *
+ * This class also provides support for RungeKutta::RK3 and RungeKutta::RK4.
+ * The storeRKCoarseData function can be used to store coarse AMR level
+ * data that are needed for filling fine level data's ghost cells in this
+ * class.  The `fillRK` function can be used to fill ghost cells for fine
+ * AMR levels.  This operation at the coarse/fine boundary is non-trivial
+ * for RK orders higher than 2.  Note that it is expected that time stepping
+ * on the coarse level is performed before any fine level time stepping, and
+ * it's the user's responsibility to properly create and destroy this object.
+ * See AmrLevel::RK for an example of using the RungeKutta functions and
+ * FillPatcher together.
  */

 template <class MF = MultiFab>
@@ -153,6 +164,37 @@ public:
                                  PreInterpHook const& pre_interp = {},
                                  PostInterpHook const& post_interp = {});

+    /**
+     * \brief Store coarse AMR level data for RK3 and RK4
+     *
+     * \tparam order RK order.  Must be 3 or 4.
+     * \param time   time at the beginning of the step
+     * \param dt     time step
+     * \param S_old  data at time
+     * \param RK_k   right-hand side at RK stages
+     */
+    template <std::size_t order>
+    void storeRKCoarseData (Real time, Real dt, MF const& S_old,
+                            Array<MF,order> const& RK_k);
+
+    /**
+     * \brief Fill ghost cells of fine AMR level for RK3 and RK4
+     *
+     * \param stage     RK stage number starting from 1
+     * \param iteration iteration number on fine level during a coarse time
+     *                  step.  For an AMR simulation with subcycling and a
+     *                  refinement ratio of 2, the number is either 1 or 2,
+     *                  denoting the first and second substep, respectively.
+     * \param ncycle    number of subcycling steps.  It's usually 2 or 4.
+     *                  Without subcycling, this will be 1.
+     * \param cbc       filling physical boundary on coarse level
+     * \param fbc       filling physical boundary on fine level
+     * \param bcs       physical BC types
+     */
+    template <typename BC>
+    void fillRK (int stage, int iteration, int ncycle, MF& mf, Real time,
+                 BC& cbc, BC& fbc, Vector<BCRec> const& bcs);
+
 private:

     BoxArray m_fba;
@@ -165,8 +207,14 @@ private:
     int m_ncomp;
     InterpBase* m_interp;
     EB2::IndexSpace const* m_eb_index_space = nullptr;
+    FabArrayBase m_sfine;
+    IntVect m_ratio;
     Vector<std::pair<Real,std::unique_ptr<MF>>> m_cf_crse_data;
+    std::unique_ptr<MF> m_cf_crse_data_tmp;
     std::unique_ptr<MF> m_cf_fine_data;
+    Real m_dt_coarse = std::numeric_limits<Real>::lowest();
+
+    FabArrayBase::FPinfo const& getFPinfo ();
 };

 template <class MF>
@@ -185,11 +233,17 @@ FillPatcher<MF>::FillPatcher (BoxArray const& fba, DistributionMapping const& fd
     m_nghost(nghost),
     m_ncomp(ncomp),
     m_interp(interp),
-    m_eb_index_space(eb_index_space)
+    m_eb_index_space(eb_index_space),
+    m_sfine(fba, fdm, 1, nghost)
 {
     static_assert(IsFabArray<MF>::value,
                   "FillPatcher: MF must be FabArray type");
     AMREX_ALWAYS_ASSERT(m_fba.ixType().cellCentered() ||
                         m_fba.ixType().nodeCentered());
+
+    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+        m_ratio[idim] = m_fgeom.Domain().length(idim) / m_cgeom.Domain().length(idim);
+    }
+    AMREX_ASSERT(m_fgeom.Domain() == amrex::refine(m_cgeom.Domain(),m_ratio));
 }

 template <class MF>
@@ -217,6 +271,15 @@ FillPatcher<MF>::fill (MF& mf, IntVect const& nghost, Real time,
                          m_fgeom, fbc, fbccomp);
 }

+template <class MF>
+FabArrayBase::FPinfo const&
+FillPatcher<MF>::getFPinfo ()
+{
+    const InterpolaterBoxCoarsener& coarsener = m_interp->BoxCoarsener(m_ratio);
+    return FabArrayBase::TheFPinfo(m_sfine, m_sfine, m_nghost, coarsener,
+                                   m_fgeom, m_cgeom, m_eb_index_space);
+}
+
 template <class MF>
 template <typename BC, typename PreInterpHook, typename PostInterpHook>
 void
@@ -239,19 +302,7 @@ FillPatcher<MF>::fillCoarseFineBoundary (MF& mf, IntVect const& nghost, Real tim
                        m_ncomp >= ncomp &&
                        m_ncomp == cmf[0]->nComp());

-    IntVect ratio;
-    for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
-        ratio[idim] = m_fgeom.Domain().length(idim) / m_cgeom.Domain().length(idim);
-    }
-    AMREX_ASSERT(m_fgeom.Domain() == amrex::refine(m_cgeom.Domain(),ratio));
-
-    const InterpolaterBoxCoarsener& coarsener = m_interp->BoxCoarsener(ratio);
-    const FabArrayBase::FPinfo& fpc = FabArrayBase::TheFPinfo(mf, mf,
-                                                              m_nghost,
-                                                              coarsener,
-                                                              m_fgeom,
-                                                              m_cgeom,
-                                                              m_eb_index_space);
+    auto const& fpc = getFPinfo();

     if ( ! fpc.ba_crse_patch.empty()) {
@@ -294,7 +345,11 @@ FillPatcher<MF>::fillCoarseFineBoundary (MF& mf, IntVect const& nghost, Real tim
     }
     else if (m_cf_crse_data.size() == 2)
     {
-        mf_crse_patch = make_mf_crse_patch<MF>(fpc, ncomp);
+        if (m_cf_crse_data_tmp == nullptr) {
+            m_cf_crse_data_tmp = std::make_unique<MF>
+                (make_mf_crse_patch<MF>(fpc, m_ncomp));
+        }
+        mf_crse_patch = MF(*m_cf_crse_data_tmp, amrex::make_alias, scomp, ncomp);
         int const ng_space_interp = 8; // Need to be big enough
         Box domain = m_cgeom.growPeriodicDomain(ng_space_interp);
         domain.convert(mf.ixType());
@@ -330,7 +385,7 @@ FillPatcher<MF>::fillCoarseFineBoundary (MF& mf, IntVect const& nghost, Real tim
                         ncomp, IntVect(0), m_cgeom, m_fgeom,
                         amrex::grow(amrex::convert(m_fgeom.Domain(), mf.ixType()),nghost),
-                        ratio, m_interp, bcs, bcscomp);
+                        m_ratio, m_interp, bcs, bcscomp);

         post_interp(*m_cf_fine_data, scomp, ncomp);

@@ -338,6 +393,194 @@ FillPatcher<MF>::fillCoarseFineBoundary (MF& mf, IntVect const& nghost, Real tim
     }
 }

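+// The time interpolation performed by fillRK below can be read as follows
+// (a sketch inferred from the coefficients; the patch itself does not spell
+// out the derivation).  Let xsi = (iteration-1)/ncycle be the fraction of
+// the coarse step already completed and r = 1/ncycle the fine substep
+// fraction.  The b coefficients reconstruct the coarse solution at time xsi
+// from the stored stages k1..k4, and the c, d (and e) coefficients are the
+// successive d/dxsi derivatives of the b's.  Each fine stage then combines
+// U, Ut, Utt (and Uttt) with stage-dependent powers of r so that the ghost
+// cells match what the fine-level RK stages expect at their intermediate
+// times.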
+template <class MF>
+template <std::size_t order>
+void FillPatcher<MF>::storeRKCoarseData (Real /*time*/, Real dt, MF const& S_old,
+                                         Array<MF,order> const& RK_k)
+{
+    m_dt_coarse = dt;
+    m_cf_crse_data.resize(order+1);
+
+    auto const& fpc = getFPinfo();
+
+    for (auto& tmf : m_cf_crse_data) {
+        tmf.first = std::numeric_limits<Real>::lowest(); // because we don't need it
+        tmf.second = std::make_unique<MF>(make_mf_crse_patch<MF>(fpc, m_ncomp));
+    }
+    m_cf_crse_data[0].second->ParallelCopy(S_old, m_cgeom.periodicity());
+    for (std::size_t i = 0; i < order; ++i) {
+        m_cf_crse_data[i+1].second->ParallelCopy(RK_k[i], m_cgeom.periodicity());
+    }
+}
+
+template <class MF>
+template <typename BC>
+void FillPatcher<MF>::fillRK (int stage, int iteration, int ncycle,
+                              MF& mf, Real time, BC& cbc, BC& fbc,
+                              Vector<BCRec> const& bcs)
+{
+    int rk_order = m_cf_crse_data.size()-1;
+    if (rk_order != 3 && rk_order != 4) {
+        amrex::Abort("FillPatcher: unsupported RK order "+std::to_string(rk_order));
+        return;
+    }
+    AMREX_ASSERT(stage > 0 && stage <= rk_order);
+
+    auto const& fpc = getFPinfo();
+    if (m_cf_crse_data_tmp == nullptr) {
+        m_cf_crse_data_tmp = std::make_unique<MF>
+            (make_mf_crse_patch<MF>(fpc, m_ncomp));
+    }
+
+    auto const& u = m_cf_crse_data_tmp->arrays();
+    auto const& u0 = m_cf_crse_data[0].second->const_arrays();
+    auto const& k1 = m_cf_crse_data[1].second->const_arrays();
+    auto const& k2 = m_cf_crse_data[2].second->const_arrays();
+    auto const& k3 = m_cf_crse_data[3].second->const_arrays();
+
+    Real dtc = m_dt_coarse;
+    Real r = Real(1) / Real(ncycle);
+    Real xsi = Real(iteration-1) / Real(ncycle);
+
+    if (rk_order == 3) {
+        // coefficients for U
+        Real b1 = xsi - Real(5./6.)*xsi*xsi;
+        Real b2 = Real(1./6.)*xsi*xsi;
+        Real b3 = Real(2./3)*xsi*xsi;
+        // coefficients for Ut
+        Real c1 = Real(1.)
- Real(5./3.)*xsi; + Real c2 = Real(1./3.)*xsi; + Real c3 = Real(4./3.)*xsi; + // coefficients for Utt + constexpr Real d1 = Real(-5./3.); + constexpr Real d2 = Real(1./3.); + constexpr Real d3 = Real(4./3.); + if (stage == 1) { + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc*uu; + }); + } else if (stage == 2) { + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3; + Real ut = c1*kk1 + c2*kk2 + c3*kk3; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc*(uu + r*ut); + }); + } else if (stage == 3) { + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3; + Real ut = c1*kk1 + c2*kk2 + c3*kk3; + Real utt = d1*kk1 + d2*kk2 + d3*kk3; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc* + (uu + Real(0.5)*r*ut + Real(0.25)*r*r*utt); + }); + } + } else if (rk_order == 4) { + auto const& k4 = m_cf_crse_data[4].second->const_arrays(); + Real xsi2 = xsi*xsi; + Real xsi3 = xsi2*xsi; + // coefficients for U + Real b1 = xsi - Real(1.5)*xsi2 + Real(2./3.)*xsi3; + Real b2 = xsi2 - Real(2./3.)*xsi3; + Real b3 = b2; + Real b4 = Real(-0.5)*xsi2 + Real(2./3.)*xsi3; + // coefficients for Ut + Real c1 = Real(1.) - Real(3.)*xsi + Real(2.)*xsi2; + Real c2 = Real(2.)*xsi - Real(2.)*xsi2; + Real c3 = c2; + Real c4 = -xsi + Real(2.)*xsi2; + // coefficients for Utt + Real d1 = Real(-3.) + Real(4.)*xsi; + Real d2 = Real( 2.) - Real(4.)*xsi; + Real d3 = d2; + Real d4 = Real(-1.) + Real(4.)*xsi; + // coefficients for Uttt + constexpr Real e1 = Real( 4.); + constexpr Real e2 = Real(-4.); + constexpr Real e3 = Real(-4.); + constexpr Real e4 = Real( 4.); + if (stage == 1) { + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real kk4 = k4[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3 + b4*kk4; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc*uu; + }); + } else if (stage == 2) { + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real kk4 = k4[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3 + b4*kk4; + Real ut = c1*kk1 + c2*kk2 + c3*kk3 + c4*kk4; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc*(uu + Real(0.5)*r*ut); + }); + } else if (stage == 3 || stage == 4) { + Real r2 = r*r; + Real r3 = r2*r; + Real at = (stage == 3) ? Real(0.5)*r : r; + Real att = (stage == 3) ? Real(0.25)*r2 : Real(0.5)*r2; + Real attt = (stage == 3) ? Real(0.0625)*r3 : Real(0.125)*r3; + Real akk = (stage == 3) ? Real(-4.) 
: Real(4.); + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real kk4 = k4[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3 + b4*kk4; + Real ut = c1*kk1 + c2*kk2 + c3*kk3 + c4*kk4; + Real utt = d1*kk1 + d2*kk2 + d3*kk3 + d4*kk4; + Real uttt = e1*kk1 + e2*kk2 + e3*kk3 + e4*kk4; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc * + (uu + at*ut + att*utt + attt*(uttt+akk*(kk3-kk2))); + }); + } + } + Gpu::streamSynchronize(); + + cbc(*m_cf_crse_data_tmp, 0, m_ncomp, m_nghost, time, 0); + + if (m_cf_fine_data == nullptr) { + m_cf_fine_data = std::make_unique(make_mf_fine_patch(fpc, m_ncomp)); + } + + FillPatchInterp(*m_cf_fine_data, 0, *m_cf_crse_data_tmp, 0, + m_ncomp, IntVect(0), m_cgeom, m_fgeom, + amrex::grow(amrex::convert(m_fgeom.Domain(), + mf.ixType()),m_nghost), + m_ratio, m_interp, bcs, 0); + + // xxxxx We can optimize away this ParallelCopy by making a special fpinfo. + mf.ParallelCopy(*m_cf_fine_data, 0, 0, m_ncomp, IntVect(0), m_nghost); + + mf.FillBoundary(m_fgeom.periodicity()); + fbc(mf, 0, m_ncomp, m_nghost, time, 0); +} + } #endif diff --git a/Src/Base/AMReX_RungeKutta.H b/Src/Base/AMReX_RungeKutta.H new file mode 100644 index 00000000000..5c8cff7a676 --- /dev/null +++ b/Src/Base/AMReX_RungeKutta.H @@ -0,0 +1,287 @@ +#ifndef AMREX_RUNGE_KUTTA_H_ +#define AMREX_RUNGE_KUTTA_H_ +#include + +#include + +namespace amrex { + +/** + * \brief Functions for Runge-Kutta methods + * + * This namespace RungeKutta has functions for a number RK methods, RK2, RK3 + * and RK4. The function templates take the old data in FabArray/MultiFab + * as input, and evolve the system for one time step. The result is stored + * in another FabArray/MultiFab. These two FabArrays must have ghost cells + * if they are needed for evaluating the right-hand side. The functions + * take three callable objects for computing the right-hand side, filling + * ghost cells, and optionally post-processing RK stage results. For RK3 + * and RK4, they also need a callable object for storing the data needed for + * filling coarse/fine boundaries in AMR simulations. + * + * The callable object for righ-hand side has the signature of `void(int + * stage, MF& dudt, MF const& u, Real dt)`, where `stage` is the RK stage + * number starting from 1, `dudt` is the output, `u` is the input, and `dt` + * is the sub-time step, which can be used for reflux operations in AMR + * simulations. + * + * The callable object for filling ghost cells has the signature of + * `void(int stage, MF& u, Real t)`, where `stage` is the RK stage number + * starting from 1, `u` is a FabArray/MultiFab whose ghost cells need to be + * filled, and `t` is the first-order approximate time of the data at that + * stage. The FillPatcher class can be useful for implementing such a + * callable. See AmrLevel::RK for an example. + * + * The callable object for post-processing stage results is optional. It's + * no-op by default. Its function signature is `void(int stage, MF& u)`, + * where `stage` is the RK stage number and `u` is the result of that stage. + * + * For RK3 and RK4, one must also provide a callable object with the + * signature of `void(Array const& rkk)`, where `order` is the RK + * order and `rkk` contains the righ-hand side at all the RK stages. The + * FillPatcher class can be useful for implementing such a callable. See + * AmrLevel::RK for an example. 
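+ *
+ * For example, on a single level RK3 may be driven as follows (an
+ * illustrative sketch; my_compute_rhs and my_fill_ghost_cells are
+ * hypothetical user functions, not part of AMReX):
+ *
+ * \code
+ *     RungeKutta::RK3(S_old, S_new, time, dt,
+ *         [&] (int stage, MultiFab& dudt, MultiFab const& u, Real dtsub) {
+ *             my_compute_rhs(stage, dudt, u, dtsub); },   // right-hand side
+ *         [&] (int stage, MultiFab& u, Real t) {
+ *             my_fill_ghost_cells(stage, u, t); },        // fill ghost cells
+ *         [&] (Array<MultiFab,3> const& rkk) {            // store for AMR;
+ *             amrex::ignore_unused(rkk); });              // no-op here
+ * \endcode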
+ */
+namespace RungeKutta {
+
+struct PostStageNoOp {
+    template <typename MF>
+    std::enable_if_t<IsFabArray<MF>::value> operator() (int, MF&) const {}
+};
+
+namespace detail {
+//! Unew = Uold + dUdt * dt
+template <typename MF>
+void rk_update (MF& Unew, MF const& Uold, MF const& dUdt, Real dt)
+{
+    auto const& snew = Unew.arrays();
+    auto const& sold = Uold.const_arrays();
+    auto const& sdot = dUdt.const_arrays();
+    amrex::ParallelFor(Unew, IntVect(0), Unew.nComp(), [=] AMREX_GPU_DEVICE
+                       (int bi, int i, int j, int k, int n) noexcept
+    {
+        snew[bi](i,j,k,n) = sold[bi](i,j,k,n) + dt*sdot[bi](i,j,k,n);
+    });
+    Gpu::streamSynchronize();
+}
+
+//! Unew = Uold + (dUdt1 + dUdt2) * dt
+template <typename MF>
+void rk_update (MF& Unew, MF const& Uold, MF const& dUdt1, MF const& dUdt2, Real dt)
+{
+    auto const& snew = Unew.arrays();
+    auto const& sold = Uold.const_arrays();
+    auto const& sdot1 = dUdt1.const_arrays();
+    auto const& sdot2 = dUdt2.const_arrays();
+    amrex::ParallelFor(Unew, IntVect(0), Unew.nComp(), [=] AMREX_GPU_DEVICE
+                       (int bi, int i, int j, int k, int n) noexcept
+    {
+        snew[bi](i,j,k,n) = sold[bi](i,j,k,n) + dt*(sdot1[bi](i,j,k,n) +
+                                                    sdot2[bi](i,j,k,n));
+    });
+    Gpu::streamSynchronize();
+}
+
+//! Unew = (Uold+Unew)/2 + dUdt * dt/2
+template <typename MF>
+void rk2_update_2 (MF& Unew, MF const& Uold, MF const& dUdt, Real dt)
+{
+    auto const& snew = Unew.arrays();
+    auto const& sold = Uold.const_arrays();
+    auto const& sdot = dUdt.const_arrays();
+    amrex::ParallelFor(Unew, IntVect(0), Unew.nComp(), [=] AMREX_GPU_DEVICE
+                       (int bi, int i, int j, int k, int n) noexcept
+    {
+        snew[bi](i,j,k,n) = Real(0.5)*(snew[bi](i,j,k,n) +
+                                       sold[bi](i,j,k,n) +
+                                       sdot[bi](i,j,k,n) * dt);
+    });
+    Gpu::streamSynchronize();
+}
+
+//! Unew = Uold + (k1 + k2 + 4*k3) * dt6, where dt6 = dt/6
+template <typename MF>
+void rk3_update_3 (MF& Unew, MF const& Uold, Array<MF,3> const& rkk, Real dt6)
+{
+    auto const& snew = Unew.arrays();
+    auto const& sold = Uold.const_arrays();
+    auto const& k1 = rkk[0].const_arrays();
+    auto const& k2 = rkk[1].const_arrays();
+    auto const& k3 = rkk[2].const_arrays();
+    amrex::ParallelFor(Unew, IntVect(0), Unew.nComp(), [=] AMREX_GPU_DEVICE
+                       (int bi, int i, int j, int k, int n) noexcept
+    {
+        snew[bi](i,j,k,n) = sold[bi](i,j,k,n)
+            + dt6 * (k1[bi](i,j,k,n) + k2[bi](i,j,k,n) +
+                     Real(4.) * k3[bi](i,j,k,n));
+    });
+    Gpu::streamSynchronize();
+}
+
+//! Unew = Uold + (k1+k4+2*(k2+k3))*dt6, where dt6 = dt/6
+template <typename MF>
+void rk4_update_4 (MF& Unew, MF const& Uold, Array<MF,4> const& rkk, Real dt6)
+{
+    auto const& snew = Unew.arrays();
+    auto const& sold = Uold.const_arrays();
+    auto const& k1 = rkk[0].const_arrays();
+    auto const& k2 = rkk[1].const_arrays();
+    auto const& k3 = rkk[2].const_arrays();
+    auto const& k4 = rkk[3].const_arrays();
+    amrex::ParallelFor(Unew, IntVect(0), Unew.nComp(), [=] AMREX_GPU_DEVICE
+                       (int bi, int i, int j, int k, int n) noexcept
+    {
+        snew[bi](i,j,k,n) = sold[bi](i,j,k,n)
+            + dt6 * ( k1[bi](i,j,k,n) + k4[bi](i,j,k,n) +
+                      Real(2.)*(k2[bi](i,j,k,n) + k3[bi](i,j,k,n)));
+    });
+    Gpu::streamSynchronize();
+}
+}
+
+/**
+ * \brief Time stepping with RK2
+ *
+ * \param Uold       input FabArray/MultiFab data at time
+ * \param Unew       output FabArray/MultiFab data at time+dt
+ * \param time       time at the beginning of the step
+ * \param dt         time step
+ * \param frhs       computing the right-hand side
+ * \param fillbndry  filling ghost cells
+ * \param post_stage post-processing stage results
+ */
+template <typename MF, typename F, typename FB, typename P = PostStageNoOp>
+void RK2 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry,
+          P&& post_stage = PostStageNoOp())
+{
+    BL_PROFILE("RungeKutta2");
+
+    MF dUdt(Unew.boxArray(), Unew.DistributionMap(), Unew.nComp(), 0,
+            MFInfo(), Unew.Factory());
+
+    // RK2 stage 1
+    fillbndry(1, Uold, time);
+    frhs(1, dUdt, Uold, Real(0.5)*dt);
+    // Unew = Uold + dt * dUdt
+    detail::rk_update(Unew, Uold, dUdt, dt);
+    post_stage(1, Unew);
+
+    // RK2 stage 2
+    fillbndry(2, Unew, time+dt);
+    frhs(2, dUdt, Unew, Real(0.5)*dt);
+    // Unew = (Uold+Unew)/2 + dUdt_2 * dt/2,
+    // which is Unew = Uold + dt/2 * (dUdt_1 + dUdt_2)
+    detail::rk2_update_2(Unew, Uold, dUdt, dt);
+    post_stage(2, Unew);
+}
+
+/**
+ * \brief Time stepping with RK3
+ *
+ * \param Uold       input FabArray/MultiFab data at time
+ * \param Unew       output FabArray/MultiFab data at time+dt
+ * \param time       time at the beginning of the step
+ * \param dt         time step
+ * \param frhs       computing the right-hand side
+ * \param fillbndry  filling ghost cells
+ * \param store_crse_data storing right-hand side data for AMR
+ * \param post_stage post-processing stage results
+ */
+template <typename MF, typename F, typename FB, typename R, typename P = PostStageNoOp>
+void RK3 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry,
+          R&& store_crse_data, P&& post_stage = PostStageNoOp())
+{
+    BL_PROFILE("RungeKutta3");
+
+    Array<MF,3> rkk;
+    for (auto& mf : rkk) {
+        mf.define(Unew.boxArray(), Unew.DistributionMap(), Unew.nComp(), 0,
+                  MFInfo(), Unew.Factory());
+    }
+
+    // RK3 stage 1
+    fillbndry(1, Uold, time);
+    frhs(1, rkk[0], Uold, dt/Real(6.));
+    // Unew = Uold + k1 * dt
+    detail::rk_update(Unew, Uold, rkk[0], dt);
+    post_stage(1, Unew);
+
+    // RK3 stage 2
+    fillbndry(2, Unew, time+dt);
+    frhs(2, rkk[1], Unew, dt/Real(6.));
+    // Unew = Uold + (k1+k2) * dt/4
+    detail::rk_update(Unew, Uold, rkk[0], rkk[1], Real(0.25)*dt);
+    post_stage(2, Unew);
+
+    // RK3 stage 3
+    Real t_half = time + Real(0.5)*dt;
+    fillbndry(3, Unew, t_half);
+    frhs(3, rkk[2], Unew, dt*Real(2./3.));
+    // Unew = Uold + (k1/6 + k2/6 + k3*(2/3)) * dt
+    detail::rk3_update_3(Unew, Uold, rkk, Real(1./6.)*dt);
+    post_stage(3, Unew);
+
+    store_crse_data(rkk);
+}
+
+/**
+ * \brief Time stepping with RK4
+ *
+ * \param Uold       input FabArray/MultiFab data at time
+ * \param Unew       output FabArray/MultiFab data at time+dt
+ * \param time       time at the beginning of the step
+ * \param dt         time step
+ * \param frhs       computing the right-hand side
+ * \param fillbndry  filling ghost cells
+ * \param store_crse_data storing right-hand side data for AMR
+ * \param post_stage post-processing stage results
+ */
+template <typename MF, typename F, typename FB, typename R, typename P = PostStageNoOp>
+void RK4 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry,
+          R&& store_crse_data, P&& post_stage = PostStageNoOp())
+{
+    BL_PROFILE("RungeKutta4");
+
+    Array<MF,4> rkk;
+    for (auto& mf : rkk) {
+        mf.define(Unew.boxArray(), Unew.DistributionMap(), Unew.nComp(), 0,
+                  MFInfo(), Unew.Factory());
+    }
+
+    // RK4 stage 1
+    fillbndry(1, Uold, time);
+    frhs(1, rkk[0], Uold, dt/Real(6.));
+    // Unew = Uold + k1 * dt/2
+    detail::rk_update(Unew, Uold, rkk[0], Real(0.5)*dt);
+    post_stage(1, Unew);
+
+    // RK4 stage 2
+    Real t_half = time + Real(0.5)*dt;
+    fillbndry(2, Unew, t_half);
+    frhs(2, rkk[1], Unew, dt/Real(3.));
+    // Unew = Uold + k2 * dt/2
+    detail::rk_update(Unew, Uold, rkk[1], Real(0.5)*dt);
+    post_stage(2, Unew);
+
+    // RK4 Step 3
+    fillbndry(3, Unew, t_half);
+    frhs(3, rkk[2], Unew, dt/Real(3.));
+    // Unew = Uold + k3 * dt;
+    detail::rk_update(Unew, Uold, rkk[2], dt);
+    post_stage(3, Unew);
+
+    // Step 4 of RK4
+    fillbndry(4, Unew, time+dt);
+    frhs(4, rkk[3], Unew, dt/Real(6.));
+    // Unew = Uold + (k1/6 + k2/3 + k3/3 + k4/6) * dt
+    detail::rk4_update_4(Unew, Uold, rkk, Real(1./6.)*dt);
+    post_stage(4, Unew);
+
+    store_crse_data(rkk);
+}
+
+}}
+
+#endif
diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt
index c47fdcae706..f09897ff6f7 100644
--- a/Src/Base/CMakeLists.txt
+++ b/Src/Base/CMakeLists.txt
@@ -188,6 +188,7 @@ target_sources( amrex
    AMReX_IntegratorBase.H
    AMReX_RKIntegrator.H
    AMReX_TimeIntegrator.H
+   AMReX_RungeKutta.H
    # GPU --------------------------------------------------------------------
    AMReX_Gpu.H
    AMReX_GpuQualifiers.H
diff --git a/Src/Base/Make.package b/Src/Base/Make.package
index 79085ae70a1..cd15687dce1 100644
--- a/Src/Base/Make.package
+++ b/Src/Base/Make.package
@@ -203,7 +203,7 @@ C$(AMREX_BASE)_headers += AMReX_FEIntegrator.H
 C$(AMREX_BASE)_headers += AMReX_IntegratorBase.H
 C$(AMREX_BASE)_headers += AMReX_RKIntegrator.H
 C$(AMREX_BASE)_headers += AMReX_TimeIntegrator.H
-
+C$(AMREX_BASE)_headers += AMReX_RungeKutta.H

 #
 # Fortran interface routines.
diff --git a/Tests/GPU/CNS/Source/CNS.H b/Tests/GPU/CNS/Source/CNS.H
index 877f0b523da..eedb7d486ba 100644
--- a/Tests/GPU/CNS/Source/CNS.H
+++ b/Tests/GPU/CNS/Source/CNS.H
@@ -157,6 +157,8 @@ protected:

     static int do_reflux;

+    static int rk_order;
+
     static bool do_visc;
     static bool use_const_visc;
diff --git a/Tests/GPU/CNS/Source/CNS.cpp b/Tests/GPU/CNS/Source/CNS.cpp
index c3b5e2fb600..1a073c68c8a 100644
--- a/Tests/GPU/CNS/Source/CNS.cpp
+++ b/Tests/GPU/CNS/Source/CNS.cpp
@@ -19,6 +19,7 @@ int       CNS::verbose = 0;
 IntVect   CNS::hydro_tile_size {AMREX_D_DECL(1024,16,16)};
 Real      CNS::cfl = 0.3;
 int       CNS::do_reflux = 1;
+int       CNS::rk_order = 2;

 int       CNS::refine_max_dengrad_lev = -1;
 Real      CNS::refine_dengrad = 1.0e10;
@@ -241,6 +242,9 @@ CNS::post_timestep (int /*iteration*/)

     if (level < parent->finestLevel()) {
         avgDown();
+
+        // fillpatcher on level+1 needs to be reset because data on this
+        // level have changed.
+ getLevel(level+1).resetFillPatcher(); } } @@ -354,6 +358,7 @@ CNS::read_params () } pp.query("do_reflux", do_reflux); + pp.query("rk_order", rk_order); pp.query("do_visc", do_visc); diff --git a/Tests/GPU/CNS/Source/CNS_advance.cpp b/Tests/GPU/CNS/Source/CNS_advance.cpp index c086cac0e9f..3bcb4c388bf 100644 --- a/Tests/GPU/CNS/Source/CNS_advance.cpp +++ b/Tests/GPU/CNS/Source/CNS_advance.cpp @@ -7,7 +7,7 @@ using namespace amrex; Real -CNS::advance (Real time, Real dt, int /*iteration*/, int /*ncycle*/) +CNS::advance (Real time, Real dt, int iteration, int ncycle) { BL_PROFILE("CNS::advance()"); @@ -16,11 +16,6 @@ CNS::advance (Real time, Real dt, int /*iteration*/, int /*ncycle*/) state[i].swapTimeLevels(dt); } - MultiFab& S_new = get_new_data(State_Type); - MultiFab& S_old = get_old_data(State_Type); - MultiFab dSdt(grids,dmap,NUM_STATE,0,MFInfo(),Factory()); - MultiFab Sborder(grids,dmap,NUM_STATE,NUM_GROW,MFInfo(),Factory()); - FluxRegister* fr_as_crse = nullptr; if (do_reflux && level < parent->finestLevel()) { CNS& fine_level = getLevel(level+1); @@ -36,23 +31,13 @@ CNS::advance (Real time, Real dt, int /*iteration*/, int /*ncycle*/) fr_as_crse->setVal(Real(0.0)); } - // RK2 stage 1 - FillPatch(*this, Sborder, NUM_GROW, time, State_Type, 0, NUM_STATE); - compute_dSdt(Sborder, dSdt, Real(0.5)*dt, fr_as_crse, fr_as_fine); - // U^* = U^n + dt*dUdt^n - MultiFab::LinComb(S_new, Real(1.0), Sborder, 0, dt, dSdt, 0, 0, NUM_STATE, 0); - computeTemp(S_new,0); - - // RK2 stage 2 - // After fillpatch Sborder = U^n+dt*dUdt^n - FillPatch(*this, Sborder, NUM_GROW, time+dt, State_Type, 0, NUM_STATE); - compute_dSdt(Sborder, dSdt, Real(0.5)*dt, fr_as_crse, fr_as_fine); - // S_new = 0.5*(Sborder+S_old) = U^n + 0.5*dt*dUdt^n - MultiFab::LinComb(S_new, Real(0.5), Sborder, 0, Real(0.5), S_old, 0, 0, NUM_STATE, 0); - // S_new += 0.5*dt*dSdt - MultiFab::Saxpy(S_new, Real(0.5)*dt, dSdt, 0, 0, NUM_STATE, 0); - // We now have S_new = U^{n+1} = (U^n+0.5*dt*dUdt^n) + 0.5*dt*dUdt^* - computeTemp(S_new,0); + RK(rk_order, State_Type, time, dt, iteration, ncycle, + // Given state S, compute dSdt. dtsub is needed for flux register operations + [&] (int /*stage*/, MultiFab& dSdt, MultiFab const& S, Real dtsub) { + compute_dSdt(S, dSdt, dtsub, fr_as_crse, fr_as_fine); + }, + // Optional. In case if there is anything needed after each RK substep. 
+ [&] (int /*stage*/, MultiFab& S) { computeTemp(S,0); }); return dt; } @@ -254,5 +239,3 @@ CNS::compute_dSdt (const MultiFab& S, MultiFab& dSdt, Real dt, } } } - - diff --git a/Tests/GPU/CNS/Source/diffusion/CNS_diffusion_K.H b/Tests/GPU/CNS/Source/diffusion/CNS_diffusion_K.H index b9bf5a18f78..75f4f784fad 100644 --- a/Tests/GPU/CNS/Source/diffusion/CNS_diffusion_K.H +++ b/Tests/GPU/CNS/Source/diffusion/CNS_diffusion_K.H @@ -17,24 +17,24 @@ cns_diffcoef (int i, int j, int k, { using amrex::Real; - coefs(i,j,k,CETA) = parm.C_S * std::sqrt(q(i,j,k,QTEMP)) * q(i,j,k,QTEMP) / (q(i,j,k,QTEMP)+parm.T_S); - coefs(i,j,k,CXI) = Real(0.0); - coefs(i,j,k,CLAM) = coefs(i,j,k,CETA)*parm.cp/parm.Pr; + coefs(i,j,k,CETA) = parm.C_S * std::sqrt(q(i,j,k,QTEMP)) * q(i,j,k,QTEMP) / (q(i,j,k,QTEMP)+parm.T_S); + coefs(i,j,k,CXI) = Real(0.0); + coefs(i,j,k,CLAM) = coefs(i,j,k,CETA)*parm.cp/parm.Pr; } AMREX_GPU_DEVICE inline void cns_constcoef (int i, int j, int k, - amrex::Array4 const& q, + amrex::Array4 const& /*q*/, amrex::Array4 const& coefs, Parm const& parm) noexcept { using amrex::Real; - coefs(i,j,k,CETA) = parm.const_visc_mu; - coefs(i,j,k,CXI) = parm.const_visc_ki; - coefs(i,j,k,CLAM) = parm.const_lambda; + coefs(i,j,k,CETA) = parm.const_visc_mu; + coefs(i,j,k,CXI) = parm.const_visc_ki; + coefs(i,j,k,CLAM) = parm.const_lambda; } AMREX_GPU_DEVICE @@ -45,7 +45,7 @@ cns_diff_x (int i, int j, int k, amrex::Array4 const& coeffs, amrex::GpuArray const& dxinv, amrex::Array4 const& fx, - Parm const& parm) noexcept + Parm const& /*parm*/) noexcept { using amrex::Real; @@ -81,7 +81,7 @@ cns_diff_y (int i, int j, int k, amrex::Array4 const& q, amrex::Array4 const& coeffs, amrex::GpuArray const& dxinv, amrex::Array4 const& fy, - Parm const& parm) noexcept + Parm const& /*parm*/) noexcept { using amrex::Real; @@ -119,7 +119,7 @@ cns_diff_z (int i, int j, int k, amrex::Array4 const& coeffs, amrex::GpuArray const& dxinv, amrex::Array4 const& fz, - Parm const& parm) noexcept + Parm const& /*parm*/) noexcept { using amrex::Real; From 2409dd09cf099f52b0e277b4fd77e2bd8a141151 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Mon, 10 Oct 2022 10:06:46 -0700 Subject: [PATCH 2/8] Update Src/Base/AMReX_RungeKutta.H Co-authored-by: Jean M. Sexton --- Src/Base/AMReX_RungeKutta.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_RungeKutta.H b/Src/Base/AMReX_RungeKutta.H index 5c8cff7a676..73ac9c5cd28 100644 --- a/Src/Base/AMReX_RungeKutta.H +++ b/Src/Base/AMReX_RungeKutta.H @@ -19,7 +19,7 @@ namespace amrex { * and RK4, they also need a callable object for storing the data needed for * filling coarse/fine boundaries in AMR simulations. * - * The callable object for righ-hand side has the signature of `void(int + * The callable object for right-hand side has the signature of `void(int * stage, MF& dudt, MF const& u, Real dt)`, where `stage` is the RK stage * number starting from 1, `dudt` is the output, `u` is the input, and `dt` * is the sub-time step, which can be used for reflux operations in AMR From 26568440d0226c05c05a249166fb9e5f6008ec5a Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Mon, 10 Oct 2022 10:06:54 -0700 Subject: [PATCH 3/8] Update Src/Base/AMReX_RungeKutta.H Co-authored-by: Jean M. 
Sexton --- Src/Base/AMReX_RungeKutta.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_RungeKutta.H b/Src/Base/AMReX_RungeKutta.H index 73ac9c5cd28..9fe37eea641 100644 --- a/Src/Base/AMReX_RungeKutta.H +++ b/Src/Base/AMReX_RungeKutta.H @@ -265,7 +265,7 @@ void RK4 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, detail::rk_update(Unew, Uold, rkk[1], Real(0.5)*dt); post_stage(2, Unew); - // RK4 Step 3 + // RK4 stage 3 fillbndry(3, Unew, t_half); frhs(3, rkk[2], Unew, dt/Real(3.)); // Unew = Uold + k3 * dt; From 10966094eafd4bc3dd1decbb376cf6218f7fa00c Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Mon, 10 Oct 2022 10:07:05 -0700 Subject: [PATCH 4/8] Update Src/Base/AMReX_RungeKutta.H Co-authored-by: Jean M. Sexton --- Src/Base/AMReX_RungeKutta.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_RungeKutta.H b/Src/Base/AMReX_RungeKutta.H index 9fe37eea641..29affc69a55 100644 --- a/Src/Base/AMReX_RungeKutta.H +++ b/Src/Base/AMReX_RungeKutta.H @@ -272,7 +272,7 @@ void RK4 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, detail::rk_update(Unew, Uold, rkk[2], dt); post_stage(3, Unew); - // Step 4 of RK4 + // RK4 stage 4 fillbndry(4, Unew, time+dt); frhs(4, rkk[3], Unew, dt/Real(6.)); // Unew = Uold + (k1/6 + k2/3 + k3/3 + k4/6) * dt From 67964940252393bc2e5f6c7af638357136e2c83d Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Mon, 10 Oct 2022 10:07:17 -0700 Subject: [PATCH 5/8] Update Src/Base/AMReX_RungeKutta.H Co-authored-by: Jean M. Sexton --- Src/Base/AMReX_RungeKutta.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_RungeKutta.H b/Src/Base/AMReX_RungeKutta.H index 29affc69a55..114f05a2387 100644 --- a/Src/Base/AMReX_RungeKutta.H +++ b/Src/Base/AMReX_RungeKutta.H @@ -38,7 +38,7 @@ namespace amrex { * * For RK3 and RK4, one must also provide a callable object with the * signature of `void(Array const& rkk)`, where `order` is the RK - * order and `rkk` contains the righ-hand side at all the RK stages. The + * order and `rkk` contains the right-hand side at all the RK stages. The * FillPatcher class can be useful for implementing such a callable. See * AmrLevel::RK for an example. */ From d94d92a3207db06c2d334452a8f78c0e31926438 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Mon, 10 Oct 2022 10:52:36 -0700 Subject: [PATCH 6/8] Add references to RK3 and RK4 --- Src/Base/AMReX_RungeKutta.H | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Src/Base/AMReX_RungeKutta.H b/Src/Base/AMReX_RungeKutta.H index 114f05a2387..291b820872c 100644 --- a/Src/Base/AMReX_RungeKutta.H +++ b/Src/Base/AMReX_RungeKutta.H @@ -10,14 +10,19 @@ namespace amrex { * \brief Functions for Runge-Kutta methods * * This namespace RungeKutta has functions for a number RK methods, RK2, RK3 - * and RK4. The function templates take the old data in FabArray/MultiFab - * as input, and evolve the system for one time step. The result is stored - * in another FabArray/MultiFab. These two FabArrays must have ghost cells - * if they are needed for evaluating the right-hand side. The functions - * take three callable objects for computing the right-hand side, filling - * ghost cells, and optionally post-processing RK stage results. For RK3 - * and RK4, they also need a callable object for storing the data needed for - * filling coarse/fine boundaries in AMR simulations. + * and RK4. 
+ * Here, RK2 refers to the explicit trapezoid rule, RK3 refers to
+ * the SSPRK3
+ * (https://en.wikipedia.org/wiki/List_of_Runge%E2%80%93Kutta_methods#Third-order_Strong_Stability_Preserving_Runge-Kutta_(SSPRK3)),
+ * and RK4 is the classical fourth-order method
+ * (https://en.wikipedia.org/wiki/List_of_Runge%E2%80%93Kutta_methods#Classic_fourth-order_method).
+ * The function templates take the old data in FabArray/MultiFab as input,
+ * and evolve the system for one time step.  The result is stored in another
+ * FabArray/MultiFab.  These two FabArrays must have ghost cells if they are
+ * needed for evaluating the right-hand side.  The functions take three
+ * callable objects for computing the right-hand side, filling ghost cells,
+ * and optionally post-processing RK stage results.  For RK3 and RK4, they
+ * also need a callable object for storing the data needed for filling
+ * coarse/fine boundaries in AMR simulations.
  *
  * The callable object for right-hand side has the signature of `void(int

From 438a8466e0733381f97ffd711fb272c99c3a729a Mon Sep 17 00:00:00 2001
From: Weiqun Zhang
Date: Thu, 13 Oct 2022 19:10:15 -0700
Subject: [PATCH 7/8] Use MF instead of FabArrayBase so that the meta-data is
 properly destroyed

---
 Src/AmrCore/AMReX_FillPatcher.H | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Src/AmrCore/AMReX_FillPatcher.H b/Src/AmrCore/AMReX_FillPatcher.H
index 4fe28e5ec8a..22b14d35c0d 100644
--- a/Src/AmrCore/AMReX_FillPatcher.H
+++ b/Src/AmrCore/AMReX_FillPatcher.H
@@ -207,7 +207,7 @@ private:
     int m_ncomp;
     InterpBase* m_interp;
     EB2::IndexSpace const* m_eb_index_space = nullptr;
-    FabArrayBase m_sfine;
+    MF m_sfine;
     IntVect m_ratio;
     Vector<std::pair<Real,std::unique_ptr<MF>>> m_cf_crse_data;
     std::unique_ptr<MF> m_cf_crse_data_tmp;
@@ -234,7 +234,7 @@ FillPatcher<MF>::FillPatcher (BoxArray const& fba, DistributionMapping const& fd
     m_ncomp(ncomp),
     m_interp(interp),
     m_eb_index_space(eb_index_space),
-    m_sfine(fba, fdm, 1, nghost)
+    m_sfine(fba, fdm, 1, nghost, MFInfo().SetAlloc(false))
 {
     static_assert(IsFabArray<MF>::value,
                   "FillPatcher: MF must be FabArray type");

From 9ddc01537b2781de5ce68695eea687bade8336e1 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang
Date: Fri, 14 Oct 2022 09:31:04 -0700
Subject: [PATCH 8/8] Pass t to user's RHS function

---
 Src/Base/AMReX_RungeKutta.H          | 25 +++++++++++++------------
 Tests/GPU/CNS/Source/CNS_advance.cpp |  3 ++-
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/Src/Base/AMReX_RungeKutta.H b/Src/Base/AMReX_RungeKutta.H
index 291b820872c..b5e35f783c5 100644
--- a/Src/Base/AMReX_RungeKutta.H
+++ b/Src/Base/AMReX_RungeKutta.H
@@ -25,9 +25,10 @@ namespace amrex {
  * coarse/fine boundaries in AMR simulations.
  *
  * The callable object for right-hand side has the signature of `void(int
- * stage, MF& dudt, MF const& u, Real dt)`, where `stage` is the RK stage
- * number starting from 1, `dudt` is the output, `u` is the input, and `dt`
- * is the sub-time step, which can be used for reflux operations in AMR
- * simulations.
+ * stage, MF& dudt, MF const& u, Real t, Real dt)`, where `stage` is the RK
+ * stage number starting from 1, `dudt` is the output, `u` is the input, `t`
+ * is the first-order approximate time of the stage, and `dt` is the
+ * sub-time step, which can be used for reflux operations in AMR
+ * simulations.
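+ *
+ * For instance, a right-hand side callable that also does reflux operations
+ * now takes the form (an illustrative sketch; compute_rhs and do_reflux_ops
+ * are hypothetical):
+ *
+ * \code
+ *     [&] (int stage, MultiFab& dudt, MultiFab const& u, Real t, Real dtsub) {
+ *         compute_rhs(dudt, u, t);
+ *         do_reflux_ops(dudt, dtsub);
+ *     }
+ * \endcode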
* * The callable object for filling ghost cells has the signature of @@ -166,14 +167,14 @@ void RK2 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, // RK2 stage 1 fillbndry(1, Uold, time); - frhs(1, dUdt, Uold, Real(0.5)*dt); + frhs(1, dUdt, Uold, time, Real(0.5)*dt); // Unew = Uold + dt * dUdt detail::rk_update(Unew, Uold, dUdt, dt); post_stage(1, Unew); // RK2 stage 2 fillbndry(2, Unew, time+dt); - frhs(2, dUdt, Unew, Real(0.5)*dt); + frhs(2, dUdt, Unew, time, Real(0.5)*dt); // Unew = (Uold+Unew)/2 + dUdt_2 * dt/2, // which is Unew = Uold + dt/2 * (dUdt_1 + dUdt_2) detail::rk2_update_2(Unew, Uold, dUdt, dt); @@ -207,14 +208,14 @@ void RK3 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, // RK3 stage 1 fillbndry(1, Uold, time); - frhs(1, rkk[0], Uold, dt/Real(6.)); + frhs(1, rkk[0], Uold, time, dt/Real(6.)); // Unew = Uold + k1 * dt detail::rk_update(Unew, Uold, rkk[0], dt); post_stage(1, Unew); // RK3 stage 2 fillbndry(2, Unew, time+dt); - frhs(2, rkk[1], Unew, dt/Real(6.)); + frhs(2, rkk[1], Unew, time+dt, dt/Real(6.)); // Unew = Uold + (k1+k2) * dt/4 detail::rk_update(Unew, Uold, rkk[0], rkk[1], Real(0.25)*dt); post_stage(2, Unew); @@ -222,7 +223,7 @@ void RK3 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, // RK3 stage 3 Real t_half = time + Real(0.5)*dt; fillbndry(3, Unew, t_half); - frhs(3, rkk[2], Unew, dt*Real(2./3.)); + frhs(3, rkk[2], Unew, t_half, dt*Real(2./3.)); // Unew = Uold + (k1/6 + k2/6 + k3*(2/3)) * dt detail::rk3_update_3(Unew, Uold, rkk, Real(1./6.)*dt); post_stage(3, Unew); @@ -257,7 +258,7 @@ void RK4 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, // RK4 stage 1 fillbndry(1, Uold, time); - frhs(1, rkk[0], Uold, dt/Real(6.)); + frhs(1, rkk[0], Uold, time, dt/Real(6.)); // Unew = Uold + k1 * dt/2 detail::rk_update(Unew, Uold, rkk[0], Real(0.5)*dt); post_stage(1, Unew); @@ -265,21 +266,21 @@ void RK4 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, // RK4 stage 2 Real t_half = time + Real(0.5)*dt; fillbndry(2, Unew, t_half); - frhs(2, rkk[1], Unew, dt/Real(3.)); + frhs(2, rkk[1], Unew, t_half, dt/Real(3.)); // Unew = Uold + k2 * dt/2 detail::rk_update(Unew, Uold, rkk[1], Real(0.5)*dt); post_stage(2, Unew); // RK4 stage 3 fillbndry(3, Unew, t_half); - frhs(3, rkk[2], Unew, dt/Real(3.)); + frhs(3, rkk[2], Unew, t_half, dt/Real(3.)); // Unew = Uold + k3 * dt; detail::rk_update(Unew, Uold, rkk[2], dt); post_stage(3, Unew); // RK4 stage 4 fillbndry(4, Unew, time+dt); - frhs(4, rkk[3], Unew, dt/Real(6.)); + frhs(4, rkk[3], Unew, time+dt, dt/Real(6.)); // Unew = Uold + (k1/6 + k2/3 + k3/3 + k4/6) * dt detail::rk4_update_4(Unew, Uold, rkk, Real(1./6.)*dt); post_stage(4, Unew); diff --git a/Tests/GPU/CNS/Source/CNS_advance.cpp b/Tests/GPU/CNS/Source/CNS_advance.cpp index 3bcb4c388bf..99749dded19 100644 --- a/Tests/GPU/CNS/Source/CNS_advance.cpp +++ b/Tests/GPU/CNS/Source/CNS_advance.cpp @@ -33,7 +33,8 @@ CNS::advance (Real time, Real dt, int iteration, int ncycle) RK(rk_order, State_Type, time, dt, iteration, ncycle, // Given state S, compute dSdt. dtsub is needed for flux register operations - [&] (int /*stage*/, MultiFab& dSdt, MultiFab const& S, Real dtsub) { + [&] (int /*stage*/, MultiFab& dSdt, MultiFab const& S, + Real /*t*/, Real dtsub) { compute_dSdt(S, dSdt, dtsub, fr_as_crse, fr_as_fine); }, // Optional. In case if there is anything needed after each RK substep.
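With the whole series applied, CNS::advance drives AmrLevel::RK with the
updated right-hand-side signature.  For reference, the final form of the
function, reconstructed from the diffs above (flux-register setup elided),
is:

    Real
    CNS::advance (Real time, Real dt, int iteration, int ncycle)
    {
        BL_PROFILE("CNS::advance()");

        for (int i = 0; i < num_state_data_types; ++i) {
            state[i].swapTimeLevels(dt);
        }

        // ... set up fr_as_crse and fr_as_fine flux registers as before ...

        RK(rk_order, State_Type, time, dt, iteration, ncycle,
           // Given state S, compute dSdt. dtsub is needed for flux register
           // operations.
           [&] (int /*stage*/, MultiFab& dSdt, MultiFab const& S,
                Real /*t*/, Real dtsub) {
               compute_dSdt(S, dSdt, dtsub, fr_as_crse, fr_as_fine);
           },
           // Optional. In case there is anything needed after each RK substep.
           [&] (int /*stage*/, MultiFab& S) { computeTemp(S,0); });

        return dt;
    }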