diff --git a/cicecore/cicedynB/dynamics/ice_dyn_eap.F90 b/cicecore/cicedynB/dynamics/ice_dyn_eap.F90 index 3b31fa8cd..8d4f5cccf 100644 --- a/cicecore/cicedynB/dynamics/ice_dyn_eap.F90 +++ b/cicecore/cicedynB/dynamics/ice_dyn_eap.F90 @@ -122,7 +122,8 @@ subroutine eap (dt) use ice_dyn_shared, only: fcor_blk, ndte, dtei, & denom1, uvel_init, vvel_init, arlx1i, & dyn_prep1, dyn_prep2, stepu, dyn_finish, & - basal_stress_coeff, basalstress + basal_stress_coeff, basalstress, & + stack_velocity_field, unstack_velocity_field use ice_flux, only: rdg_conv, strairxT, strairyT, & strairx, strairy, uocn, vocn, ss_tltx, ss_tlty, iceumask, fm, & strtltx, strtlty, strocnx, strocny, strintx, strinty, taubx, tauby, & @@ -354,11 +355,6 @@ subroutine eap (dt) vicen = vicen (i,j,:,iblk), & strength = strength(i,j, iblk) ) enddo ! ij - - ! load velocity into array for boundary updates - fld2(:,:,1,iblk) = uvel(:,:,iblk) - fld2(:,:,2,iblk) = vvel(:,:,iblk) - enddo ! iblk !$TCXOMP END PARALLEL DO @@ -369,19 +365,8 @@ subroutine eap (dt) call ice_timer_start(timer_bound) call ice_HaloUpdate (strength, halo_info, & field_loc_center, field_type_scalar) - ! velocities may have changed in dyn_prep2 - call ice_HaloUpdate (fld2, halo_info, & - field_loc_NEcorner, field_type_vector) call ice_timer_stop(timer_bound) - ! unload - !$OMP PARALLEL DO PRIVATE(iblk) - do iblk = 1, nblocks - uvel(:,:,iblk) = fld2(:,:,1,iblk) - vvel(:,:,iblk) = fld2(:,:,2,iblk) - enddo - !$OMP END PARALLEL DO - if (maskhalo_dyn) then call ice_timer_start(timer_bound) halomask = 0 @@ -392,6 +377,19 @@ subroutine eap (dt) call ice_HaloMask(halo_info_mask, halo_info, halomask) endif + ! velocities may have changed in dyn_prep2 + call stack_velocity_field(uvel, vvel, fld2) + call ice_timer_start(timer_bound) + if (maskhalo_dyn) then + call ice_HaloUpdate (fld2, halo_info_mask, & + field_loc_NEcorner, field_type_vector) + else + call ice_HaloUpdate (fld2, halo_info, & + field_loc_NEcorner, field_type_vector) + endif + call ice_timer_stop(timer_bound) + call unstack_velocity_field(fld2, uvel, vvel) + !----------------------------------------------------------------- ! basal stress coefficients (landfast ice) !----------------------------------------------------------------- @@ -472,10 +470,6 @@ subroutine eap (dt) uvel (:,:,iblk), vvel (:,:,iblk), & Tbu (:,:,iblk)) - ! load velocity into array for boundary updates - fld2(:,:,1,iblk) = uvel(:,:,iblk) - fld2(:,:,2,iblk) = vvel(:,:,iblk) - !----------------------------------------------------------------- ! evolution of structure tensor A !----------------------------------------------------------------- @@ -501,6 +495,7 @@ subroutine eap (dt) enddo !$TCXOMP END PARALLEL DO + call stack_velocity_field(uvel, vvel, fld2) call ice_timer_start(timer_bound) if (maskhalo_dyn) then call ice_HaloUpdate (fld2, halo_info_mask, & @@ -510,14 +505,7 @@ subroutine eap (dt) field_loc_NEcorner, field_type_vector) endif call ice_timer_stop(timer_bound) - - ! unload - !$OMP PARALLEL DO PRIVATE(iblk) - do iblk = 1, nblocks - uvel(:,:,iblk) = fld2(:,:,1,iblk) - vvel(:,:,iblk) = fld2(:,:,2,iblk) - enddo - !$OMP END PARALLEL DO + call unstack_velocity_field(fld2, uvel, vvel) enddo ! subcycling @@ -556,16 +544,12 @@ end subroutine eap !======================================================================= ! Initialize parameters and variables needed for the eap dynamics -! (based on init_evp) +! 
(based on init_dyn) - subroutine init_eap (dt) + subroutine init_eap use ice_blocks, only: nx_block, ny_block use ice_domain, only: nblocks - use ice_dyn_shared, only: init_evp - - real (kind=dbl_kind), intent(in) :: & - dt ! time step ! local variables @@ -595,8 +579,6 @@ subroutine init_eap (dt) file=__FILE__, line=__LINE__) phi = pi/c12 ! diamond shaped floe smaller angle (default phi = 30 deg) - call init_evp (dt) - !$OMP PARALLEL DO PRIVATE(iblk,i,j) do iblk = 1, nblocks do j = 1, ny_block @@ -1321,7 +1303,7 @@ subroutine stress_eap (nx_block, ny_block, & tensionse = -cym(i,j)*uvel(i ,j-1) - dyt(i,j)*uvel(i-1,j-1) & + cxp(i,j)*vvel(i ,j-1) - dxt(i,j)*vvel(i ,j ) - ! shearing strain rate = e_12 + ! shearing strain rate = 2*e_12 shearne = -cym(i,j)*vvel(i ,j ) - dyt(i,j)*vvel(i-1,j ) & - cxm(i,j)*uvel(i ,j ) - dxt(i,j)*uvel(i ,j-1) shearnw = -cyp(i,j)*vvel(i-1,j ) + dyt(i,j)*vvel(i ,j ) & diff --git a/cicecore/cicedynB/dynamics/ice_dyn_evp.F90 b/cicecore/cicedynB/dynamics/ice_dyn_evp.F90 index 0f8acd547..c4312c325 100644 --- a/cicecore/cicedynB/dynamics/ice_dyn_evp.F90 +++ b/cicecore/cicedynB/dynamics/ice_dyn_evp.F90 @@ -94,7 +94,7 @@ subroutine evp (dt) ice_timer_start, ice_timer_stop, timer_evp_1d, timer_evp_2d use ice_dyn_evp_1d, only: ice_dyn_evp_1d_copyin, ice_dyn_evp_1d_kernel, & ice_dyn_evp_1d_copyout - use ice_dyn_shared, only: kevp_kernel + use ice_dyn_shared, only: kevp_kernel, stack_velocity_field, unstack_velocity_field real (kind=dbl_kind), intent(in) :: & dt ! time step @@ -297,10 +297,6 @@ subroutine evp (dt) strength = strength(i,j, iblk) ) enddo ! ij - ! load velocity into array for boundary updates - fld2(:,:,1,iblk) = uvel(:,:,iblk) - fld2(:,:,2,iblk) = vvel(:,:,iblk) - enddo ! iblk !$TCXOMP END PARALLEL DO @@ -311,19 +307,8 @@ subroutine evp (dt) call ice_timer_start(timer_bound) call ice_HaloUpdate (strength, halo_info, & field_loc_center, field_type_scalar) - ! velocities may have changed in dyn_prep2 - call ice_HaloUpdate (fld2, halo_info, & - field_loc_NEcorner, field_type_vector) call ice_timer_stop(timer_bound) - ! unload - !$OMP PARALLEL DO PRIVATE(iblk) - do iblk = 1, nblocks - uvel(:,:,iblk) = fld2(:,:,1,iblk) - vvel(:,:,iblk) = fld2(:,:,2,iblk) - enddo - !$OMP END PARALLEL DO - if (maskhalo_dyn) then call ice_timer_start(timer_bound) halomask = 0 @@ -334,6 +319,19 @@ subroutine evp (dt) call ice_HaloMask(halo_info_mask, halo_info, halomask) endif + ! velocities may have changed in dyn_prep2 + call stack_velocity_field(uvel, vvel, fld2) + call ice_timer_start(timer_bound) + if (maskhalo_dyn) then + call ice_HaloUpdate (fld2, halo_info_mask, & + field_loc_NEcorner, field_type_vector) + else + call ice_HaloUpdate (fld2, halo_info, & + field_loc_NEcorner, field_type_vector) + endif + call ice_timer_stop(timer_bound) + call unstack_velocity_field(fld2, uvel, vvel) + !----------------------------------------------------------------- ! basal stress coefficients (landfast ice) !----------------------------------------------------------------- @@ -442,13 +440,10 @@ subroutine evp (dt) uvel_init(:,:,iblk), vvel_init(:,:,iblk),& uvel (:,:,iblk), vvel (:,:,iblk), & Tbu (:,:,iblk)) - - ! 
load velocity into array for boundary updates - fld2(:,:,1,iblk) = uvel(:,:,iblk) - fld2(:,:,2,iblk) = vvel(:,:,iblk) enddo !$TCXOMP END PARALLEL DO + call stack_velocity_field(uvel, vvel, fld2) call ice_timer_start(timer_bound) if (maskhalo_dyn) then call ice_HaloUpdate (fld2, halo_info_mask, & @@ -458,14 +453,7 @@ subroutine evp (dt) field_loc_NEcorner, field_type_vector) endif call ice_timer_stop(timer_bound) - - ! unload - !$OMP PARALLEL DO PRIVATE(iblk) - do iblk = 1, nblocks - uvel(:,:,iblk) = fld2(:,:,1,iblk) - vvel(:,:,iblk) = fld2(:,:,2,iblk) - enddo - !$OMP END PARALLEL DO + call unstack_velocity_field(fld2, uvel, vvel) enddo ! subcycling endif ! kevp_kernel @@ -599,6 +587,8 @@ subroutine stress (nx_block, ny_block, & rdg_conv, rdg_shear, & str ) + use ice_dyn_shared, only: strain_rates, deformations + integer (kind=int_kind), intent(in) :: & nx_block, ny_block, & ! block dimensions ksub , & ! subcycling step @@ -676,58 +666,20 @@ subroutine stress (nx_block, ny_block, & ! strain rates ! NOTE these are actually strain rates * area (m^2/s) !----------------------------------------------------------------- - ! divergence = e_11 + e_22 - divune = cyp(i,j)*uvel(i ,j ) - dyt(i,j)*uvel(i-1,j ) & - + cxp(i,j)*vvel(i ,j ) - dxt(i,j)*vvel(i ,j-1) - divunw = cym(i,j)*uvel(i-1,j ) + dyt(i,j)*uvel(i ,j ) & - + cxp(i,j)*vvel(i-1,j ) - dxt(i,j)*vvel(i-1,j-1) - divusw = cym(i,j)*uvel(i-1,j-1) + dyt(i,j)*uvel(i ,j-1) & - + cxm(i,j)*vvel(i-1,j-1) + dxt(i,j)*vvel(i-1,j ) - divuse = cyp(i,j)*uvel(i ,j-1) - dyt(i,j)*uvel(i-1,j-1) & - + cxm(i,j)*vvel(i ,j-1) + dxt(i,j)*vvel(i ,j ) - - ! tension strain rate = e_11 - e_22 - tensionne = -cym(i,j)*uvel(i ,j ) - dyt(i,j)*uvel(i-1,j ) & - + cxm(i,j)*vvel(i ,j ) + dxt(i,j)*vvel(i ,j-1) - tensionnw = -cyp(i,j)*uvel(i-1,j ) + dyt(i,j)*uvel(i ,j ) & - + cxm(i,j)*vvel(i-1,j ) + dxt(i,j)*vvel(i-1,j-1) - tensionsw = -cyp(i,j)*uvel(i-1,j-1) + dyt(i,j)*uvel(i ,j-1) & - + cxp(i,j)*vvel(i-1,j-1) - dxt(i,j)*vvel(i-1,j ) - tensionse = -cym(i,j)*uvel(i ,j-1) - dyt(i,j)*uvel(i-1,j-1) & - + cxp(i,j)*vvel(i ,j-1) - dxt(i,j)*vvel(i ,j ) - - ! shearing strain rate = e_12 - shearne = -cym(i,j)*vvel(i ,j ) - dyt(i,j)*vvel(i-1,j ) & - - cxm(i,j)*uvel(i ,j ) - dxt(i,j)*uvel(i ,j-1) - shearnw = -cyp(i,j)*vvel(i-1,j ) + dyt(i,j)*vvel(i ,j ) & - - cxm(i,j)*uvel(i-1,j ) - dxt(i,j)*uvel(i-1,j-1) - shearsw = -cyp(i,j)*vvel(i-1,j-1) + dyt(i,j)*vvel(i ,j-1) & - - cxp(i,j)*uvel(i-1,j-1) + dxt(i,j)*uvel(i-1,j ) - shearse = -cym(i,j)*vvel(i ,j-1) - dyt(i,j)*vvel(i-1,j-1) & - - cxp(i,j)*uvel(i ,j-1) + dxt(i,j)*uvel(i ,j ) - - ! Delta (in the denominator of zeta, eta) - Deltane = sqrt(divune**2 + ecci*(tensionne**2 + shearne**2)) - Deltanw = sqrt(divunw**2 + ecci*(tensionnw**2 + shearnw**2)) - Deltasw = sqrt(divusw**2 + ecci*(tensionsw**2 + shearsw**2)) - Deltase = sqrt(divuse**2 + ecci*(tensionse**2 + shearse**2)) - - !----------------------------------------------------------------- - ! on last subcycle, save quantities for mechanical redistribution - !----------------------------------------------------------------- - if (ksub == ndte) then - divu(i,j) = p25*(divune + divunw + divuse + divusw) * tarear(i,j) - tmp = p25*(Deltane + Deltanw + Deltase + Deltasw) * tarear(i,j) - rdg_conv(i,j) = -min(divu(i,j),c0) - rdg_shear(i,j) = p5*(tmp-abs(divu(i,j))) - - ! diagnostic only - ! 
shear = sqrt(tension**2 + shearing**2) - shear(i,j) = p25*tarear(i,j)*sqrt( & - (tensionne + tensionnw + tensionse + tensionsw)**2 & - + (shearne + shearnw + shearse + shearsw)**2) - - endif + call strain_rates (nx_block, ny_block, & + i, j, & + uvel, vvel, & + dxt, dyt, & + cxp, cyp, & + cxm, cym, & + divune, divunw, & + divuse, divusw, & + tensionne, tensionnw, & + tensionse, tensionsw, & + shearne, shearnw, & + shearse, shearsw, & + Deltane, Deltanw, & + Deltase, Deltasw ) !----------------------------------------------------------------- ! strength/Delta ! kg/s @@ -902,6 +854,23 @@ subroutine stress (nx_block, ny_block, & enddo ! ij + !----------------------------------------------------------------- + ! on last subcycle, save quantities for mechanical redistribution + !----------------------------------------------------------------- + if (ksub == ndte) then + call deformations (nx_block , ny_block , & + icellt , & + indxti , indxtj , & + uvel , vvel , & + dxt , dyt , & + cxp , cyp , & + cxm , cym , & + tarear , & + shear , divu , & + rdg_conv , rdg_shear ) + + endif + end subroutine stress !======================================================================= diff --git a/cicecore/cicedynB/dynamics/ice_dyn_evp_1d.F90 b/cicecore/cicedynB/dynamics/ice_dyn_evp_1d.F90 index c88a7de3a..9fac97a89 100644 --- a/cicecore/cicedynB/dynamics/ice_dyn_evp_1d.F90 +++ b/cicecore/cicedynB/dynamics/ice_dyn_evp_1d.F90 @@ -326,7 +326,7 @@ subroutine stress_i(NA_len, & ! tension strain rate = e_11 - e_22 tensionne = -cym*uvel(iw) - dyt(iw)*tmp_uvel_ee & + cxm*vvel(iw) + dxt(iw)*tmp_vvel_se - ! shearing strain rate = e_12 + ! shearing strain rate = 2*e_12 shearne = -cym*vvel(iw) - dyt(iw)*tmp_vvel_ee & - cxm*uvel(iw) - dxt(iw)*tmp_uvel_se ! Delta (in the denominator of zeta, eta) @@ -614,7 +614,7 @@ subroutine stress_l(NA_len, tarear, & tensionse = -cym*tmp_uvel_se - dyt(iw)*tmp_uvel_ne & + cxp*tmp_vvel_se - dxt(iw)*vvel(iw) - ! shearing strain rate = e_12 + ! shearing strain rate = 2*e_12 shearne = -cym*vvel(iw) - dyt(iw)*tmp_vvel_ee & - cxm*uvel(iw) - dxt(iw)*tmp_uvel_se shearnw = -cyp*tmp_vvel_ee + dyt(iw)*vvel(iw) & diff --git a/cicecore/cicedynB/dynamics/ice_dyn_shared.F90 b/cicecore/cicedynB/dynamics/ice_dyn_shared.F90 index c3dc83a24..d9a0919e6 100644 --- a/cicecore/cicedynB/dynamics/ice_dyn_shared.F90 +++ b/cicecore/cicedynB/dynamics/ice_dyn_shared.F90 @@ -22,9 +22,10 @@ module ice_dyn_shared implicit none private - public :: init_evp, set_evp_parameters, stepu, principal_stress, & + public :: init_dyn, set_evp_parameters, stepu, principal_stress, & dyn_prep1, dyn_prep2, dyn_finish, basal_stress_coeff, & - alloc_dyn_shared + alloc_dyn_shared, deformations, strain_rates, & + stack_velocity_field, unstack_velocity_field ! namelist parameters @@ -78,7 +79,7 @@ module ice_dyn_shared real (kind=dbl_kind), dimension (:,:,:), allocatable, public :: & uvel_init, & ! x-component of velocity (m/s), beginning of timestep vvel_init ! y-component of velocity (m/s), beginning of timestep - + ! ice isotropic tensile strength parameter real (kind=dbl_kind), public :: & Ktens ! T=Ktens*P (tensile strength: see Konig and Holland, 2010) @@ -91,9 +92,9 @@ module ice_dyn_shared k1, & ! 1st free parameter for landfast parameterization k2, & ! second free parameter (N/m^3) for landfast parametrization alphab, & ! alphab=Cb factor in Lemieux et al 2015 - threshold_hw ! max water depth for grounding - ! see keel data from Amundrud et al. 2004 (JGR) - + threshold_hw, & ! max water depth for grounding + ! 
see keel data from Amundrud et al. 2004 (JGR) + u0 = 5e-5_dbl_kind ! residual velocity for basal stress (m/s) !======================================================================= @@ -117,10 +118,10 @@ end subroutine alloc_dyn_shared !======================================================================= -! Initialize parameters and variables needed for the evp dynamics +! Initialize parameters and variables needed for the dynamics ! author: Elizabeth C. Hunke, LANL - subroutine init_evp (dt) + subroutine init_dyn (dt) use ice_blocks, only: nx_block, ny_block use ice_domain, only: nblocks @@ -141,7 +142,7 @@ subroutine init_evp (dt) i, j, & iblk ! block index - character(len=*), parameter :: subname = '(init_evp)' + character(len=*), parameter :: subname = '(init_dyn)' call set_evp_parameters (dt) @@ -199,7 +200,7 @@ subroutine init_evp (dt) enddo ! iblk !$OMP END PARALLEL DO - end subroutine init_evp + end subroutine init_dyn !======================================================================= @@ -690,9 +691,6 @@ subroutine stepu (nx_block, ny_block, & Cb , & ! complete basal stress coeff rhow ! - real (kind=dbl_kind) :: & - u0 = 5e-5_dbl_kind ! residual velocity for basal stress (m/s) - character(len=*), parameter :: subname = '(stepu)' !----------------------------------------------------------------- @@ -993,6 +991,262 @@ end subroutine principal_stress !======================================================================= +! Compute deformations for mechanical redistribution +! +! author: Elizabeth C. Hunke, LANL +! +! 2019: subroutine created by Philippe Blain, ECCC + + subroutine deformations (nx_block, ny_block, & + icellt, & + indxti, indxtj, & + uvel, vvel, & + dxt, dyt, & + cxp, cyp, & + cxm, cym, & + tarear, & + shear, divu, & + rdg_conv, rdg_shear ) + + use ice_constants, only: p25, p5 + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellt ! no. of cells where icetmask = 1 + + integer (kind=int_kind), dimension (nx_block*ny_block), & + intent(in) :: & + indxti , & ! compressed index in i-direction + indxtj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + uvel , & ! x-component of velocity (m/s) + vvel , & ! y-component of velocity (m/s) + dxt , & ! width of T-cell through the middle (m) + dyt , & ! height of T-cell through the middle (m) + cyp , & ! 1.5*HTE - 0.5*HTE + cxp , & ! 1.5*HTN - 0.5*HTN + cym , & ! 0.5*HTE - 1.5*HTE + cxm , & ! 0.5*HTN - 1.5*HTN + tarear ! 1/tarea + + real (kind=dbl_kind), dimension (nx_block,ny_block), & + intent(inout) :: & + shear , & ! strain rate II component (1/s) + divu , & ! strain rate I component, velocity divergence (1/s) + rdg_conv , & ! convergence term for ridging (1/s) + rdg_shear ! shear term for ridging (1/s) + + ! local variables + + integer (kind=int_kind) :: & + i, j, ij + + real (kind=dbl_kind) :: & ! at each corner : + divune, divunw, divuse, divusw , & ! divergence + tensionne, tensionnw, tensionse, tensionsw, & ! tension + shearne, shearnw, shearse, shearsw , & ! shearing + Deltane, Deltanw, Deltase, Deltasw , & ! Delta + tmp ! useful combination + + character(len=*), parameter :: subname = '(deformations)' + + do ij = 1, icellt + i = indxti(ij) + j = indxtj(ij) + + !----------------------------------------------------------------- + ! strain rates + ! 
NOTE these are actually strain rates * area (m^2/s) + !----------------------------------------------------------------- + call strain_rates (nx_block, ny_block, & + i, j, & + uvel, vvel, & + dxt, dyt, & + cxp, cyp, & + cxm, cym, & + divune, divunw, & + divuse, divusw, & + tensionne, tensionnw, & + tensionse, tensionsw, & + shearne, shearnw, & + shearse, shearsw, & + Deltane, Deltanw, & + Deltase, Deltasw ) + !----------------------------------------------------------------- + ! deformations for mechanical redistribution + !----------------------------------------------------------------- + divu(i,j) = p25*(divune + divunw + divuse + divusw) * tarear(i,j) + tmp = p25*(Deltane + Deltanw + Deltase + Deltasw) * tarear(i,j) + rdg_conv(i,j) = -min(divu(i,j),c0) + rdg_shear(i,j) = p5*(tmp-abs(divu(i,j))) + + ! diagnostic only + ! shear = sqrt(tension**2 + shearing**2) + shear(i,j) = p25*tarear(i,j)*sqrt( & + (tensionne + tensionnw + tensionse + tensionsw)**2 + & + (shearne + shearnw + shearse + shearsw )**2) + + enddo ! ij + + end subroutine deformations + +!======================================================================= + +! Compute strain rates +! +! author: Elizabeth C. Hunke, LANL +! +! 2019: subroutine created by Philippe Blain, ECCC + + subroutine strain_rates (nx_block, ny_block, & + i, j, & + uvel, vvel, & + dxt, dyt, & + cxp, cyp, & + cxm, cym, & + divune, divunw, & + divuse, divusw, & + tensionne, tensionnw, & + tensionse, tensionsw, & + shearne, shearnw, & + shearse, shearsw, & + Deltane, Deltanw, & + Deltase, Deltasw ) + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block ! block dimensions + + integer (kind=int_kind) :: & + i, j ! indices + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + uvel , & ! x-component of velocity (m/s) + vvel , & ! y-component of velocity (m/s) + dxt , & ! width of T-cell through the middle (m) + dyt , & ! height of T-cell through the middle (m) + cyp , & ! 1.5*HTE - 0.5*HTE + cxp , & ! 1.5*HTN - 0.5*HTN + cym , & ! 0.5*HTE - 1.5*HTE + cxm ! 0.5*HTN - 1.5*HTN + + real (kind=dbl_kind), intent(out):: & ! at each corner : + divune, divunw, divuse, divusw , & ! divergence + tensionne, tensionnw, tensionse, tensionsw, & ! tension + shearne, shearnw, shearse, shearsw , & ! shearing + Deltane, Deltanw, Deltase, Deltasw ! Delta + + character(len=*), parameter :: subname = '(strain_rates)' + + !----------------------------------------------------------------- + ! strain rates + ! NOTE these are actually strain rates * area (m^2/s) + !----------------------------------------------------------------- + + ! divergence = e_11 + e_22 + divune = cyp(i,j)*uvel(i ,j ) - dyt(i,j)*uvel(i-1,j ) & + + cxp(i,j)*vvel(i ,j ) - dxt(i,j)*vvel(i ,j-1) + divunw = cym(i,j)*uvel(i-1,j ) + dyt(i,j)*uvel(i ,j ) & + + cxp(i,j)*vvel(i-1,j ) - dxt(i,j)*vvel(i-1,j-1) + divusw = cym(i,j)*uvel(i-1,j-1) + dyt(i,j)*uvel(i ,j-1) & + + cxm(i,j)*vvel(i-1,j-1) + dxt(i,j)*vvel(i-1,j ) + divuse = cyp(i,j)*uvel(i ,j-1) - dyt(i,j)*uvel(i-1,j-1) & + + cxm(i,j)*vvel(i ,j-1) + dxt(i,j)*vvel(i ,j ) + + ! 
tension strain rate = e_11 - e_22 + tensionne = -cym(i,j)*uvel(i ,j ) - dyt(i,j)*uvel(i-1,j ) & + + cxm(i,j)*vvel(i ,j ) + dxt(i,j)*vvel(i ,j-1) + tensionnw = -cyp(i,j)*uvel(i-1,j ) + dyt(i,j)*uvel(i ,j ) & + + cxm(i,j)*vvel(i-1,j ) + dxt(i,j)*vvel(i-1,j-1) + tensionsw = -cyp(i,j)*uvel(i-1,j-1) + dyt(i,j)*uvel(i ,j-1) & + + cxp(i,j)*vvel(i-1,j-1) - dxt(i,j)*vvel(i-1,j ) + tensionse = -cym(i,j)*uvel(i ,j-1) - dyt(i,j)*uvel(i-1,j-1) & + + cxp(i,j)*vvel(i ,j-1) - dxt(i,j)*vvel(i ,j ) + + ! shearing strain rate = 2*e_12 + shearne = -cym(i,j)*vvel(i ,j ) - dyt(i,j)*vvel(i-1,j ) & + - cxm(i,j)*uvel(i ,j ) - dxt(i,j)*uvel(i ,j-1) + shearnw = -cyp(i,j)*vvel(i-1,j ) + dyt(i,j)*vvel(i ,j ) & + - cxm(i,j)*uvel(i-1,j ) - dxt(i,j)*uvel(i-1,j-1) + shearsw = -cyp(i,j)*vvel(i-1,j-1) + dyt(i,j)*vvel(i ,j-1) & + - cxp(i,j)*uvel(i-1,j-1) + dxt(i,j)*uvel(i-1,j ) + shearse = -cym(i,j)*vvel(i ,j-1) - dyt(i,j)*vvel(i-1,j-1) & + - cxp(i,j)*uvel(i ,j-1) + dxt(i,j)*uvel(i ,j ) + + ! Delta (in the denominator of zeta, eta) + Deltane = sqrt(divune**2 + ecci*(tensionne**2 + shearne**2)) + Deltanw = sqrt(divunw**2 + ecci*(tensionnw**2 + shearnw**2)) + Deltasw = sqrt(divusw**2 + ecci*(tensionsw**2 + shearsw**2)) + Deltase = sqrt(divuse**2 + ecci*(tensionse**2 + shearse**2)) + + end subroutine strain_rates + +!======================================================================= + +! Load velocity components into array for boundary updates + + subroutine stack_velocity_field(uvel, vvel, fld2) + + use ice_domain, only: nblocks + + real (kind=dbl_kind), dimension (nx_block,ny_block,max_blocks), intent(in) :: & + uvel , & ! u components of velocity vector + vvel ! v components of velocity vector + + real (kind=dbl_kind), dimension (nx_block,ny_block,2,max_blocks), intent(out) :: & + fld2 ! work array for boundary updates + + ! local variables + + integer (kind=int_kind) :: & + iblk ! block index + + character(len=*), parameter :: subname = '(stack_velocity_field)' + + ! load velocity into array for boundary updates + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + fld2(:,:,1,iblk) = uvel(:,:,iblk) + fld2(:,:,2,iblk) = vvel(:,:,iblk) + enddo + !$OMP END PARALLEL DO + + end subroutine stack_velocity_field + +!======================================================================= + +! Unload velocity components from array after boundary updates + + subroutine unstack_velocity_field(fld2, uvel, vvel) + + use ice_domain, only: nblocks + + real (kind=dbl_kind), dimension (nx_block,ny_block,2,max_blocks), intent(in) :: & + fld2 ! work array for boundary updates + + real (kind=dbl_kind), dimension (nx_block,ny_block,max_blocks), intent(out) :: & + uvel , & ! u components of velocity vector + vvel ! v components of velocity vector + + ! local variables + + integer (kind=int_kind) :: & + iblk ! block index + + character(len=*), parameter :: subname = '(unstack_velocity_field)' + + ! 
Unload velocity from array after boundary updates
+      !$OMP PARALLEL DO PRIVATE(iblk)
+      do iblk = 1, nblocks
+         uvel(:,:,iblk) = fld2(:,:,1,iblk)
+         vvel(:,:,iblk) = fld2(:,:,2,iblk)
+      enddo
+      !$OMP END PARALLEL DO
+
+      end subroutine unstack_velocity_field
+
+!=======================================================================
+
      end module ice_dyn_shared

!=======================================================================
diff --git a/cicecore/cicedynB/dynamics/ice_dyn_vp.F90 b/cicecore/cicedynB/dynamics/ice_dyn_vp.F90
new file mode 100644
index 000000000..773d76440
--- /dev/null
+++ b/cicecore/cicedynB/dynamics/ice_dyn_vp.F90
@@ -0,0 +1,3697 @@
+!=======================================================================
+!
+! Viscous-plastic sea ice dynamics model
+! Computes ice velocity and deformation
+!
+! See:
+!
+! Lemieux, J.‐F., Tremblay, B., Thomas, S., Sedláček, J., and Mysak, L. A. (2008),
+! Using the preconditioned Generalized Minimum RESidual (GMRES) method to solve
+! the sea‐ice momentum equation, J. Geophys. Res., 113, C10004, doi:10.1029/2007JC004680.
+!
+! Hibler, W. D., and Ackley, S. F. (1983), Numerical simulation of the Weddell Sea pack ice,
+! J. Geophys. Res., 88( C5), 2873– 2887, doi:10.1029/JC088iC05p02873.
+!
+! Y. Saad. A Flexible Inner-Outer Preconditioned GMRES Algorithm. SIAM J. Sci. Comput.,
+! 14(2):461–469, 1993. URL: https://doi.org/10.1137/0914028, doi:10.1137/0914028.
+!
+! C. T. Kelley, Iterative Methods for Linear and Nonlinear Equations, SIAM, 1995.
+! (https://www.siam.org/books/textbooks/fr16_book.pdf)
+!
+! Y. Saad, Iterative Methods for Sparse Linear Systems. SIAM, 2003.
+! (http://www-users.cs.umn.edu/~saad/IterMethBook_2ndEd.pdf)
+!
+! Walker, H. F., & Ni, P. (2011). Anderson Acceleration for Fixed-Point Iterations.
+! SIAM Journal on Numerical Analysis, 49(4), 1715–1735. https://doi.org/10.1137/10078356X
+!
+! Fang, H., & Saad, Y. (2009). Two classes of multisecant methods for nonlinear acceleration.
+! Numerical Linear Algebra with Applications, 16(3), 197–221. https://doi.org/10.1002/nla.617
+!
+! Birken, P. (2015) Termination criteria for inexact fixed‐point schemes.
+! Numer. Linear Algebra Appl., 22: 702– 716. doi: 10.1002/nla.1982.
+!
+! authors: JF Lemieux, ECCC, Philippe Blain, ECCC
+!
+
+      module ice_dyn_vp
+
+      use ice_kinds_mod
+      use ice_blocks, only: nx_block, ny_block
+      use ice_boundary, only: ice_halo
+      use ice_communicate, only: my_task, master_task, get_num_procs
+      use ice_constants, only: field_loc_center, field_loc_NEcorner, &
+          field_type_scalar, field_type_vector
+      use ice_constants, only: c0, p027, p055, p111, p166, &
+          p222, p25, p333, p5, c1
+      use ice_domain, only: nblocks, distrb_info
+      use ice_domain_size, only: max_blocks
+      use ice_dyn_shared, only: dyn_prep1, dyn_prep2, dyn_finish, &
+          ecci, cosw, sinw, fcor_blk, uvel_init, &
+          vvel_init, basal_stress_coeff, basalstress, Ktens, &
+          stack_velocity_field, unstack_velocity_field
+      use ice_fileunits, only: nu_diag
+      use ice_flux, only: fm
+      use ice_global_reductions, only: global_sum, global_allreduce_sum
+      use ice_grid, only: dxt, dyt, dxhy, dyhx, cxp, cyp, cxm, cym, uarear
+      use ice_exit, only: abort_ice
+      use icepack_intfc, only: icepack_warnings_flush, icepack_warnings_aborted
+      use icepack_intfc, only: icepack_ice_strength, icepack_query_parameters
+
+      implicit none
+      private
+      public :: implicit_solver, init_vp
+
+      ! namelist parameters
+
+      integer (kind=int_kind), public :: &
+         maxits_nonlin  , & ! max nb of iteration for nonlinear solver
+         dim_fgmres     , & ! 
size of fgmres Krylov subspace + dim_pgmres , & ! size of pgmres Krylov subspace + maxits_fgmres , & ! max nb of iteration for fgmres + maxits_pgmres , & ! max nb of iteration for pgmres + fpfunc_andacc , & ! fixed point function for Anderson acceleration: 1: g(x) = FMGRES(A(x),b(x)), 2: g(x) = x - A(x)x + b(x) + dim_andacc , & ! size of Anderson minimization matrix (number of saved previous residuals) + start_andacc ! acceleration delay factor (acceleration starts at this iteration) + + logical (kind=log_kind), public :: & + monitor_nonlin , & ! print nonlinear residual norm + monitor_fgmres , & ! print fgmres residual norm + monitor_pgmres , & ! print pgmres residual norm + use_mean_vrel ! use mean of previous 2 iterates to compute vrel (see Hibler and Ackley 1983) + + real (kind=dbl_kind), public :: & + reltol_nonlin , & ! nonlinear stopping criterion: reltol_nonlin*res(k=0) + reltol_fgmres , & ! fgmres stopping criterion: reltol_fgmres*res(k) + reltol_pgmres , & ! pgmres stopping criterion: reltol_pgmres*res(k) + damping_andacc , & ! damping factor for Anderson acceleration + reltol_andacc ! relative tolerance for Anderson acceleration + + character (len=char_len), public :: & + precond , & ! preconditioner for fgmres: 'ident' (identity), 'diag' (diagonal), 'pgmres' (Jacobi-preconditioned GMRES) + algo_nonlin , & ! nonlinear algorithm: 'picard' (Picard iteration), 'anderson' (Anderson acceleration) + ortho_type ! type of orthogonalization for FGMRES ('cgs' or 'mgs') + + ! module variables + + integer (kind=int_kind), allocatable :: & + icellt(:) , & ! no. of cells where icetmask = 1 + icellu(:) ! no. of cells where iceumask = 1 + + integer (kind=int_kind), allocatable :: & + indxti(:,:) , & ! compressed index in i-direction + indxtj(:,:) , & ! compressed index in j-direction + indxui(:,:) , & ! compressed index in i-direction + indxuj(:,:) ! compressed index in j-direction + + real (kind=dbl_kind), allocatable :: & + fld2(:,:,:,:) ! work array for boundary updates + +!======================================================================= + + contains + +!======================================================================= + +! Initialize parameters and variables needed for the vp dynamics +! author: Philippe Blain, ECCC + + subroutine init_vp + + use ice_blocks, only: get_block, block + use ice_boundary, only: ice_HaloUpdate + use ice_constants, only: c1, & + field_loc_center, field_type_scalar + use ice_domain, only: blocks_ice, halo_info + use ice_grid, only: tarea, tinyarea + + ! local variables + + integer (kind=int_kind) :: & + i, j, iblk, & + ilo,ihi,jlo,jhi ! beginning and end of physical domain + + type (block) :: & + this_block ! block information for current block + + real (kind=dbl_kind) :: & + min_strain_rate = 2e-09_dbl_kind ! used for recomputing tinyarea + + ! Initialize module variables + allocate(icellt(max_blocks), icellu(max_blocks)) + allocate(indxti(nx_block*ny_block, max_blocks), & + indxtj(nx_block*ny_block, max_blocks), & + indxui(nx_block*ny_block, max_blocks), & + indxuj(nx_block*ny_block, max_blocks)) + allocate(fld2(nx_block,ny_block,2,max_blocks)) + + ! Redefine tinyarea using min_strain_rate + + !$OMP PARALLEL DO PRIVATE(iblk,i,j,ilo,ihi,jlo,jhi,this_block) + do iblk = 1, nblocks + this_block = get_block(blocks_ice(iblk),iblk) + ilo = this_block%ilo + ihi = this_block%ihi + jlo = this_block%jlo + jhi = this_block%jhi + + do j = jlo, jhi + do i = ilo, ihi + tinyarea(i,j,iblk) = min_strain_rate*tarea(i,j,iblk) + enddo + enddo + enddo ! 
iblk + !$OMP END PARALLEL DO + + call ice_HaloUpdate (tinyarea, halo_info, & + field_loc_center, field_type_scalar, & + fillValue=c1) + + end subroutine init_vp + +!======================================================================= + +! Viscous-plastic dynamics driver +! +#ifdef CICE_IN_NEMO +! Wind stress is set during this routine from the values supplied +! via NEMO (unless calc_strair is true). These values are supplied +! rotated on u grid and multiplied by aice. strairxT = 0 in this +! case so operations in dyn_prep1 are pointless but carried out to +! minimise code changes. +#endif +! +! author: JF Lemieux, A. Qaddouri and F. Dupont ECCC + + subroutine implicit_solver (dt) + + use ice_arrays_column, only: Cdn_ocn + use ice_boundary, only: ice_HaloMask, ice_HaloUpdate, & + ice_HaloDestroy, ice_HaloUpdate_stress + use ice_blocks, only: block, get_block, nx_block, ny_block + use ice_domain, only: blocks_ice, halo_info, maskhalo_dyn + use ice_domain_size, only: max_blocks, ncat + use ice_dyn_shared, only: deformations + use ice_flux, only: rdg_conv, rdg_shear, strairxT, strairyT, & + strairx, strairy, uocn, vocn, ss_tltx, ss_tlty, iceumask, fm, & + strtltx, strtlty, strocnx, strocny, strintx, strinty, taubx, tauby, & + strocnxT, strocnyT, strax, stray, & + Tbu, hwater, & + stressp_1, stressp_2, stressp_3, stressp_4, & + stressm_1, stressm_2, stressm_3, stressm_4, & + stress12_1, stress12_2, stress12_3, stress12_4 + use ice_grid, only: tmask, umask, dxt, dyt, cxp, cyp, cxm, cym, & + tarear, to_ugrid, t2ugrid_vector, u2tgrid_vector, & + grid_type + use ice_state, only: aice, vice, vsno, uvel, vvel, divu, shear, & + aice_init, aice0, aicen, vicen, strength + use ice_timers, only: timer_dynamics, timer_bound, & + ice_timer_start, ice_timer_stop + + real (kind=dbl_kind), intent(in) :: & + dt ! time step + + ! local variables + + integer (kind=int_kind) :: & + ntot , & ! size of problem for Anderson + iblk , & ! block index + ilo,ihi,jlo,jhi, & ! beginning and end of physical domain + i, j, ij + + real (kind=dbl_kind), dimension (nx_block,ny_block,max_blocks) :: & + tmass , & ! total mass of ice and snow (kg/m^2) + waterx , & ! for ocean stress calculation, x (m/s) + watery , & ! for ocean stress calculation, y (m/s) + forcex , & ! work array: combined atm stress and ocn tilt, x + forcey , & ! work array: combined atm stress and ocn tilt, y + bxfix , & ! part of bx that is constant during Picard + byfix , & ! part of by that is constant during Picard + Cb , & ! seabed stress coefficient + fpresx , & ! fixed point residual vector, x components: fx = uvel - uprev_k + fpresy , & ! fixed point residual vector, y components: fy = vvel - vprev_k + aiu , & ! ice fraction on u-grid + umass , & ! total mass of ice and snow (u grid) + umassdti ! mass of U-cell/dte (kg/m^2 s) + + real (kind=dbl_kind), dimension(nx_block,ny_block,max_blocks,4):: & + zetaD ! zetaD = 2zeta (viscous coeff) + + logical (kind=log_kind) :: calc_strair + + integer (kind=int_kind), dimension (nx_block,ny_block,max_blocks) :: & + icetmask, & ! ice extent mask (T-cell) + halomask ! generic halo mask + + type (ice_halo) :: & + halo_info_mask ! ghost cell update info for masked halo + + type (block) :: & + this_block ! block information for current block + + real (kind=dbl_kind), allocatable :: & + sol(:) ! solution vector + + character(len=*), parameter :: subname = '(implicit_solver)' + + call ice_timer_start(timer_dynamics) ! dynamics + + !----------------------------------------------------------------- + ! 
Initialize
+      !-----------------------------------------------------------------
+
+      ! This call is needed only if dt changes during runtime.
+!      call set_evp_parameters (dt)
+
+      !-----------------------------------------------------------------
+      ! boundary updates
+      ! commented out because the ghost cells are freshly
+      ! updated after cleanup_itd
+      !-----------------------------------------------------------------
+
+!      call ice_timer_start(timer_bound)
+!      call ice_HaloUpdate (aice,              halo_info, &
+!                           field_loc_center,  field_type_scalar)
+!      call ice_HaloUpdate (vice,              halo_info, &
+!                           field_loc_center,  field_type_scalar)
+!      call ice_HaloUpdate (vsno,              halo_info, &
+!                           field_loc_center,  field_type_scalar)
+!      call ice_timer_stop(timer_bound)
+
+      !$OMP PARALLEL DO PRIVATE(iblk,i,j,ilo,ihi,jlo,jhi,this_block)
+      do iblk = 1, nblocks
+
+         do j = 1, ny_block
+         do i = 1, nx_block
+            rdg_conv (i,j,iblk) = c0
+            rdg_shear(i,j,iblk) = c0
+            divu (i,j,iblk) = c0
+            shear(i,j,iblk) = c0
+         enddo
+         enddo
+
+         !-----------------------------------------------------------------
+         ! preparation for dynamics
+         !-----------------------------------------------------------------
+
+         this_block = get_block(blocks_ice(iblk),iblk)
+         ilo = this_block%ilo
+         ihi = this_block%ihi
+         jlo = this_block%jlo
+         jhi = this_block%jhi
+
+         call dyn_prep1 (nx_block,           ny_block,           &
+                         ilo, ihi,           jlo, jhi,           &
+                         aice    (:,:,iblk), vice    (:,:,iblk), &
+                         vsno    (:,:,iblk), tmask   (:,:,iblk), &
+                         strairxT(:,:,iblk), strairyT(:,:,iblk), &
+                         strairx (:,:,iblk), strairy (:,:,iblk), &
+                         tmass   (:,:,iblk), icetmask(:,:,iblk))
+
+      enddo                     ! iblk
+      !$OMP END PARALLEL DO
+
+      call ice_timer_start(timer_bound)
+      call ice_HaloUpdate (icetmask,          halo_info, &
+                           field_loc_center,  field_type_scalar)
+      call ice_timer_stop(timer_bound)
+
+      !-----------------------------------------------------------------
+      ! convert fields from T to U grid
+      !-----------------------------------------------------------------
+
+      call to_ugrid(tmass,umass)
+      call to_ugrid(aice_init, aiu)
+
+      !----------------------------------------------------------------
+      ! Set wind stress to values supplied via NEMO or other forcing
+      ! This wind stress is rotated on u grid and multiplied by aice
+      !----------------------------------------------------------------
+      call icepack_query_parameters(calc_strair_out=calc_strair)
+      call icepack_warnings_flush(nu_diag)
+      if (icepack_warnings_aborted()) call abort_ice(error_message=subname, &
+         file=__FILE__, line=__LINE__)
+
+      if (.not. calc_strair) then
+         strairx(:,:,:) = strax(:,:,:)
+         strairy(:,:,:) = stray(:,:,:)
+      else
+         call t2ugrid_vector(strairx)
+         call t2ugrid_vector(strairy)
+      endif
+
+! tcraig, tcx, threading here leads to some non-reproducible results and failures in icepack_ice_strength
+! need to do more debugging
+      !$TCXOMP PARALLEL DO PRIVATE(iblk,ilo,ihi,jlo,jhi,this_block)
+      do iblk = 1, nblocks
+
+         !-----------------------------------------------------------------
+         ! 
more preparation for dynamics + !----------------------------------------------------------------- + + this_block = get_block(blocks_ice(iblk),iblk) + ilo = this_block%ilo + ihi = this_block%ihi + jlo = this_block%jlo + jhi = this_block%jhi + + call dyn_prep2 (nx_block, ny_block, & + ilo, ihi, jlo, jhi, & + icellt(iblk), icellu(iblk), & + indxti (:,iblk), indxtj (:,iblk), & + indxui (:,iblk), indxuj (:,iblk), & + aiu (:,:,iblk), umass (:,:,iblk), & + umassdti (:,:,iblk), fcor_blk (:,:,iblk), & + umask (:,:,iblk), & + uocn (:,:,iblk), vocn (:,:,iblk), & + strairx (:,:,iblk), strairy (:,:,iblk), & + ss_tltx (:,:,iblk), ss_tlty (:,:,iblk), & + icetmask (:,:,iblk), iceumask (:,:,iblk), & + fm (:,:,iblk), dt, & + strtltx (:,:,iblk), strtlty (:,:,iblk), & + strocnx (:,:,iblk), strocny (:,:,iblk), & + strintx (:,:,iblk), strinty (:,:,iblk), & + taubx (:,:,iblk), tauby (:,:,iblk), & + waterx (:,:,iblk), watery (:,:,iblk), & + forcex (:,:,iblk), forcey (:,:,iblk), & + stressp_1 (:,:,iblk), stressp_2 (:,:,iblk), & + stressp_3 (:,:,iblk), stressp_4 (:,:,iblk), & + stressm_1 (:,:,iblk), stressm_2 (:,:,iblk), & + stressm_3 (:,:,iblk), stressm_4 (:,:,iblk), & + stress12_1(:,:,iblk), stress12_2(:,:,iblk), & + stress12_3(:,:,iblk), stress12_4(:,:,iblk), & + uvel_init (:,:,iblk), vvel_init (:,:,iblk), & + uvel (:,:,iblk), vvel (:,:,iblk), & + Tbu (:,:,iblk)) + + call calc_bfix (nx_block , ny_block , & + icellu(iblk) , & + indxui (:,iblk), indxuj (:,iblk), & + umassdti (:,:,iblk), & + forcex (:,:,iblk), forcey (:,:,iblk), & + uvel_init (:,:,iblk), vvel_init (:,:,iblk), & + bxfix (:,:,iblk), byfix (:,:,iblk)) + + !----------------------------------------------------------------- + ! ice strength + !----------------------------------------------------------------- + + strength(:,:,iblk) = c0 ! initialize + do ij = 1, icellt(iblk) + i = indxti(ij, iblk) + j = indxtj(ij, iblk) + call icepack_ice_strength (ncat, & + aice (i,j, iblk), & + vice (i,j, iblk), & + aice0 (i,j, iblk), & + aicen (i,j,:,iblk), & + vicen (i,j,:,iblk), & + strength(i,j, iblk)) + enddo ! ij + + enddo ! iblk + !$TCXOMP END PARALLEL DO + + call icepack_warnings_flush(nu_diag) + if (icepack_warnings_aborted()) call abort_ice(error_message=subname, & + file=__FILE__, line=__LINE__) + + call ice_timer_start(timer_bound) + call ice_HaloUpdate (strength, halo_info, & + field_loc_center, field_type_scalar) + call ice_timer_stop(timer_bound) + + if (maskhalo_dyn) then + call ice_timer_start(timer_bound) + halomask = 0 + where (iceumask) halomask = 1 + call ice_HaloUpdate (halomask, halo_info, & + field_loc_center, field_type_scalar) + call ice_timer_stop(timer_bound) + call ice_HaloMask(halo_info_mask, halo_info, halomask) + endif + + ! velocities may have changed in dyn_prep2 + call stack_velocity_field(uvel, vvel, fld2) + call ice_timer_start(timer_bound) + if (maskhalo_dyn) then + call ice_HaloUpdate (fld2, halo_info_mask, & + field_loc_NEcorner, field_type_vector) + else + call ice_HaloUpdate (fld2, halo_info, & + field_loc_NEcorner, field_type_vector) + endif + call ice_timer_stop(timer_bound) + call unstack_velocity_field(fld2, uvel, vvel) + + !----------------------------------------------------------------- + ! 
basal stress coefficients (landfast ice) + !----------------------------------------------------------------- + + if (basalstress) then + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call basal_stress_coeff (nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj(:,iblk), & + vice (:,:,iblk), aice(:,:,iblk), & + hwater(:,:,iblk), Tbu (:,:,iblk)) + enddo + !$OMP END PARALLEL DO + endif + + !----------------------------------------------------------------- + ! calc size of problem (ntot) and allocate solution vector + !----------------------------------------------------------------- + + ntot = 0 + do iblk = 1, nblocks + ntot = ntot + icellu(iblk) + enddo + ntot = 2 * ntot ! times 2 because of u and v + + allocate(sol(ntot)) + + !----------------------------------------------------------------- + ! Start of nonlinear iteration + !----------------------------------------------------------------- + call anderson_solver (icellt , icellu, & + indxti , indxtj, & + indxui , indxuj, & + aiu , ntot , & + waterx , watery, & + bxfix , byfix , & + umassdti, sol , & + fpresx , fpresy, & + zetaD , Cb , & + halo_info_mask) + !----------------------------------------------------------------- + ! End of nonlinear iteration + !----------------------------------------------------------------- + + deallocate(sol) + + if (maskhalo_dyn) call ice_HaloDestroy(halo_info_mask) + + !----------------------------------------------------------------- + ! Compute stresses + !----------------------------------------------------------------- + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call stress_vp (nx_block , ny_block , & + icellt(iblk) , & + indxti (:,iblk), indxtj (:,iblk), & + uvel (:,:,iblk), vvel (:,:,iblk), & + dxt (:,:,iblk), dyt (:,:,iblk), & + cxp (:,:,iblk), cyp (:,:,iblk), & + cxm (:,:,iblk), cym (:,:,iblk), & + zetaD (:,:,iblk,:), & + stressp_1 (:,:,iblk), stressp_2 (:,:,iblk), & + stressp_3 (:,:,iblk), stressp_4 (:,:,iblk), & + stressm_1 (:,:,iblk), stressm_2 (:,:,iblk), & + stressm_3 (:,:,iblk), stressm_4 (:,:,iblk), & + stress12_1(:,:,iblk), stress12_2(:,:,iblk), & + stress12_3(:,:,iblk), stress12_4(:,:,iblk)) + enddo ! iblk + !$OMP END PARALLEL DO + + !----------------------------------------------------------------- + ! Compute deformations + !----------------------------------------------------------------- + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call deformations (nx_block , ny_block , & + icellt(iblk) , & + indxti (:,iblk), indxtj (:,iblk), & + uvel (:,:,iblk), vvel (:,:,iblk), & + dxt (:,:,iblk), dyt (:,:,iblk), & + cxp (:,:,iblk), cyp (:,:,iblk), & + cxm (:,:,iblk), cym (:,:,iblk), & + tarear (:,:,iblk), & + shear (:,:,iblk), divu (:,:,iblk), & + rdg_conv (:,:,iblk), rdg_shear (:,:,iblk)) + enddo + !$OMP END PARALLEL DO + + !----------------------------------------------------------------- + ! Compute seabed stress (diagnostic) + !----------------------------------------------------------------- + if (basalstress) then + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call calc_seabed_stress (nx_block , ny_block , & + icellu(iblk) , & + indxui (:,iblk), indxuj (:,iblk), & + uvel (:,:,iblk), vvel (:,:,iblk), & + Cb (:,:,iblk), & + taubx (:,:,iblk), tauby (:,:,iblk)) + enddo + !$OMP END PARALLEL DO + endif + + ! Force symmetry across the tripole seam + if (trim(grid_type) == 'tripole') then + if (maskhalo_dyn) then + !------------------------------------------------------- + ! set halomask to zero because ice_HaloMask always keeps + ! 
local copies AND tripole zipper communication + !------------------------------------------------------- + halomask = 0 + call ice_HaloMask(halo_info_mask, halo_info, halomask) + + call ice_HaloUpdate_stress(stressp_1, stressp_3, halo_info_mask, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressp_3, stressp_1, halo_info_mask, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressp_2, stressp_4, halo_info_mask, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressp_4, stressp_2, halo_info_mask, & + field_loc_center, field_type_scalar) + + call ice_HaloUpdate_stress(stressm_1, stressm_3, halo_info_mask, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressm_3, stressm_1, halo_info_mask, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressm_2, stressm_4, halo_info_mask, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressm_4, stressm_2, halo_info_mask, & + field_loc_center, field_type_scalar) + + call ice_HaloUpdate_stress(stress12_1, stress12_3, halo_info_mask, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stress12_3, stress12_1, halo_info_mask, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stress12_2, stress12_4, halo_info_mask, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stress12_4, stress12_2, halo_info_mask, & + field_loc_center, field_type_scalar) + + call ice_HaloDestroy(halo_info_mask) + else + call ice_HaloUpdate_stress(stressp_1, stressp_3, halo_info, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressp_3, stressp_1, halo_info, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressp_2, stressp_4, halo_info, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressp_4, stressp_2, halo_info, & + field_loc_center, field_type_scalar) + + call ice_HaloUpdate_stress(stressm_1, stressm_3, halo_info, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressm_3, stressm_1, halo_info, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressm_2, stressm_4, halo_info, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stressm_4, stressm_2, halo_info, & + field_loc_center, field_type_scalar) + + call ice_HaloUpdate_stress(stress12_1, stress12_3, halo_info, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stress12_3, stress12_1, halo_info, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stress12_2, stress12_4, halo_info, & + field_loc_center, field_type_scalar) + call ice_HaloUpdate_stress(stress12_4, stress12_2, halo_info, & + field_loc_center, field_type_scalar) + endif ! maskhalo + endif ! tripole + + !----------------------------------------------------------------- + ! ice-ocean stress + !----------------------------------------------------------------- + + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + + call dyn_finish & + (nx_block, ny_block, & + icellu (iblk), Cdn_ocn (:,:,iblk), & + indxui (:,iblk), indxuj (:,iblk), & + uvel (:,:,iblk), vvel (:,:,iblk), & + uocn (:,:,iblk), vocn (:,:,iblk), & + aiu (:,:,iblk), fm (:,:,iblk), & + strintx (:,:,iblk), strinty (:,:,iblk), & + strairx (:,:,iblk), strairy (:,:,iblk), & + strocnx (:,:,iblk), strocny (:,:,iblk), & + strocnxT(:,:,iblk), strocnyT(:,:,iblk)) + + enddo + !$OMP END PARALLEL DO + + call u2tgrid_vector(strocnxT) ! 
shift
+      call u2tgrid_vector(strocnyT)
+
+      call ice_timer_stop(timer_dynamics)    ! dynamics
+
+      end subroutine implicit_solver
+
+!=======================================================================
+
+! Solve the nonlinear equation F(u,v) = 0, where
+! F(u,v) := A(u,v) * (u,v) - b(u,v)
+! using Anderson acceleration (accelerated fixed point (Picard) iteration)
+!
+! author: JF Lemieux, A. Qaddouri, F. Dupont and P. Blain ECCC
+!
+! Anderson algorithm adapted from:
+! H. F. Walker, “Anderson Acceleration: Algorithms and Implementations”
+! [Online]. Available: https://users.wpi.edu/~walker/Papers/anderson_accn_algs_imps.pdf
+
+      subroutine anderson_solver (icellt  , icellu, &
+                                  indxti  , indxtj, &
+                                  indxui  , indxuj, &
+                                  aiu     , ntot  , &
+                                  waterx  , watery, &
+                                  bxfix   , byfix , &
+                                  umassdti, sol   , &
+                                  fpresx  , fpresy, &
+                                  zetaD   , Cb    , &
+                                  halo_info_mask)
+
+      use ice_arrays_column, only: Cdn_ocn
+      use ice_blocks, only: nx_block, ny_block
+      use ice_boundary, only: ice_HaloUpdate
+      use ice_constants, only: c1
+      use ice_domain, only: maskhalo_dyn, halo_info
+      use ice_domain_size, only: max_blocks
+      use ice_flux, only: uocn, vocn, fm, Tbu
+      use ice_grid, only: dxt, dyt, dxhy, dyhx, cxp, cyp, cxm, cym, &
+                          uarear, tinyarea
+      use ice_state, only: uvel, vvel, strength
+      use ice_timers, only: ice_timer_start, ice_timer_stop, timer_bound
+
+      integer (kind=int_kind), intent(in) :: &
+         ntot         ! size of problem for Anderson
+
+      integer (kind=int_kind), dimension(max_blocks), intent(in) :: &
+         icellt  , & ! no. of cells where icetmask = 1
+         icellu      ! no. of cells where iceumask = 1
+
+      integer (kind=int_kind), dimension (nx_block*ny_block, max_blocks), intent(in) :: &
+         indxti  , & ! compressed index in i-direction
+         indxtj  , & ! compressed index in j-direction
+         indxui  , & ! compressed index in i-direction
+         indxuj      ! compressed index in j-direction
+
+      real (kind=dbl_kind), dimension (nx_block,ny_block,max_blocks), intent(in) :: &
+         aiu      , & ! ice fraction on u-grid
+         waterx   , & ! for ocean stress calculation, x (m/s)
+         watery   , & ! for ocean stress calculation, y (m/s)
+         bxfix    , & ! part of bx that is constant during Picard
+         byfix    , & ! part of by that is constant during Picard
+         umassdti     ! mass of U-cell/dte (kg/m^2 s)
+
+      real (kind=dbl_kind), dimension(nx_block,ny_block,max_blocks,4), intent(out) :: &
+         zetaD        ! zetaD = 2*zeta (viscous coeff)
+
+      type (ice_halo), intent(in) :: &
+         halo_info_mask ! ghost cell update info for masked halo
+
+      real (kind=dbl_kind), dimension (nx_block,ny_block,max_blocks), intent(inout) :: &
+         fpresx  , & ! fixed point residual vector, x components: fx = uvel - uprev_k
+         fpresy  , & ! fixed point residual vector, y components: fy = vvel - vprev_k
+         Cb          ! seabed stress coefficient
+
+      real (kind=dbl_kind), dimension (ntot), intent(inout) :: &
+         sol          ! current approximate solution
+
+      ! local variables
+
+      integer (kind=int_kind) :: &
+         it_nl      , & ! nonlinear loop iteration index
+         res_num    , & ! current number of stored residuals
+         j          , & ! iteration index for QR update
+         iblk       , & ! block index
+         nbiter         ! number of FGMRES iterations performed
+
+      integer (kind=int_kind), parameter :: &
+         inc = 1        ! increment value for BLAS calls
+
+      real (kind=dbl_kind), dimension (nx_block,ny_block,max_blocks) :: &
+         uprev_k  , & ! uvel at previous Picard iteration
+         vprev_k  , & ! vvel at previous Picard iteration
+         ulin     , & ! uvel to linearize vrel
+         vlin     , & ! vvel to linearize vrel
+         vrel     , & ! coeff for tauw
+         bx       , & ! b vector
+         by       , & ! b vector
+         diagx    , & ! 
Diagonal (x component) of the matrix A
+         diagy    , & ! Diagonal (y component) of the matrix A
+         Au       , & ! matvec, Fx = bx - Au
+         Av       , & ! matvec, Fy = by - Av
+         Fx       , & ! x residual vector, Fx = bx - Au
+         Fy       , & ! y residual vector, Fy = by - Av
+         solx     , & ! solution of FGMRES (x components)
+         soly         ! solution of FGMRES (y components)
+
+      real (kind=dbl_kind), dimension(nx_block,ny_block,8):: &
+         stress_Pr, & ! x,y-derivatives of the replacement pressure
+         diag_rheo    ! contributions of the rheology term to the diagonal
+
+      real (kind=dbl_kind), dimension (max_blocks) :: &
+         L2norm       ! array used to compute l^2 norm of grid function
+
+      real (kind=dbl_kind), dimension (ntot) :: &
+         res        , & ! current residual
+         res_old    , & ! previous residual
+         res_diff   , & ! difference between current and previous residuals
+         fpfunc     , & ! current value of fixed point function
+         fpfunc_old , & ! previous value of fixed point function
+         tmp            ! temporary vector for BLAS calls
+
+      real (kind=dbl_kind), dimension(ntot,dim_andacc) :: &
+         Q        , & ! Q factor for QR factorization of F (residuals) matrix
+         G_diff       ! Matrix containing the differences of g(x) (fixed point function) evaluations
+
+      real (kind=dbl_kind), dimension(dim_andacc,dim_andacc) :: &
+         R            ! R factor for QR factorization of F (residuals) matrix
+
+      real (kind=dbl_kind), dimension(dim_andacc) :: &
+         rhs_tri  , & ! right hand side vector for matrix-vector product
+         coeffs       ! coeffs used to combine previous solutions
+
+      real (kind=dbl_kind) :: &
+         ! tol        , & ! tolerance for fixed point convergence: reltol_andacc * (initial fixed point residual norm) [unused for now]
+         tol_nl     , & ! tolerance for nonlinear convergence: reltol_nonlin * (initial nonlinear residual norm)
+         fpres_norm , & ! norm of current fixed point residual : f(x) = g(x) - x
+         prog_norm  , & ! norm of difference between current and previous solution
+         nlres_norm     ! norm of current nonlinear residual : F(x) = A(x)x - b(x)
+
+#ifdef USE_LAPACK
+      real (kind=dbl_kind) :: &
+         ddot, dnrm2    ! external BLAS functions
+#endif
+
+      character(len=*), parameter :: subname = '(anderson_solver)'
+
+      ! Initialization
+      res_num = 0
+      L2norm = c0
+
+      !$OMP PARALLEL DO PRIVATE(iblk)
+      do iblk = 1, nblocks
+         uprev_k(:,:,iblk) = uvel(:,:,iblk)
+         vprev_k(:,:,iblk) = vvel(:,:,iblk)
+      enddo
+      !$OMP END PARALLEL DO
+
+      ! Start iterations
+      do it_nl = 0, maxits_nonlin        ! nonlinear iteration loop
+         ! Compute quantities needed for computing PDE residual and g(x) (fixed point map)
+         !-----------------------------------------------------------------
+         ! 
Calc zetaD, dPr/dx, dPr/dy, Cb and vrel = f(uprev_k, vprev_k) + !----------------------------------------------------------------- + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + + if (use_mean_vrel) then + ulin(:,:,iblk) = p5*uprev_k(:,:,iblk) + p5*uvel(:,:,iblk) + vlin(:,:,iblk) = p5*vprev_k(:,:,iblk) + p5*vvel(:,:,iblk) + else + ulin(:,:,iblk) = uvel(:,:,iblk) + vlin(:,:,iblk) = vvel(:,:,iblk) + endif + uprev_k(:,:,iblk) = uvel(:,:,iblk) + vprev_k(:,:,iblk) = vvel(:,:,iblk) + + call calc_zeta_dPr (nx_block , ny_block , & + icellt (iblk), & + indxti (:,iblk), indxtj (:,iblk), & + uprev_k (:,:,iblk), vprev_k (:,:,iblk), & + dxt (:,:,iblk), dyt (:,:,iblk), & + dxhy (:,:,iblk), dyhx (:,:,iblk), & + cxp (:,:,iblk), cyp (:,:,iblk), & + cxm (:,:,iblk), cym (:,:,iblk), & + tinyarea (:,:,iblk), & + strength (:,:,iblk), zetaD (:,:,iblk,:), & + stress_Pr (:,:,:)) + + call calc_vrel_Cb (nx_block , ny_block , & + icellu (iblk), Cdn_ocn (:,:,iblk), & + indxui (:,iblk), indxuj (:,iblk), & + aiu (:,:,iblk), Tbu (:,:,iblk), & + uocn (:,:,iblk), vocn (:,:,iblk), & + ulin (:,:,iblk), vlin (:,:,iblk), & + vrel (:,:,iblk), Cb (:,:,iblk)) + + ! prepare b vector (RHS) + call calc_bvec (nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj (:,iblk), & + stress_Pr (:,:,:), uarear (:,:,iblk), & + waterx (:,:,iblk), watery (:,:,iblk), & + bxfix (:,:,iblk), byfix (:,:,iblk), & + bx (:,:,iblk), by (:,:,iblk), & + vrel (:,:,iblk)) + + ! Compute nonlinear residual norm (PDE residual) + call matvec (nx_block , ny_block , & + icellu (iblk) , icellt (iblk), & + indxui (:,iblk) , indxuj (:,iblk), & + indxti (:,iblk) , indxtj (:,iblk), & + dxt (:,:,iblk) , dyt (:,:,iblk), & + dxhy (:,:,iblk) , dyhx (:,:,iblk), & + cxp (:,:,iblk) , cyp (:,:,iblk), & + cxm (:,:,iblk) , cym (:,:,iblk), & + uprev_k (:,:,iblk) , vprev_k (:,:,iblk), & + vrel (:,:,iblk) , Cb (:,:,iblk), & + zetaD (:,:,iblk,:), & + umassdti (:,:,iblk) , fm (:,:,iblk), & + uarear (:,:,iblk) , & + Au (:,:,iblk) , Av (:,:,iblk)) + call residual_vec (nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj (:,iblk), & + bx (:,:,iblk), by (:,:,iblk), & + Au (:,:,iblk), Av (:,:,iblk), & + Fx (:,:,iblk), Fy (:,:,iblk), & + L2norm (iblk)) + enddo + !$OMP END PARALLEL DO + nlres_norm = sqrt(global_sum(sum(L2norm), distrb_info)) + if (my_task == master_task .and. monitor_nonlin) then + write(nu_diag, '(a,i4,a,d26.16)') "monitor_nonlin: iter_nonlin= ", it_nl, & + " nonlin_res_L2norm= ", nlres_norm + endif + ! Compute relative tolerance at first iteration + if (it_nl == 0) then + tol_nl = reltol_nonlin*nlres_norm + endif + + ! Check for nonlinear convergence + if (nlres_norm < tol_nl) then + exit + endif + + ! Put initial guess for FGMRES in solx,soly and sol (needed for anderson) + solx = uprev_k + soly = vprev_k + call arrays_to_vec (nx_block , ny_block , & + nblocks , max_blocks , & + icellu (:), ntot , & + indxui (:,:), indxuj (:,:), & + uprev_k (:,:,:), vprev_k (:,:,:), & + sol (:)) + + ! Compute fixed point map g(x) + if (fpfunc_andacc == 1) then + ! g_1(x) = FGMRES(A(x), b(x)) + + ! Prepare diagonal for preconditioner + if (precond == 'diag' .or. precond == 'pgmres') then + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + ! 
first compute diagonal contributions due to rheology term + call formDiag_step1 (nx_block , ny_block , & + icellu (iblk) , & + indxui (:,iblk) , indxuj(:,iblk), & + dxt (:,:,iblk) , dyt (:,:,iblk), & + dxhy (:,:,iblk) , dyhx(:,:,iblk), & + cxp (:,:,iblk) , cyp (:,:,iblk), & + cxm (:,:,iblk) , cym (:,:,iblk), & + zetaD (:,:,iblk,:), diag_rheo(:,:,:)) + ! second compute the full diagonal + call formDiag_step2 (nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj (:,iblk), & + diag_rheo (:,:,:), vrel (:,:,iblk), & + umassdti (:,:,iblk), & + uarear (:,:,iblk), Cb (:,:,iblk), & + diagx (:,:,iblk), diagy (:,:,iblk)) + enddo + !$OMP END PARALLEL DO + endif + + ! FGMRES linear solver + call fgmres (zetaD , & + Cb , vrel , & + umassdti , & + halo_info_mask, & + bx , by , & + diagx , diagy , & + reltol_fgmres , dim_fgmres, & + maxits_fgmres , & + solx , soly , & + nbiter) + ! Put FGMRES solution solx,soly in fpfunc vector (needed for Anderson) + call arrays_to_vec (nx_block , ny_block , & + nblocks , max_blocks , & + icellu (:), ntot , & + indxui (:,:), indxuj (:,:), & + solx (:,:,:), soly (:,:,:), & + fpfunc (:)) + elseif (fpfunc_andacc == 2) then + ! g_2(x) = x - A(x)x + b(x) = x - F(x) + call abort_ice(error_message=subname // " Fixed point function g_2(x) not yet implemented (fpfunc_andacc = 2)" , & + file=__FILE__, line=__LINE__) + endif + + ! Compute fixed point residual f(x) = g(x) - x + res = fpfunc - sol +#ifdef USE_LAPACK + fpres_norm = global_sum(dnrm2(size(res), res, inc)**2, distrb_info) +#else + call vec_to_arrays (nx_block , ny_block , & + nblocks , max_blocks , & + icellu (:), ntot , & + indxui (:,:), indxuj(:,:) , & + res (:), & + fpresx (:,:,:), fpresy (:,:,:)) + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call calc_L2norm_squared (nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj (:,iblk), & + fpresx(:,:,iblk), fpresy(:,:,iblk), & + L2norm (iblk)) + enddo + !$OMP END PARALLEL DO + fpres_norm = sqrt(global_sum(sum(L2norm), distrb_info)) +#endif + if (my_task == master_task .and. monitor_nonlin) then + write(nu_diag, '(a,i4,a,d26.16)') "monitor_nonlin: iter_nonlin= ", it_nl, & + " fixed_point_res_L2norm= ", fpres_norm + endif + + ! Not used for now (only nonlinear residual is checked) + ! ! Store initial residual norm + ! if (it_nl == 0) then + ! tol = reltol_andacc*fpres_norm + ! endif + ! + ! ! Check residual + ! if (fpres_norm < tol) then + ! exit + ! endif + + if (dim_andacc == 0 .or. it_nl < start_andacc) then + ! Simple fixed point (Picard) iteration in this case + sol = fpfunc + else +#ifdef USE_LAPACK + ! Begin Anderson acceleration + if (get_num_procs() > 1) then + ! Anderson solver is not yet parallelized; abort + if (my_task == master_task) then + call abort_ice(error_message=subname // " Anderson solver (algo_nonlin = 'anderson') is not yet parallelized, and nprocs > 1 " , & + file=__FILE__, line=__LINE__) + endif + endif + if (it_nl > start_andacc) then + ! Update residual difference vector + res_diff = res - res_old + ! Update fixed point function difference matrix + if (res_num < dim_andacc) then + ! Add column + G_diff(:,res_num+1) = fpfunc - fpfunc_old + else + ! Delete first column and add column + G_diff(:,1:res_num-1) = G_diff(:,2:res_num) + G_diff(:,res_num) = fpfunc - fpfunc_old + endif + res_num = res_num + 1 + endif + res_old = res + fpfunc_old = fpfunc + if (res_num == 0) then + sol = fpfunc + else + if (res_num == 1) then + ! 
Initialize QR factorization + R(1,1) = dnrm2(size(res_diff), res_diff, inc) + Q(:,1) = res_diff/R(1,1) + else + if (res_num > dim_andacc) then + ! Update factorization since 1st column was deleted + call qr_delete(Q,R) + res_num = res_num - 1 + endif + ! Update QR factorization for new column + do j = 1, res_num - 1 + R(j,res_num) = ddot(ntot, Q(:,j), inc, res_diff, inc) + res_diff = res_diff - R(j,res_num) * Q(:,j) + enddo + R(res_num, res_num) = dnrm2(size(res_diff) ,res_diff, inc) + Q(:,res_num) = res_diff / R(res_num, res_num) + endif + ! TODO: here, drop more columns to improve conditioning + ! if (droptol) then + + ! endif + ! Solve least square problem for coefficients + ! 1. Compute rhs_tri = Q^T * res + call dgemv ('t', size(Q,1), res_num, c1, Q(:,1:res_num), size(Q,1), res, inc, c0, rhs_tri, inc) + ! 2. Solve R*coeffs = rhs_tri, put result in rhs_tri + call dtrsv ('u', 'n', 'n', res_num, R(1:res_num,1:res_num), res_num, rhs_tri, inc) + coeffs = rhs_tri + ! Update approximate solution: x = fpfunc - G_diff*coeffs, put result in fpfunc + call dgemv ('n', size(G_diff,1), res_num, -c1, G_diff(:,1:res_num), size(G_diff,1), coeffs, inc, c1, fpfunc, inc) + sol = fpfunc + ! Apply damping + if (damping_andacc > 0 .and. damping_andacc /= 1) then + ! x = x - (1-beta) (res - Q*R*coeffs) + + ! tmp = R*coeffs + call dgemv ('n', res_num, res_num, c1, R(1:res_num,1:res_num), res_num, coeffs, inc, c0, tmp, inc) + ! res = res - Q*tmp + call dgemv ('n', size(Q,1), res_num, -c1, Q(:,1:res_num), size(Q,1), tmp, inc, c1, res, inc) + ! x = x - (1-beta)*res + sol = sol - (1-damping_andacc)*res + endif + endif +#else + ! Anderson solver is not usable without LAPACK; abort + call abort_ice(error_message=subname // " CICE was not compiled with LAPACK, and Anderson solver was chosen (algo_nonlin = 'anderson')" , & + file=__FILE__, line=__LINE__) +#endif + endif + + !----------------------------------------------------------------------- + ! Put vector sol in uvel and vvel arrays + !----------------------------------------------------------------------- + call vec_to_arrays (nx_block , ny_block , & + nblocks , max_blocks , & + icellu (:), ntot , & + indxui (:,:), indxuj (:,:), & + sol (:), & + uvel (:,:,:), vvel (:,:,:)) + + ! Do halo update so that halo cells contain up to date info for advection + call stack_velocity_field(uvel, vvel, fld2) + call ice_timer_start(timer_bound) + if (maskhalo_dyn) then + call ice_HaloUpdate (fld2, halo_info_mask, & + field_loc_NEcorner, field_type_vector) + else + call ice_HaloUpdate (fld2, halo_info, & + field_loc_NEcorner, field_type_vector) + endif + call ice_timer_stop(timer_bound) + call unstack_velocity_field(fld2, uvel, vvel) + + ! Compute "progress" residual norm + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + fpresx(:,:,iblk) = uvel(:,:,iblk) - uprev_k(:,:,iblk) + fpresy(:,:,iblk) = vvel(:,:,iblk) - vprev_k(:,:,iblk) + call calc_L2norm_squared (nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj (:,iblk), & + fpresx(:,:,iblk), fpresy(:,:,iblk), & + L2norm (iblk)) + enddo + !$OMP END PARALLEL DO + prog_norm = sqrt(global_sum(sum(L2norm), distrb_info)) + if (my_task == master_task .and. monitor_nonlin) then + write(nu_diag, '(a,i4,a,d26.16)') "monitor_nonlin: iter_nonlin= ", it_nl, & + " progress_res_L2norm= ", prog_norm + endif + + enddo ! nonlinear iteration loop + + end subroutine anderson_solver + +!======================================================================= + +! 
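A sketch of what the loop above implements, written in the code's variable names and in the notation of the Walker reference cited below for qr_delete. Each pass of the nonlinear loop is one Picard step,

\begin{equation}
A\bigl(u^{k-1}\bigr)\, u^{k} = b\bigl(u^{k-1}\bigr),
\end{equation}

solved by FGMRES, with the rheology coefficients (zetaD) evaluated at uprev_k and the drag and seabed coefficients (vrel, Cb) at ulin, i.e. at u^{k-1} or at the mean of the last two iterates when use_mean_vrel is true. When Anderson acceleration is active, the dgemv/dtrsv calls above solve the small least-squares problem

\begin{equation}
\gamma_k = \underset{\gamma}{\operatorname{argmin}} \bigl\lVert f_k - \mathcal{F}_k\,\gamma \bigr\rVert_2
\quad\Longleftrightarrow\quad
R\,\gamma_k = Q^{T} f_k ,
\end{equation}

where f_k = g(x_k) - x_k is the fixed point residual (res) and \mathcal{F}_k = Q R holds the recent residual differences, and then form the (optionally damped) update

\begin{equation}
x_{k+1} = g(x_k) - \mathcal{G}_k\,\gamma_k - (1-\beta)\,\bigl(f_k - Q R\,\gamma_k\bigr),
\end{equation}

with \mathcal{G}_k the matrix of fixed point function differences (G_diff) and \beta = damping_andacc; the last term is dropped when no damping is requested.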
Computes the viscous coefficients (in fact zetaD=2*zeta) and dPr/dx, dPr/dy + + subroutine calc_zeta_dPr (nx_block, ny_block, & + icellt , & + indxti , indxtj , & + uvel , vvel , & + dxt , dyt , & + dxhy , dyhx , & + cxp , cyp , & + cxm , cym , & + tinyarea, & + strength, zetaD , & + stPr) + + use ice_dyn_shared, only: strain_rates + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellt ! no. of cells where icetmask = 1 + + integer (kind=int_kind), dimension (nx_block*ny_block), intent(in) :: & + indxti , & ! compressed index in i-direction + indxtj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + strength , & ! ice strength (N/m) + uvel , & ! x-component of velocity (m/s) + vvel , & ! y-component of velocity (m/s) + dxt , & ! width of T-cell through the middle (m) + dyt , & ! height of T-cell through the middle (m) + dxhy , & ! 0.5*(HTE - HTE) + dyhx , & ! 0.5*(HTN - HTN) + cyp , & ! 1.5*HTE - 0.5*HTE + cxp , & ! 1.5*HTN - 0.5*HTN + cym , & ! 0.5*HTE - 1.5*HTE + cxm , & ! 0.5*HTN - 1.5*HTN + tinyarea ! min_strain_rate*tarea + + real (kind=dbl_kind), dimension(nx_block,ny_block,4), intent(out) :: & + zetaD ! 2*zeta + + real (kind=dbl_kind), dimension(nx_block,ny_block,8), intent(out) :: & + stPr ! stress combinations from replacement pressure + + ! local variables + + integer (kind=int_kind) :: & + i, j, ij + + real (kind=dbl_kind) :: & + divune, divunw, divuse, divusw , & ! divergence + tensionne, tensionnw, tensionse, tensionsw , & ! tension + shearne, shearnw, shearse, shearsw , & ! shearing + Deltane, Deltanw, Deltase, Deltasw , & ! Delt + ssigpn, ssigps, ssigpe, ssigpw, ssigp1, ssigp2, & + csigpne, csigpnw, csigpsw, csigpse , & + stressp_1, stressp_2, stressp_3, stressp_4 , & + strp_tmp + + logical :: capping ! of the viscous coeff + + character(len=*), parameter :: subname = '(calc_zeta_dPr)' + + ! Initialize + + capping = .false. + + ! Initialize stPr and zetaD to zero (for cells where icetmask is false) + stPr = c0 + zetaD = c0 + + do ij = 1, icellt + i = indxti(ij) + j = indxtj(ij) + + !----------------------------------------------------------------- + ! strain rates + ! NOTE these are actually strain rates * area (m^2/s) + !----------------------------------------------------------------- + call strain_rates (nx_block , ny_block , & + i , j , & + uvel , vvel , & + dxt , dyt , & + cxp , cyp , & + cxm , cym , & + divune , divunw , & + divuse , divusw , & + tensionne, tensionnw, & + tensionse, tensionsw, & + shearne , shearnw , & + shearse , shearsw , & + Deltane , Deltanw , & + Deltase , Deltasw) + + if (capping) then + zetaD(i,j,1) = strength(i,j)/max(Deltane,tinyarea(i,j)) + zetaD(i,j,2) = strength(i,j)/max(Deltanw,tinyarea(i,j)) + zetaD(i,j,3) = strength(i,j)/max(Deltasw,tinyarea(i,j)) + zetaD(i,j,4) = strength(i,j)/max(Deltase,tinyarea(i,j)) + else + zetaD(i,j,1) = strength(i,j)/(Deltane + tinyarea(i,j)) + zetaD(i,j,2) = strength(i,j)/(Deltanw + tinyarea(i,j)) + zetaD(i,j,3) = strength(i,j)/(Deltasw + tinyarea(i,j)) + zetaD(i,j,4) = strength(i,j)/(Deltase + tinyarea(i,j)) + endif + + !----------------------------------------------------------------- + ! the stresses ! kg/s^2 + ! 
(1) northeast, (2) northwest, (3) southwest, (4) southeast + !----------------------------------------------------------------- + + stressp_1 = -zetaD(i,j,1)*(Deltane*(c1-Ktens)) + stressp_2 = -zetaD(i,j,2)*(Deltanw*(c1-Ktens)) + stressp_3 = -zetaD(i,j,3)*(Deltasw*(c1-Ktens)) + stressp_4 = -zetaD(i,j,4)*(Deltase*(c1-Ktens)) + + !----------------------------------------------------------------- + ! combinations of the Pr related stresses for the momentum equation ! kg/s^2 + !----------------------------------------------------------------- + + ssigpn = stressp_1 + stressp_2 + ssigps = stressp_3 + stressp_4 + ssigpe = stressp_1 + stressp_4 + ssigpw = stressp_2 + stressp_3 + ssigp1 =(stressp_1 + stressp_3)*p055 + ssigp2 =(stressp_2 + stressp_4)*p055 + + csigpne = p111*stressp_1 + ssigp2 + p027*stressp_3 + csigpnw = p111*stressp_2 + ssigp1 + p027*stressp_4 + csigpsw = p111*stressp_3 + ssigp2 + p027*stressp_1 + csigpse = p111*stressp_4 + ssigp1 + p027*stressp_2 + + !----------------------------------------------------------------- + ! for dF/dx (u momentum) + !----------------------------------------------------------------- + strp_tmp = p25*dyt(i,j)*(p333*ssigpn + p166*ssigps) + + ! northeast (i,j) + stPr(i,j,1) = -strp_tmp & + + dxhy(i,j)*(-csigpne) + + ! northwest (i+1,j) + stPr(i,j,2) = strp_tmp & + + dxhy(i,j)*(-csigpnw) + + strp_tmp = p25*dyt(i,j)*(p333*ssigps + p166*ssigpn) + + ! southeast (i,j+1) + stPr(i,j,3) = -strp_tmp & + + dxhy(i,j)*(-csigpse) + + ! southwest (i+1,j+1) + stPr(i,j,4) = strp_tmp & + + dxhy(i,j)*(-csigpsw) + + !----------------------------------------------------------------- + ! for dF/dy (v momentum) + !----------------------------------------------------------------- + strp_tmp = p25*dxt(i,j)*(p333*ssigpe + p166*ssigpw) + + ! northeast (i,j) + stPr(i,j,5) = -strp_tmp & + - dyhx(i,j)*(csigpne) + + ! southeast (i,j+1) + stPr(i,j,6) = strp_tmp & + - dyhx(i,j)*(csigpse) + + strp_tmp = p25*dxt(i,j)*(p333*ssigpw + p166*ssigpe) + + ! northwest (i+1,j) + stPr(i,j,7) = -strp_tmp & + - dyhx(i,j)*(csigpnw) + + ! southwest (i+1,j+1) + stPr(i,j,8) = strp_tmp & + - dyhx(i,j)*(csigpsw) + + enddo ! ij + + end subroutine calc_zeta_dPr + +!======================================================================= + +! Computes the VP stresses (as diagnostic) + + subroutine stress_vp (nx_block , ny_block , & + icellt , & + indxti , indxtj , & + uvel , vvel , & + dxt , dyt , & + cxp , cyp , & + cxm , cym , & + zetaD , & + stressp_1 , stressp_2 , & + stressp_3 , stressp_4 , & + stressm_1 , stressm_2 , & + stressm_3 , stressm_4 , & + stress12_1, stress12_2, & + stress12_3, stress12_4) + + use ice_dyn_shared, only: strain_rates + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellt ! no. of cells where icetmask = 1 + + integer (kind=int_kind), dimension (nx_block*ny_block), intent(in) :: & + indxti , & ! compressed index in i-direction + indxtj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + uvel , & ! x-component of velocity (m/s) + vvel , & ! y-component of velocity (m/s) + dxt , & ! width of T-cell through the middle (m) + dyt , & ! height of T-cell through the middle (m) + cyp , & ! 1.5*HTE - 0.5*HTE + cxp , & ! 1.5*HTN - 0.5*HTN + cym , & ! 0.5*HTE - 1.5*HTE + cxm ! 0.5*HTN - 1.5*HTN + + real (kind=dbl_kind), dimension(nx_block,ny_block,4), intent(in) :: & + zetaD ! 
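In symbols, calc_zeta_dPr above evaluates (a sketch, with \Delta and \Delta_{\min} both carrying the factor of cell area that the strain rates carry, tinyarea = min_strain_rate*tarea):

\begin{equation}
\zeta_D \equiv 2\zeta =
\begin{cases}
P / \max(\Delta,\ \Delta_{\min}) & \text{capping = true} \\
P / (\Delta + \Delta_{\min})     & \text{capping = false (as set above)},
\end{cases}
\qquad
\sigma_P = -\,\zeta_D\,\Delta\,(1 - K_t),
\end{equation}

with P the ice strength and K_t = Ktens. Only the combinations built from \sigma_P, i.e. the replacement pressure contribution to -dPr/dx and -dPr/dy, are stored in stPr; calc_bvec later adds that contribution to the right hand side b, which is why the corresponding term is commented out of the stresses in matvec.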
2*zeta + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(inout) :: & + stressp_1, stressp_2, stressp_3, stressp_4 , & ! sigma11+sigma22 + stressm_1, stressm_2, stressm_3, stressm_4 , & ! sigma11-sigma22 + stress12_1,stress12_2,stress12_3,stress12_4 ! sigma12 + + ! local variables + + integer (kind=int_kind) :: & + i, j, ij + + real (kind=dbl_kind) :: & + divune, divunw, divuse, divusw , & ! divergence + tensionne, tensionnw, tensionse, tensionsw, & ! tension + shearne, shearnw, shearse, shearsw , & ! shearing + Deltane, Deltanw, Deltase, Deltasw ! Delt + + character(len=*), parameter :: subname = '(stress_vp)' + + do ij = 1, icellt + i = indxti(ij) + j = indxtj(ij) + + !----------------------------------------------------------------- + ! strain rates + ! NOTE these are actually strain rates * area (m^2/s) + !----------------------------------------------------------------- + call strain_rates (nx_block , ny_block , & + i , j , & + uvel , vvel , & + dxt , dyt , & + cxp , cyp , & + cxm , cym , & + divune , divunw , & + divuse , divusw , & + tensionne, tensionnw, & + tensionse, tensionsw, & + shearne , shearnw , & + shearse , shearsw , & + Deltane , Deltanw , & + Deltase , Deltasw) + + !----------------------------------------------------------------- + ! the stresses ! kg/s^2 + ! (1) northeast, (2) northwest, (3) southwest, (4) southeast + !----------------------------------------------------------------- + + stressp_1(i,j) = zetaD(i,j,1)*(divune*(c1+Ktens) - Deltane*(c1-Ktens)) + stressp_2(i,j) = zetaD(i,j,2)*(divunw*(c1+Ktens) - Deltanw*(c1-Ktens)) + stressp_3(i,j) = zetaD(i,j,3)*(divusw*(c1+Ktens) - Deltasw*(c1-Ktens)) + stressp_4(i,j) = zetaD(i,j,4)*(divuse*(c1+Ktens) - Deltase*(c1-Ktens)) + + stressm_1(i,j) = zetaD(i,j,1)*tensionne*(c1+Ktens)*ecci + stressm_2(i,j) = zetaD(i,j,2)*tensionnw*(c1+Ktens)*ecci + stressm_3(i,j) = zetaD(i,j,3)*tensionsw*(c1+Ktens)*ecci + stressm_4(i,j) = zetaD(i,j,4)*tensionse*(c1+Ktens)*ecci + + stress12_1(i,j) = zetaD(i,j,1)*shearne*p5*(c1+Ktens)*ecci + stress12_2(i,j) = zetaD(i,j,2)*shearnw*p5*(c1+Ktens)*ecci + stress12_3(i,j) = zetaD(i,j,3)*shearsw*p5*(c1+Ktens)*ecci + stress12_4(i,j) = zetaD(i,j,4)*shearse*p5*(c1+Ktens)*ecci + + enddo ! ij + + end subroutine stress_vp + +!======================================================================= + +! Compute vrel and seabed stress coefficients + + subroutine calc_vrel_Cb (nx_block, ny_block, & + icellu , Cw , & + indxui , indxuj , & + aiu , Tbu , & + uocn , vocn , & + uvel , vvel , & + vrel , Cb) + + use ice_dyn_shared, only: u0 ! residual velocity for basal stress (m/s) + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellu ! total count when iceumask is true + + integer (kind=int_kind), dimension (nx_block*ny_block), intent(in) :: & + indxui , & ! compressed index in i-direction + indxuj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + Tbu, & ! coefficient for basal stress (N/m^2) + aiu , & ! ice fraction on u-grid + uocn , & ! ocean current, x-direction (m/s) + vocn , & ! ocean current, y-direction (m/s) + Cw ! ocean-ice neutral drag coefficient + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + uvel , & ! x-component of velocity (m/s) + vvel ! y-component of velocity (m/s) + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(inout) :: & + vrel , & ! coeff for tauw + Cb ! seabed stress coeff + + ! 
local variables + + integer (kind=int_kind) :: & + i, j, ij + + real (kind=dbl_kind) :: & + rhow ! + + character(len=*), parameter :: subname = '(calc_vrel_Cb)' + + call icepack_query_parameters(rhow_out=rhow) + call icepack_warnings_flush(nu_diag) + if (icepack_warnings_aborted()) call abort_ice(error_message=subname, & + file=__FILE__, line=__LINE__) + + do ij = 1, icellu + i = indxui(ij) + j = indxuj(ij) + + ! (magnitude of relative ocean current)*rhow*drag*aice + vrel(i,j) = aiu(i,j)*rhow*Cw(i,j)*sqrt((uocn(i,j) - uvel(i,j))**2 + & + (vocn(i,j) - vvel(i,j))**2) ! m/s + + Cb(i,j) = Tbu(i,j) / (sqrt(uvel(i,j)**2 + vvel(i,j)**2) + u0) ! for seabed stress + enddo ! ij + + end subroutine calc_vrel_Cb + +!======================================================================= + +! Compute seabed stress (diagnostic) + + subroutine calc_seabed_stress (nx_block, ny_block, & + icellu , & + indxui , indxuj , & + uvel , vvel , & + Cb , & + taubx , tauby) + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellu ! total count when iceumask is true + + integer (kind=int_kind), dimension (nx_block*ny_block), intent(in) :: & + indxui , & ! compressed index in i-direction + indxuj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + uvel , & ! x-component of velocity (m/s) + vvel , & ! y-component of velocity (m/s) + Cb ! seabed stress coefficient + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(out) :: & + taubx , & ! seabed stress, x-direction (N/m^2) + tauby ! seabed stress, y-direction (N/m^2) + + ! local variables + + integer (kind=int_kind) :: & + i, j, ij + + character(len=*), parameter :: subname = '(calc_seabed_stress)' + + do ij = 1, icellu + i = indxui(ij) + j = indxuj(ij) + + taubx(i,j) = -uvel(i,j)*Cb(i,j) + tauby(i,j) = -vvel(i,j)*Cb(i,j) + enddo ! ij + + end subroutine calc_seabed_stress + +!======================================================================= + +! Computes the matrix vector product A(u,v) * (u,v) +! Au = A(u,v)_[x] * uvel (x components of A(u,v) * (u,v)) +! Av = A(u,v)_[y] * vvel (y components of A(u,v) * (u,v)) + + subroutine matvec (nx_block, ny_block, & + icellu , icellt , & + indxui , indxuj , & + indxti , indxtj , & + dxt , dyt , & + dxhy , dyhx , & + cxp , cyp , & + cxm , cym , & + uvel , vvel , & + vrel , Cb , & + zetaD , & + umassdti, fm , & + uarear , & + Au , Av) + + use ice_dyn_shared, only: strain_rates + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellu, & ! total count when iceumask is true + icellt ! no. of cells where icetmask = 1 + + integer (kind=int_kind), dimension (nx_block*ny_block), intent(in) :: & + indxui , & ! compressed index in i-direction + indxuj , & ! compressed index in j-direction + indxti , & ! compressed index in i-direction + indxtj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + dxt , & ! width of T-cell through the middle (m) + dyt , & ! height of T-cell through the middle (m) + dxhy , & ! 0.5*(HTE - HTE) + dyhx , & ! 0.5*(HTN - HTN) + cyp , & ! 1.5*HTE - 0.5*HTE + cxp , & ! 1.5*HTN - 0.5*HTN + cym , & ! 0.5*HTE - 1.5*HTE + cxm ! 0.5*HTN - 1.5*HTN + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + uvel , & ! x-component of velocity (m/s) + vvel , & ! y-component of velocity (m/s) + vrel , & ! coefficient for tauw + Cb , & ! coefficient for basal stress + umassdti, & ! 
mass of U-cell/dt (kg/m^2 s) + fm , & ! Coriolis param. * mass in U-cell (kg/s) + uarear ! 1/uarea + + real (kind=dbl_kind), dimension(nx_block,ny_block,4), intent(in) :: & + zetaD ! 2*zeta + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(inout) :: & + Au , & ! matvec, Fx = bx - Au (N/m^2) + Av ! matvec, Fy = by - Av (N/m^2) + + ! local variables + + integer (kind=int_kind) :: & + i, j, ij + + real (kind=dbl_kind), dimension(nx_block,ny_block,8):: & + str + + real (kind=dbl_kind) :: & + ccaimp,ccb , & ! intermediate variables + strintx, strinty ! divergence of the internal stress tensor + + real (kind=dbl_kind) :: & + divune, divunw, divuse, divusw , & ! divergence + tensionne, tensionnw, tensionse, tensionsw, & ! tension + shearne, shearnw, shearse, shearsw , & ! shearing + Deltane, Deltanw, Deltase, Deltasw , & ! Delt + ssigpn, ssigps, ssigpe, ssigpw , & + ssigmn, ssigms, ssigme, ssigmw , & + ssig12n, ssig12s, ssig12e, ssig12w , & + ssigp1, ssigp2, ssigm1, ssigm2, ssig121, ssig122, & + csigpne, csigpnw, csigpse, csigpsw , & + csigmne, csigmnw, csigmse, csigmsw , & + csig12ne, csig12nw, csig12se, csig12sw , & + str12ew, str12we, str12ns, str12sn , & + strp_tmp, strm_tmp + + real (kind=dbl_kind) :: & + stressp_1, stressp_2, stressp_3, stressp_4 , & ! sigma11+sigma22 (without Pr) + stressm_1, stressm_2, stressm_3, stressm_4 , & ! sigma11-sigma22 + stress12_1,stress12_2,stress12_3,stress12_4 ! sigma12 + + character(len=*), parameter :: subname = '(matvec)' + + !----------------------------------------------------------------- + ! Initialize + !----------------------------------------------------------------- + + str(:,:,:) = c0 + + do ij = 1, icellt + i = indxti(ij) + j = indxtj(ij) + + !----------------------------------------------------------------- + ! strain rates + ! NOTE these are actually strain rates * area (m^2/s) + !----------------------------------------------------------------- + call strain_rates (nx_block , ny_block , & + i , j , & + uvel , vvel , & + dxt , dyt , & + cxp , cyp , & + cxm , cym , & + divune , divunw , & + divuse , divusw , & + tensionne, tensionnw, & + tensionse, tensionsw, & + shearne , shearnw , & + shearse , shearsw , & + Deltane , Deltanw , & + Deltase , Deltasw) + + !----------------------------------------------------------------- + ! the stresses ! kg/s^2 + ! (1) northeast, (2) northwest, (3) southwest, (4) southeast + ! NOTE: commented part of stressp is from the replacement pressure Pr + !----------------------------------------------------------------- + + stressp_1 = zetaD(i,j,1)*(divune*(c1+Ktens))! - Deltane*(c1-Ktens)) + stressp_2 = zetaD(i,j,2)*(divunw*(c1+Ktens))! - Deltanw*(c1-Ktens)) + stressp_3 = zetaD(i,j,3)*(divusw*(c1+Ktens))! - Deltasw*(c1-Ktens)) + stressp_4 = zetaD(i,j,4)*(divuse*(c1+Ktens))! - Deltase*(c1-Ktens)) + + stressm_1 = zetaD(i,j,1)*tensionne*(c1+Ktens)*ecci + stressm_2 = zetaD(i,j,2)*tensionnw*(c1+Ktens)*ecci + stressm_3 = zetaD(i,j,3)*tensionsw*(c1+Ktens)*ecci + stressm_4 = zetaD(i,j,4)*tensionse*(c1+Ktens)*ecci + + stress12_1 = zetaD(i,j,1)*shearne*p5*(c1+Ktens)*ecci + stress12_2 = zetaD(i,j,2)*shearnw*p5*(c1+Ktens)*ecci + stress12_3 = zetaD(i,j,3)*shearsw*p5*(c1+Ktens)*ecci + stress12_4 = zetaD(i,j,4)*shearse*p5*(c1+Ktens)*ecci + + !----------------------------------------------------------------- + ! combinations of the stresses for the momentum equation ! 
kg/s^2 + !----------------------------------------------------------------- + + ssigpn = stressp_1 + stressp_2 + ssigps = stressp_3 + stressp_4 + ssigpe = stressp_1 + stressp_4 + ssigpw = stressp_2 + stressp_3 + ssigp1 =(stressp_1 + stressp_3)*p055 + ssigp2 =(stressp_2 + stressp_4)*p055 + + ssigmn = stressm_1 + stressm_2 + ssigms = stressm_3 + stressm_4 + ssigme = stressm_1 + stressm_4 + ssigmw = stressm_2 + stressm_3 + ssigm1 =(stressm_1 + stressm_3)*p055 + ssigm2 =(stressm_2 + stressm_4)*p055 + + ssig12n = stress12_1 + stress12_2 + ssig12s = stress12_3 + stress12_4 + ssig12e = stress12_1 + stress12_4 + ssig12w = stress12_2 + stress12_3 + ssig121 =(stress12_1 + stress12_3)*p111 + ssig122 =(stress12_2 + stress12_4)*p111 + + csigpne = p111*stressp_1 + ssigp2 + p027*stressp_3 + csigpnw = p111*stressp_2 + ssigp1 + p027*stressp_4 + csigpsw = p111*stressp_3 + ssigp2 + p027*stressp_1 + csigpse = p111*stressp_4 + ssigp1 + p027*stressp_2 + + csigmne = p111*stressm_1 + ssigm2 + p027*stressm_3 + csigmnw = p111*stressm_2 + ssigm1 + p027*stressm_4 + csigmsw = p111*stressm_3 + ssigm2 + p027*stressm_1 + csigmse = p111*stressm_4 + ssigm1 + p027*stressm_2 + + csig12ne = p222*stress12_1 + ssig122 & + + p055*stress12_3 + csig12nw = p222*stress12_2 + ssig121 & + + p055*stress12_4 + csig12sw = p222*stress12_3 + ssig122 & + + p055*stress12_1 + csig12se = p222*stress12_4 + ssig121 & + + p055*stress12_2 + + str12ew = p5*dxt(i,j)*(p333*ssig12e + p166*ssig12w) + str12we = p5*dxt(i,j)*(p333*ssig12w + p166*ssig12e) + str12ns = p5*dyt(i,j)*(p333*ssig12n + p166*ssig12s) + str12sn = p5*dyt(i,j)*(p333*ssig12s + p166*ssig12n) + + !----------------------------------------------------------------- + ! for dF/dx (u momentum) + !----------------------------------------------------------------- + strp_tmp = p25*dyt(i,j)*(p333*ssigpn + p166*ssigps) + strm_tmp = p25*dyt(i,j)*(p333*ssigmn + p166*ssigms) + + ! northeast (i,j) + str(i,j,1) = -strp_tmp - strm_tmp - str12ew & + + dxhy(i,j)*(-csigpne + csigmne) + dyhx(i,j)*csig12ne + + ! northwest (i+1,j) + str(i,j,2) = strp_tmp + strm_tmp - str12we & + + dxhy(i,j)*(-csigpnw + csigmnw) + dyhx(i,j)*csig12nw + + strp_tmp = p25*dyt(i,j)*(p333*ssigps + p166*ssigpn) + strm_tmp = p25*dyt(i,j)*(p333*ssigms + p166*ssigmn) + + ! southeast (i,j+1) + str(i,j,3) = -strp_tmp - strm_tmp + str12ew & + + dxhy(i,j)*(-csigpse + csigmse) + dyhx(i,j)*csig12se + + ! southwest (i+1,j+1) + str(i,j,4) = strp_tmp + strm_tmp + str12we & + + dxhy(i,j)*(-csigpsw + csigmsw) + dyhx(i,j)*csig12sw + + !----------------------------------------------------------------- + ! for dF/dy (v momentum) + !----------------------------------------------------------------- + strp_tmp = p25*dxt(i,j)*(p333*ssigpe + p166*ssigpw) + strm_tmp = p25*dxt(i,j)*(p333*ssigme + p166*ssigmw) + + ! northeast (i,j) + str(i,j,5) = -strp_tmp + strm_tmp - str12ns & + - dyhx(i,j)*(csigpne + csigmne) + dxhy(i,j)*csig12ne + + ! southeast (i,j+1) + str(i,j,6) = strp_tmp - strm_tmp - str12sn & + - dyhx(i,j)*(csigpse + csigmse) + dxhy(i,j)*csig12se + + strp_tmp = p25*dxt(i,j)*(p333*ssigpw + p166*ssigpe) + strm_tmp = p25*dxt(i,j)*(p333*ssigmw + p166*ssigme) + + ! northwest (i+1,j) + str(i,j,7) = -strp_tmp + strm_tmp + str12ns & + - dyhx(i,j)*(csigpnw + csigmnw) + dxhy(i,j)*csig12nw + + ! southwest (i+1,j+1) + str(i,j,8) = strp_tmp - strm_tmp + str12sn & + - dyhx(i,j)*(csigpsw + csigmsw) + dxhy(i,j)*csig12sw + + enddo ! ij - icellt + + !----------------------------------------------------------------- + ! 
Form Au and Av + !----------------------------------------------------------------- + + do ij = 1, icellu + i = indxui(ij) + j = indxuj(ij) + + ccaimp = umassdti(i,j) + vrel(i,j) * cosw + Cb(i,j) ! kg/m^2 s + + ccb = fm(i,j) + sign(c1,fm(i,j)) * vrel(i,j) * sinw ! kg/m^2 s + + ! divergence of the internal stress tensor + strintx = uarear(i,j)* & + (str(i,j,1) + str(i+1,j,2) + str(i,j+1,3) + str(i+1,j+1,4)) + strinty = uarear(i,j)* & + (str(i,j,5) + str(i,j+1,6) + str(i+1,j,7) + str(i+1,j+1,8)) + + Au(i,j) = ccaimp*uvel(i,j) - ccb*vvel(i,j) - strintx + Av(i,j) = ccaimp*vvel(i,j) + ccb*uvel(i,j) - strinty + enddo ! ij - icellu + + end subroutine matvec + +!======================================================================= + +! Compute the constant component of b(u,v) i.e. the part of b(u,v) that +! does not depend on (u,v) and thus do not change during the nonlinear iteration + + subroutine calc_bfix (nx_block , ny_block , & + icellu , & + indxui , indxuj , & + umassdti , & + forcex , forcey , & + uvel_init, vvel_init, & + bxfix , byfix) + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellu ! no. of cells where iceumask = 1 + + integer (kind=int_kind), dimension (nx_block*ny_block), intent(in) :: & + indxui , & ! compressed index in i-direction + indxuj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + uvel_init,& ! x-component of velocity (m/s), beginning of time step + vvel_init,& ! y-component of velocity (m/s), beginning of time step + umassdti, & ! mass of U-cell/dt (kg/m^2 s) + forcex , & ! work array: combined atm stress and ocn tilt, x + forcey ! work array: combined atm stress and ocn tilt, y + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(out) :: & + bxfix , & ! bx = taux + bxfix + byfix ! by = tauy + byfix + + ! local variables + + integer (kind=int_kind) :: & + i, j, ij + + character(len=*), parameter :: subname = '(calc_bfix)' + + do ij = 1, icellu + i = indxui(ij) + j = indxuj(ij) + + bxfix(i,j) = umassdti(i,j)*uvel_init(i,j) + forcex(i,j) + byfix(i,j) = umassdti(i,j)*vvel_init(i,j) + forcey(i,j) + enddo + + end subroutine calc_bfix + +!======================================================================= + +! Compute the vector b(u,v), i.e. the part of the nonlinear function F(u,v) +! that cannot be written as A(u,v)*(u,v), where A(u,v) is a matrix with entries +! depending on (u,v) + + subroutine calc_bvec (nx_block, ny_block, & + icellu , & + indxui , indxuj , & + stPr , uarear , & + waterx , watery , & + bxfix , byfix , & + bx , by , & + vrel) + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellu ! total count when iceumask is true + + integer (kind=int_kind), dimension (nx_block*ny_block), intent(in) :: & + indxui , & ! compressed index in i-direction + indxuj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + uarear , & ! 1/uarea + waterx , & ! for ocean stress calculation, x (m/s) + watery , & ! for ocean stress calculation, y (m/s) + bxfix , & ! bx = taux + bxfix + byfix , & ! by = tauy + byfix + vrel ! relative ice-ocean velocity + + real (kind=dbl_kind), dimension(nx_block,ny_block,8), intent(in) :: & + stPr + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(out) :: & + bx , & ! b vector, bx = taux + bxfix (N/m^2) + by ! b vector, by = tauy + byfix (N/m^2) + + ! 
local variables + + integer (kind=int_kind) :: & + i, j, ij + + real (kind=dbl_kind) :: & + taux, tauy , & ! part of ocean stress term + strintx, strinty , & ! divergence of the internal stress tensor (only Pr contributions) + rhow ! + + character(len=*), parameter :: subname = '(calc_bvec)' + + !----------------------------------------------------------------- + ! calc b vector + !----------------------------------------------------------------- + + call icepack_query_parameters(rhow_out=rhow) + call icepack_warnings_flush(nu_diag) + if (icepack_warnings_aborted()) call abort_ice(error_message=subname, & + file=__FILE__, line=__LINE__) + + do ij = 1, icellu + i = indxui(ij) + j = indxuj(ij) + + ! ice/ocean stress + taux = vrel(i,j)*waterx(i,j) ! NOTE this is not the entire + tauy = vrel(i,j)*watery(i,j) ! ocn stress term + + ! divergence of the internal stress tensor (only Pr part, i.e. dPr/dx, dPr/dy) + strintx = uarear(i,j)* & + (stPr(i,j,1) + stPr(i+1,j,2) + stPr(i,j+1,3) + stPr(i+1,j+1,4)) + strinty = uarear(i,j)* & + (stPr(i,j,5) + stPr(i,j+1,6) + stPr(i+1,j,7) + stPr(i+1,j+1,8)) + + bx(i,j) = bxfix(i,j) + taux + strintx + by(i,j) = byfix(i,j) + tauy + strinty + enddo ! ij + + end subroutine calc_bvec + +!======================================================================= + +! Compute the non linear residual F(u,v) = b(u,v) - A(u,v) * (u,v), +! with Au, Av precomputed as +! Au = A(u,v)_[x] * uvel (x components of A(u,v) * (u,v)) +! Av = A(u,v)_[y] * vvel (y components of A(u,v) * (u,v)) + + subroutine residual_vec (nx_block , ny_block, & + icellu , & + indxui , indxuj , & + bx , by , & + Au , Av , & + Fx , Fy , & + sum_squared) + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellu ! total count when iceumask is true + + integer (kind=int_kind), dimension (nx_block*ny_block), intent(in) :: & + indxui , & ! compressed index in i-direction + indxuj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + bx , & ! b vector, bx = taux + bxfix (N/m^2) + by , & ! b vector, by = tauy + byfix (N/m^2) + Au , & ! matvec, Fx = bx - Au (N/m^2) + Av ! matvec, Fy = by - Av (N/m^2) + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(inout) :: & + Fx , & ! x residual vector, Fx = bx - Au (N/m^2) + Fy ! y residual vector, Fy = by - Av (N/m^2) + + real (kind=dbl_kind), intent(out), optional :: & + sum_squared ! sum of squared residual vector components + + ! local variables + + integer (kind=int_kind) :: & + i, j, ij + + character(len=*), parameter :: subname = '(residual_vec)' + + !----------------------------------------------------------------- + ! compute residual and sum its squared components + !----------------------------------------------------------------- + + if (present(sum_squared)) then + sum_squared = c0 + endif + + do ij = 1, icellu + i = indxui(ij) + j = indxuj(ij) + + Fx(i,j) = bx(i,j) - Au(i,j) + Fy(i,j) = by(i,j) - Av(i,j) + if (present(sum_squared)) then + sum_squared = sum_squared + Fx(i,j)**2 + Fy(i,j)**2 + endif + enddo ! ij + + end subroutine residual_vec + +!======================================================================= + +! Form the diagonal of the matrix A(u,v) (first part of the computation) +! 
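Taken together, and keeping the code's variable names, matvec, calc_bfix, calc_bvec and residual_vec above implement (x components shown; the y components are analogous with the sign of the rotation term flipped):

\begin{equation}
\begin{aligned}
Au &= \bigl(\texttt{umassdti} + \texttt{vrel}\cos\theta_w + C_b\bigr)\,u
      \;-\; \bigl(\texttt{fm} + \operatorname{sign}(\texttt{fm})\,\texttt{vrel}\sin\theta_w\bigr)\,v
      \;-\; \texttt{uarear}\sum_{T\ \mathrm{cells}} \texttt{str}_{x},\\
b_x &= \underbrace{\texttt{umassdti}\;u_{\mathrm{init}} + \texttt{forcex}}_{\texttt{bxfix}}
      \;+\; \texttt{vrel}\,\texttt{waterx}
      \;+\; \texttt{uarear}\sum_{T\ \mathrm{cells}} \texttt{stPr}_{x},\\
F_x &= b_x - Au,
\end{aligned}
\end{equation}

where cosw and sinw are the cosine and sine of the ice-ocean stress turning angle and each sum gathers the four surrounding T-cell contributions referenced to the u point.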
Part 1: compute the contributions to the diagonal from the rheology term + + subroutine formDiag_step1 (nx_block, ny_block, & + icellu , & + indxui , indxuj , & + dxt , dyt , & + dxhy , dyhx , & + cxp , cyp , & + cxm , cym , & + zetaD , Drheo) + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellu ! no. of cells where icetmask = 1 + + integer (kind=int_kind), dimension (nx_block*ny_block), intent(in) :: & + indxui , & ! compressed index in i-direction + indxuj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + dxt , & ! width of T-cell through the middle (m) + dyt , & ! height of T-cell through the middle (m) + dxhy , & ! 0.5*(HTE - HTE) + dyhx , & ! 0.5*(HTN - HTN) + cyp , & ! 1.5*HTE - 0.5*HTE + cxp , & ! 1.5*HTN - 0.5*HTN + cym , & ! 0.5*HTE - 1.5*HTE + cxm ! 0.5*HTN - 1.5*HTN + + real (kind=dbl_kind), dimension(nx_block,ny_block,4), intent(in) :: & + zetaD ! 2*zeta + + real (kind=dbl_kind), dimension(nx_block,ny_block,8), intent(out) :: & + Drheo ! intermediate value for diagonal components of matrix A associated + ! with rheology term + + ! local variables + + integer (kind=int_kind) :: & + i, j, ij, iu, ju, di, dj, cc + + real (kind=dbl_kind) :: & + divune, divunw, divuse, divusw , & ! divergence + tensionne, tensionnw, tensionse, tensionsw, & ! tension + shearne, shearnw, shearse, shearsw , & ! shearing + uij,ui1j,uij1,ui1j1,vij,vi1j,vij1,vi1j1 , & ! == c0 or c1 + stressp_1, stressp_2, stressp_3, stressp_4, & + stressm_1, stressm_2, stressm_3, stressm_4, & + stress12_1,stress12_2,stress12_3,stress12_4,& + ssigpn, ssigps, ssigpe, ssigpw , & + ssigmn, ssigms, ssigme, ssigmw , & + ssig12n, ssig12s, ssig12e, ssig12w , & + ssigp1, ssigp2, ssigm1, ssigm2, ssig121, ssig122, & + csigpne, csigpnw, csigpse, csigpsw , & + csigmne, csigmnw, csigmse, csigmsw , & + csig12ne, csig12nw, csig12se, csig12sw , & + str12ew, str12we, str12ns, str12sn , & + strp_tmp, strm_tmp + + character(len=*), parameter :: subname = '(formDiag_step1)' + + !----------------------------------------------------------------- + ! Initialize + !----------------------------------------------------------------- + + Drheo(:,:,:) = c0 + + ! Be careful: Drheo contains 4 terms for u and 4 terms for v. + ! These 8 terms come from the surrounding T cells but are all + ! refrerenced to the i,j (u point) : + + ! Drheo(i,j,1) corresponds to str(i,j,1) + ! Drheo(i,j,2) corresponds to str(i+1,j,2) + ! Drheo(i,j,3) corresponds to str(i,j+1,3) + ! Drheo(i,j,4) corresponds to str(i+1,j+1,4)) + ! Drheo(i,j,5) corresponds to str(i,j,5) + ! Drheo(i,j,6) corresponds to str(i,j+1,6) + ! Drheo(i,j,7) corresponds to str(i+1,j,7) + ! Drheo(i,j,8) corresponds to str(i+1,j+1,8)) + + do cc = 1, 8 ! 4 for u and 4 for v + + if (cc == 1) then ! u comp, T cell i,j + uij = c1 + ui1j = c0 + uij1 = c0 + ui1j1 = c0 + vij = c0 + vi1j = c0 + vij1 = c0 + vi1j1 = c0 + di = 0 + dj = 0 + elseif (cc == 2) then ! u comp, T cell i+1,j + uij = c0 + ui1j = c1 + uij1 = c0 + ui1j1 = c0 + vij = c0 + vi1j = c0 + vij1 = c0 + vi1j1 = c0 + di = 1 + dj = 0 + elseif (cc == 3) then ! u comp, T cell i,j+1 + uij = c0 + ui1j = c0 + uij1 = c1 + ui1j1 = c0 + vij = c0 + vi1j = c0 + vij1 = c0 + vi1j1 = c0 + di = 0 + dj = 1 + elseif (cc == 4) then ! u comp, T cell i+1,j+1 + uij = c0 + ui1j = c0 + uij1 = c0 + ui1j1 = c1 + vij = c0 + vi1j = c0 + vij1 = c0 + vi1j1 = c0 + di = 1 + dj = 1 + elseif (cc == 5) then ! 
v comp, T cell i,j + uij = c0 + ui1j = c0 + uij1 = c0 + ui1j1 = c0 + vij = c1 + vi1j = c0 + vij1 = c0 + vi1j1 = c0 + di = 0 + dj = 0 + elseif (cc == 6) then ! v comp, T cell i,j+1 + uij = c0 + ui1j = c0 + uij1 = c0 + ui1j1 = c0 + vij = c0 + vi1j = c0 + vij1 = c1 + vi1j1 = c0 + di = 0 + dj = 1 + elseif (cc == 7) then ! v comp, T cell i+1,j + uij = c0 + ui1j = c0 + uij1 = c0 + ui1j1 = c0 + vij = c0 + vi1j = c1 + vij1 = c0 + vi1j1 = c0 + di = 1 + dj = 0 + elseif (cc == 8) then ! v comp, T cell i+1,j+1 + uij = c0 + ui1j = c0 + uij1 = c0 + ui1j1 = c0 + vij = c0 + vi1j = c0 + vij1 = c0 + vi1j1 = c1 + di = 1 + dj = 1 + endif + + do ij = 1, icellu + + iu = indxui(ij) + ju = indxuj(ij) + i = iu + di + j = ju + dj + + !----------------------------------------------------------------- + ! strain rates + ! NOTE these are actually strain rates * area (m^2/s) + !----------------------------------------------------------------- + ! divergence = e_11 + e_22 + divune = cyp(i,j)*uij - dyt(i,j)*ui1j & + + cxp(i,j)*vij - dxt(i,j)*vij1 + divunw = cym(i,j)*ui1j + dyt(i,j)*uij & + + cxp(i,j)*vi1j - dxt(i,j)*vi1j1 + divusw = cym(i,j)*ui1j1 + dyt(i,j)*uij1 & + + cxm(i,j)*vi1j1 + dxt(i,j)*vi1j + divuse = cyp(i,j)*uij1 - dyt(i,j)*ui1j1 & + + cxm(i,j)*vij1 + dxt(i,j)*vij + + ! tension strain rate = e_11 - e_22 + tensionne = -cym(i,j)*uij - dyt(i,j)*ui1j & + + cxm(i,j)*vij + dxt(i,j)*vij1 + tensionnw = -cyp(i,j)*ui1j + dyt(i,j)*uij & + + cxm(i,j)*vi1j + dxt(i,j)*vi1j1 + tensionsw = -cyp(i,j)*ui1j1 + dyt(i,j)*uij1 & + + cxp(i,j)*vi1j1 - dxt(i,j)*vi1j + tensionse = -cym(i,j)*uij1 - dyt(i,j)*ui1j1 & + + cxp(i,j)*vij1 - dxt(i,j)*vij + + ! shearing strain rate = 2*e_12 + shearne = -cym(i,j)*vij - dyt(i,j)*vi1j & + - cxm(i,j)*uij - dxt(i,j)*uij1 + shearnw = -cyp(i,j)*vi1j + dyt(i,j)*vij & + - cxm(i,j)*ui1j - dxt(i,j)*ui1j1 + shearsw = -cyp(i,j)*vi1j1 + dyt(i,j)*vij1 & + - cxp(i,j)*ui1j1 + dxt(i,j)*ui1j + shearse = -cym(i,j)*vij1 - dyt(i,j)*vi1j1 & + - cxp(i,j)*uij1 + dxt(i,j)*uij + + !----------------------------------------------------------------- + ! the stresses ! kg/s^2 + ! (1) northeast, (2) northwest, (3) southwest, (4) southeast + !----------------------------------------------------------------- + + stressp_1 = zetaD(i,j,1)*divune*(c1+Ktens) + stressp_2 = zetaD(i,j,2)*divunw*(c1+Ktens) + stressp_3 = zetaD(i,j,3)*divusw*(c1+Ktens) + stressp_4 = zetaD(i,j,4)*divuse*(c1+Ktens) + + stressm_1 = zetaD(i,j,1)*tensionne*(c1+Ktens)*ecci + stressm_2 = zetaD(i,j,2)*tensionnw*(c1+Ktens)*ecci + stressm_3 = zetaD(i,j,3)*tensionsw*(c1+Ktens)*ecci + stressm_4 = zetaD(i,j,4)*tensionse*(c1+Ktens)*ecci + + stress12_1 = zetaD(i,j,1)*shearne*p5*(c1+Ktens)*ecci + stress12_2 = zetaD(i,j,2)*shearnw*p5*(c1+Ktens)*ecci + stress12_3 = zetaD(i,j,3)*shearsw*p5*(c1+Ktens)*ecci + stress12_4 = zetaD(i,j,4)*shearse*p5*(c1+Ktens)*ecci + + !----------------------------------------------------------------- + ! combinations of the stresses for the momentum equation ! 
kg/s^2 + !----------------------------------------------------------------- + + ssigpn = stressp_1 + stressp_2 + ssigps = stressp_3 + stressp_4 + ssigpe = stressp_1 + stressp_4 + ssigpw = stressp_2 + stressp_3 + ssigp1 =(stressp_1 + stressp_3)*p055 + ssigp2 =(stressp_2 + stressp_4)*p055 + + ssigmn = stressm_1 + stressm_2 + ssigms = stressm_3 + stressm_4 + ssigme = stressm_1 + stressm_4 + ssigmw = stressm_2 + stressm_3 + ssigm1 =(stressm_1 + stressm_3)*p055 + ssigm2 =(stressm_2 + stressm_4)*p055 + + ssig12n = stress12_1 + stress12_2 + ssig12s = stress12_3 + stress12_4 + ssig12e = stress12_1 + stress12_4 + ssig12w = stress12_2 + stress12_3 + ssig121 =(stress12_1 + stress12_3)*p111 + ssig122 =(stress12_2 + stress12_4)*p111 + + csigpne = p111*stressp_1 + ssigp2 + p027*stressp_3 + csigpnw = p111*stressp_2 + ssigp1 + p027*stressp_4 + csigpsw = p111*stressp_3 + ssigp2 + p027*stressp_1 + csigpse = p111*stressp_4 + ssigp1 + p027*stressp_2 + + csigmne = p111*stressm_1 + ssigm2 + p027*stressm_3 + csigmnw = p111*stressm_2 + ssigm1 + p027*stressm_4 + csigmsw = p111*stressm_3 + ssigm2 + p027*stressm_1 + csigmse = p111*stressm_4 + ssigm1 + p027*stressm_2 + + csig12ne = p222*stress12_1 + ssig122 & + + p055*stress12_3 + csig12nw = p222*stress12_2 + ssig121 & + + p055*stress12_4 + csig12sw = p222*stress12_3 + ssig122 & + + p055*stress12_1 + csig12se = p222*stress12_4 + ssig121 & + + p055*stress12_2 + + str12ew = p5*dxt(i,j)*(p333*ssig12e + p166*ssig12w) + str12we = p5*dxt(i,j)*(p333*ssig12w + p166*ssig12e) + str12ns = p5*dyt(i,j)*(p333*ssig12n + p166*ssig12s) + str12sn = p5*dyt(i,j)*(p333*ssig12s + p166*ssig12n) + + !----------------------------------------------------------------- + ! for dF/dx (u momentum) + !----------------------------------------------------------------- + + if (cc == 1) then ! T cell i,j + + strp_tmp = p25*dyt(i,j)*(p333*ssigpn + p166*ssigps) + strm_tmp = p25*dyt(i,j)*(p333*ssigmn + p166*ssigms) + + ! northeast (i,j) + Drheo(iu,ju,1) = -strp_tmp - strm_tmp - str12ew & + + dxhy(i,j)*(-csigpne + csigmne) + dyhx(i,j)*csig12ne + + elseif (cc == 2) then ! T cell i+1,j + + strp_tmp = p25*dyt(i,j)*(p333*ssigpn + p166*ssigps) + strm_tmp = p25*dyt(i,j)*(p333*ssigmn + p166*ssigms) + + ! northwest (i+1,j) + Drheo(iu,ju,2) = strp_tmp + strm_tmp - str12we & + + dxhy(i,j)*(-csigpnw + csigmnw) + dyhx(i,j)*csig12nw + + elseif (cc == 3) then ! T cell i,j+1 + + strp_tmp = p25*dyt(i,j)*(p333*ssigps + p166*ssigpn) + strm_tmp = p25*dyt(i,j)*(p333*ssigms + p166*ssigmn) + + ! southeast (i,j+1) + Drheo(iu,ju,3) = -strp_tmp - strm_tmp + str12ew & + + dxhy(i,j)*(-csigpse + csigmse) + dyhx(i,j)*csig12se + + elseif (cc == 4) then ! T cell i+1,j+1 + + strp_tmp = p25*dyt(i,j)*(p333*ssigps + p166*ssigpn) + strm_tmp = p25*dyt(i,j)*(p333*ssigms + p166*ssigmn) + + ! southwest (i+1,j+1) + Drheo(iu,ju,4) = strp_tmp + strm_tmp + str12we & + + dxhy(i,j)*(-csigpsw + csigmsw) + dyhx(i,j)*csig12sw + + !----------------------------------------------------------------- + ! for dF/dy (v momentum) + !----------------------------------------------------------------- + + elseif (cc == 5) then ! T cell i,j + + strp_tmp = p25*dxt(i,j)*(p333*ssigpe + p166*ssigpw) + strm_tmp = p25*dxt(i,j)*(p333*ssigme + p166*ssigmw) + + ! northeast (i,j) + Drheo(iu,ju,5) = -strp_tmp + strm_tmp - str12ns & + - dyhx(i,j)*(csigpne + csigmne) + dxhy(i,j)*csig12ne + + elseif (cc == 6) then ! T cell i,j+1 + + strp_tmp = p25*dxt(i,j)*(p333*ssigpe + p166*ssigpw) + strm_tmp = p25*dxt(i,j)*(p333*ssigme + p166*ssigmw) + + ! 
southeast (i,j+1) + Drheo(iu,ju,6) = strp_tmp - strm_tmp - str12sn & + - dyhx(i,j)*(csigpse + csigmse) + dxhy(i,j)*csig12se + + elseif (cc == 7) then ! T cell i,j+1 + + strp_tmp = p25*dxt(i,j)*(p333*ssigpw + p166*ssigpe) + strm_tmp = p25*dxt(i,j)*(p333*ssigmw + p166*ssigme) + + ! northwest (i+1,j) + Drheo(iu,ju,7) = -strp_tmp + strm_tmp + str12ns & + - dyhx(i,j)*(csigpnw + csigmnw) + dxhy(i,j)*csig12nw + + elseif (cc == 8) then ! T cell i+1,j+1 + + strp_tmp = p25*dxt(i,j)*(p333*ssigpw + p166*ssigpe) + strm_tmp = p25*dxt(i,j)*(p333*ssigmw + p166*ssigme) + + ! southwest (i+1,j+1) + Drheo(iu,ju,8) = strp_tmp - strm_tmp + str12sn & + - dyhx(i,j)*(csigpsw + csigmsw) + dxhy(i,j)*csig12sw + + endif + + enddo ! ij + + enddo ! cc + + end subroutine formDiag_step1 + +!======================================================================= + +! Form the diagonal of the matrix A(u,v) (second part of the computation) +! Part 2: compute diagonal + + subroutine formDiag_step2 (nx_block, ny_block, & + icellu , & + indxui , indxuj , & + Drheo , vrel , & + umassdti, & + uarear , Cb , & + diagx , diagy) + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellu ! total count when iceumask is true + + integer (kind=int_kind), dimension (nx_block*ny_block), intent(in) :: & + indxui , & ! compressed index in i-direction + indxuj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + vrel, & ! coefficient for tauw + Cb, & ! coefficient for basal stress + umassdti, & ! mass of U-cell/dt (kg/m^2 s) + uarear ! 1/uarea + + real (kind=dbl_kind), dimension(nx_block,ny_block,8), intent(in) :: & + Drheo + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(out) :: & + diagx , & ! Diagonal (x component) of the matrix A + diagy ! Diagonal (y component) of the matrix A + + ! local variables + + integer (kind=int_kind) :: & + i, j, ij + + real (kind=dbl_kind) :: & + ccaimp , & ! intermediate variables + strintx, strinty ! diagonal contributions to the divergence + + character(len=*), parameter :: subname = '(formDiag_step2)' + + !----------------------------------------------------------------- + ! integrate the momentum equation + !----------------------------------------------------------------- + + strintx = c0 + strinty = c0 + + ! Be careful: Drheo contains 4 terms for u and 4 terms for v. + ! These 8 terms come from the surrounding T cells but are all + ! refrerenced to the i,j (u point) : + + ! Drheo(i,j,1) corresponds to str(i,j,1) + ! Drheo(i,j,2) corresponds to str(i+1,j,2) + ! Drheo(i,j,3) corresponds to str(i,j+1,3) + ! Drheo(i,j,4) corresponds to str(i+1,j+1,4)) + ! Drheo(i,j,5) corresponds to str(i,j,5) + ! Drheo(i,j,6) corresponds to str(i,j+1,6) + ! Drheo(i,j,7) corresponds to str(i+1,j,7) + ! Drheo(i,j,8) corresponds to str(i+1,j+1,8)) + + do ij = 1, icellu + i = indxui(ij) + j = indxuj(ij) + + ccaimp = umassdti(i,j) + vrel(i,j) * cosw + Cb(i,j) ! kg/m^2 s + + strintx = uarear(i,j)* & + (Drheo(i,j,1) + Drheo(i,j,2) + Drheo(i,j,3) + Drheo(i,j,4)) + strinty = uarear(i,j)* & + (Drheo(i,j,5) + Drheo(i,j,6) + Drheo(i,j,7) + Drheo(i,j,8)) + + diagx(i,j) = ccaimp - strintx + diagy(i,j) = ccaimp - strinty + enddo ! ij + + end subroutine formDiag_step2 + +!======================================================================= + +! 
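A note on the construction above: formDiag_step1 probes the rheology part of matvec with unit vectors, setting exactly one of the eight surrounding u or v values to one per pass (cc = 1..8), so that each Drheo entry is the coefficient that multiplies u(i,j) or v(i,j) in the corresponding str term. formDiag_step2 then assembles the diagonal of A:

\begin{equation}
\mathrm{diagx}(i,j) = \texttt{umassdti} + \texttt{vrel}\cos\theta_w + C_b
 - \texttt{uarear}\sum_{c=1}^{4} \texttt{Drheo}(i,j,c),
\qquad
\mathrm{diagy}(i,j) = \cdots - \texttt{uarear}\sum_{c=5}^{8} \texttt{Drheo}(i,j,c),
\end{equation}

which the 'diag' (Jacobi) option and the pgmres preconditioner presumably use as a pointwise scaling in the precondition routine (not shown in this hunk).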
Compute squared l^2 norm of a grid function (tpu,tpv) + + subroutine calc_L2norm_squared (nx_block, ny_block, & + icellu , & + indxui , indxuj , & + tpu , tpv , & + L2norm) + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + icellu ! total count when iceumask is true + + integer (kind=int_kind), dimension (nx_block*ny_block), intent(in) :: & + indxui , & ! compressed index in i-direction + indxuj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block), intent(in) :: & + tpu , & ! x-component of vector grid function + tpv ! y-component of vector grid function + + real (kind=dbl_kind), intent(out) :: & + L2norm ! squared l^2 norm of vector grid function (tpu,tpv) + + ! local variables + + integer (kind=int_kind) :: & + i, j, ij + + character(len=*), parameter :: subname = '(calc_L2norm_squared)' + + !----------------------------------------------------------------- + ! compute squared l^2 norm of vector grid function (tpu,tpv) + !----------------------------------------------------------------- + + L2norm = c0 + + do ij = 1, icellu + i = indxui(ij) + j = indxuj(ij) + + L2norm = L2norm + tpu(i,j)**2 + tpv(i,j)**2 + enddo ! ij + + end subroutine calc_L2norm_squared + +!======================================================================= + +! Convert a grid function (tpu,tpv) to a one dimensional vector + + subroutine arrays_to_vec (nx_block, ny_block , & + nblocks , max_blocks, & + icellu , ntot , & + indxui , indxuj , & + tpu , tpv , & + outvec) + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + nblocks, & ! nb of blocks + max_blocks, & ! max nb of blocks + ntot ! size of problem for Anderson + + integer (kind=int_kind), dimension (max_blocks), intent(in) :: & + icellu + + integer (kind=int_kind), dimension (nx_block*ny_block, max_blocks), intent(in) :: & + indxui , & ! compressed index in i-direction + indxuj ! compressed index in j-direction + + real (kind=dbl_kind), dimension (nx_block,ny_block, max_blocks), intent(in) :: & + tpu , & ! x-component of vector + tpv ! y-component of vector + + real (kind=dbl_kind), dimension (ntot), intent(out) :: & + outvec ! output 1D vector + + ! local variables + + integer (kind=int_kind) :: & + i, j, iblk, tot, ij + + character(len=*), parameter :: subname = '(arrays_to_vec)' + + !----------------------------------------------------------------- + ! form vector (converts from max_blocks arrays to single vector) + !----------------------------------------------------------------- + + outvec(:) = c0 + tot = 0 + + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + tot = tot + 1 + outvec(tot) = tpu(i, j, iblk) + tot = tot + 1 + outvec(tot) = tpv(i, j, iblk) + enddo + enddo ! ij + + end subroutine arrays_to_vec + +!======================================================================= + +! Convert one dimensional vector to a grid function (tpu,tpv) + + subroutine vec_to_arrays (nx_block, ny_block , & + nblocks , max_blocks, & + icellu , ntot , & + indxui , indxuj , & + invec , & + tpu , tpv) + + integer (kind=int_kind), intent(in) :: & + nx_block, ny_block, & ! block dimensions + nblocks, & ! nb of blocks + max_blocks, & ! max nb of blocks + ntot ! size of problem for Anderson + + integer (kind=int_kind), dimension (max_blocks), intent(in) :: & + icellu + + integer (kind=int_kind), dimension (nx_block*ny_block, max_blocks), intent(in) :: & + indxui , & ! compressed index in i-direction + indxuj ! 
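A note on the packing convention used by arrays_to_vec and its inverse vec_to_arrays: the 1D vector interleaves the two components cell by cell, \texttt{outvec} = (u_1, v_1, u_2, v_2, \dots), running over the icellu cells of each block in order, so ntot is presumably twice the total icellu count; the LAPACK calls in the Anderson branch (dnrm2, ddot, dgemv, dtrsv) all operate on vectors in this ordering.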
compressed index in j-direction + + real (kind=dbl_kind), dimension (ntot), intent(in) :: & + invec ! input 1D vector + + real (kind=dbl_kind), dimension (nx_block,ny_block, max_blocks), intent(out) :: & + tpu , & ! x-component of vector + tpv ! y-component of vector + + ! local variables + + integer (kind=int_kind) :: & + i, j, iblk, tot, ij + + character(len=*), parameter :: subname = '(vec_to_arrays)' + + !----------------------------------------------------------------- + ! form arrays (converts from vector to the max_blocks arrays) + !----------------------------------------------------------------- + + tpu(:,:,:) = c0 + tpv(:,:,:) = c0 + tot = 0 + + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + tot = tot + 1 + tpu(i, j, iblk) = invec(tot) + tot = tot + 1 + tpv(i, j, iblk) = invec(tot) + enddo + enddo! ij + + end subroutine vec_to_arrays + +!======================================================================= + +! Update Q and R factors after deletion of the 1st column of G_diff +! +! author: P. Blain ECCC +! +! adapted from : +! H. F. Walker, “Anderson Acceleration: Algorithms and Implementations” +! [Online]. Available: https://users.wpi.edu/~walker/Papers/anderson_accn_algs_imps.pdf + + subroutine qr_delete(Q, R) + + real (kind=dbl_kind), intent(inout) :: & + Q(:,:), & ! Q factor + R(:,:) ! R factor + + ! local variables + + integer (kind=int_kind) :: & + i, j, k, & ! loop indices + m, n ! size of Q matrix + + real (kind=dbl_kind) :: & + temp, c, s + + character(len=*), parameter :: subname = '(qr_delete)' + + n = size(Q, 1) + m = size(Q, 2) + do i = 1, m-1 + temp = sqrt(R(i, i+1)**2 + R(i+1, i+1)**2) + c = R(i , i+1) / temp + s = R(i+1, i+1) / temp + R(i , i+1) = temp + R(i+1, i+1) = 0 + if (i < m-1) then + do j = i+2, m + temp = c*R(i, j) + s*R(i+1, j) + R(i+1, j) = -s*R(i, j) + c*R(i+1, j) + R(i , j) = temp + enddo + endif + do k = 1, n + temp = c*Q(k, i) + s*Q(k, i+1); + Q(k, i+1) = -s*Q(k, i) + c*Q(k, i+1); + Q(k, i) = temp + enddo + enddo + R(:, 1:m-1) = R(:, 2:m) + + end subroutine qr_delete + +!======================================================================= + +! FGMRES: Flexible generalized minimum residual method (with restarts). +! Solves the linear system A x = b using GMRES with a varying (right) preconditioner +! +! authors: Stéphane Gaudreault, Abdessamad Qaddouri, Philippe Blain, ECCC + + subroutine fgmres (zetaD , & + Cb , vrel , & + umassdti , & + halo_info_mask , & + bx , by , & + diagx , diagy , & + tolerance, maxinner, & + maxouter , & + solx , soly , & + nbiter) + + use ice_boundary, only: ice_HaloUpdate + use ice_domain, only: maskhalo_dyn, halo_info + use ice_timers, only: ice_timer_start, ice_timer_stop, timer_bound + + real (kind=dbl_kind), dimension(nx_block,ny_block,max_blocks,4), intent(in) :: & + zetaD ! zetaD = 2*zeta (viscous coefficient) + + real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks), intent(in) :: & + vrel , & ! coefficient for tauw + Cb , & ! seabed stress coefficient + umassdti ! mass of U-cell/dte (kg/m^2 s) + + type (ice_halo), intent(in) :: & + halo_info_mask ! ghost cell update info for masked halo + + real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks), intent(in) :: & + bx , & ! Right hand side of the linear system (x components) + by , & ! Right hand side of the linear system (y components) + diagx , & ! Diagonal of the system matrix (x components) + diagy ! 
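For reference, the sweep in qr_delete above is the standard Givens update from the Walker paper cited in its header: once the first column is dropped, R(:,2:m) is upper Hessenberg, and each rotation

\begin{equation}
G_i = \begin{pmatrix} c_i & s_i \\ -s_i & c_i \end{pmatrix},
\qquad
c_i = \frac{R_{i,i+1}}{\sqrt{R_{i,i+1}^{2} + R_{i+1,i+1}^{2}}},
\qquad
s_i = \frac{R_{i+1,i+1}}{\sqrt{R_{i,i+1}^{2} + R_{i+1,i+1}^{2}}},
\end{equation}

applied to rows i and i+1 of R zeroes the subdiagonal entry, while Q is post-multiplied by G_i^{T} so that the product Q R is unchanged; the final column shift R(:,1:m-1) = R(:,2:m) leaves an upper triangular factor one column narrower.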
Diagonal of the system matrix (y components) + + real (kind=dbl_kind), intent(in) :: & + tolerance ! Tolerance to achieve. The algorithm terminates when the relative + ! residual is below tolerance + + integer (kind=int_kind), intent(in) :: & + maxinner, & ! Restart the method every maxinner inner (Arnoldi) iterations + maxouter ! Maximum number of outer (restarts) iterations + ! Iteration will stop after maxinner*maxouter Arnoldi steps + ! even if the specified tolerance has not been achieved + + real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks), intent(inout) :: & + solx , & ! Initial guess on input, approximate solution on output (x components) + soly ! Initial guess on input, approximate solution on output (y components) + + integer (kind=int_kind), intent(out) :: & + nbiter ! Total number of Arnoldi iterations performed + + ! local variables + + integer (kind=int_kind) :: & + iblk , & ! block index + ij , & ! index for indx[t|u][i|j] + i, j ! grid indices + + real (kind=dbl_kind), dimension (nx_block,ny_block,max_blocks) :: & + workspace_x , & ! work vector (x components) + workspace_y ! work vector (y components) + + real (kind=dbl_kind), dimension (max_blocks) :: & + norm_squared ! array to accumulate squared norm of grid function over blocks + + real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks, maxinner+1) :: & + arnoldi_basis_x , & ! Arnoldi basis (x components) + arnoldi_basis_y ! Arnoldi basis (y components) + + real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks, maxinner) :: & + orig_basis_x , & ! original basis (x components) + orig_basis_y ! original basis (y components) + + real (kind=dbl_kind) :: & + norm_residual , & ! current L^2 norm of residual vector + inverse_norm , & ! inverse of the norm of a vector + nu, t ! local temporary values + + integer (kind=int_kind) :: & + initer , & ! inner (Arnoldi) loop counter + outiter , & ! outer (restarts) loop counter + nextit , & ! nextit == initer+1 + it, k, ii, jj ! reusable loop counters + + real (kind=dbl_kind), dimension(maxinner+1) :: & + rot_cos , & ! cosine elements of Givens rotations + rot_sin , & ! sine elements of Givens rotations + rhs_hess ! right hand side vector of the Hessenberg (least squares) system + + real (kind=dbl_kind), dimension(maxinner+1, maxinner) :: & + hessenberg ! system matrix of the Hessenberg (least squares) system + + character (len=char_len) :: & + precond_type ! type of preconditioner + + real (kind=dbl_kind) :: & + relative_tolerance ! relative_tolerance, i.e. tolerance*norm(initial residual) + + character(len=*), parameter :: subname = '(fgmres)' + + ! Here we go ! + + ! Initialize + outiter = 0 + nbiter = 0 + + norm_squared = c0 + precond_type = precond + + ! Cells with no ice should be zero-initialized + workspace_x = c0 + workspace_y = c0 + arnoldi_basis_x = c0 + arnoldi_basis_y = c0 + + ! 
Residual of the initial iterate + + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call matvec (nx_block , ny_block , & + icellu (iblk) , icellt (iblk), & + indxui (:,iblk) , indxuj (:,iblk), & + indxti (:,iblk) , indxtj (:,iblk), & + dxt (:,:,iblk) , dyt (:,:,iblk), & + dxhy (:,:,iblk) , dyhx (:,:,iblk), & + cxp (:,:,iblk) , cyp (:,:,iblk), & + cxm (:,:,iblk) , cym (:,:,iblk), & + solx (:,:,iblk) , soly (:,:,iblk), & + vrel (:,:,iblk) , Cb (:,:,iblk), & + zetaD (:,:,iblk,:), & + umassdti (:,:,iblk) , fm (:,:,iblk), & + uarear (:,:,iblk) , & + workspace_x(:,:,iblk) , workspace_y(:,:,iblk)) + call residual_vec (nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj (:,iblk), & + bx (:,:,iblk), by (:,:,iblk), & + workspace_x(:,:,iblk), workspace_y(:,:,iblk), & + arnoldi_basis_x (:,:,iblk, 1), & + arnoldi_basis_y (:,:,iblk, 1)) + enddo + !$OMP END PARALLEL DO + + ! Start outer (restarts) loop + do + ! Compute norm of initial residual + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call calc_L2norm_squared(nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj(:,iblk), & + arnoldi_basis_x(:,:,iblk, 1) , & + arnoldi_basis_y(:,:,iblk, 1) , & + norm_squared(iblk)) + + enddo + !$OMP END PARALLEL DO + norm_residual = sqrt(global_sum(sum(norm_squared), distrb_info)) + + if (my_task == master_task .and. monitor_fgmres) then + write(nu_diag, '(a,i4,a,d26.16)') "monitor_fgmres: iter_fgmres= ", nbiter, & + " fgmres_L2norm= ", norm_residual + endif + + ! Current guess is a good enough solution TODO: reactivate and test this + ! if (norm_residual < tolerance) then + ! return + ! end if + + ! Normalize the first Arnoldi vector + inverse_norm = c1 / norm_residual + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + + arnoldi_basis_x(i, j, iblk, 1) = arnoldi_basis_x(i, j, iblk, 1) * inverse_norm + arnoldi_basis_y(i, j, iblk, 1) = arnoldi_basis_y(i, j, iblk, 1) * inverse_norm + enddo ! ij + enddo + !$OMP END PARALLEL DO + + if (outiter == 0) then + relative_tolerance = tolerance * norm_residual + end if + + ! Initialize 1-st term of RHS of Hessenberg system + rhs_hess(1) = norm_residual + rhs_hess(2:) = c0 + + initer = 0 + + ! Start of inner (Arnoldi) loop + do + + nbiter = nbiter + 1 + initer = initer + 1 + nextit = initer + 1 + ! precondition the current Arnoldi vector + call precondition(zetaD , & + Cb , vrel , & + umassdti , & + arnoldi_basis_x(:,:,:,initer), & + arnoldi_basis_y(:,:,:,initer), & + diagx , diagy , & + precond_type, & + workspace_x , workspace_y) + orig_basis_x(:,:,:,initer) = workspace_x + orig_basis_y(:,:,:,initer) = workspace_y + + ! 
Update workspace with boundary values + call stack_velocity_field(workspace_x, workspace_y, fld2) + call ice_timer_start(timer_bound) + if (maskhalo_dyn) then + call ice_HaloUpdate (fld2, halo_info_mask, & + field_loc_NEcorner, field_type_vector) + else + call ice_HaloUpdate (fld2, halo_info, & + field_loc_NEcorner, field_type_vector) + endif + call ice_timer_stop(timer_bound) + call unstack_velocity_field(fld2, workspace_x, workspace_y) + + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call matvec (nx_block , ny_block , & + icellu (iblk) , icellt (iblk), & + indxui (:,iblk) , indxuj (:,iblk), & + indxti (:,iblk) , indxtj (:,iblk), & + dxt (:,:,iblk) , dyt (:,:,iblk), & + dxhy (:,:,iblk) , dyhx (:,:,iblk), & + cxp (:,:,iblk) , cyp (:,:,iblk), & + cxm (:,:,iblk) , cym (:,:,iblk), & + workspace_x(:,:,iblk) , workspace_y(:,:,iblk), & + vrel (:,:,iblk) , Cb (:,:,iblk), & + zetaD (:,:,iblk,:), & + umassdti (:,:,iblk) , fm (:,:,iblk), & + uarear (:,:,iblk) , & + arnoldi_basis_x(:,:,iblk,nextit), & + arnoldi_basis_y(:,:,iblk,nextit)) + enddo + !$OMP END PARALLEL DO + + ! Orthogonalize the new vector + call orthogonalize(ortho_type , initer , & + nextit , maxinner , & + arnoldi_basis_x, arnoldi_basis_y, & + hessenberg) + + ! Compute norm of new Arnoldi vector and update Hessenberg matrix + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call calc_L2norm_squared(nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj(:, iblk) , & + arnoldi_basis_x(:,:,iblk, nextit), & + arnoldi_basis_y(:,:,iblk, nextit), & + norm_squared(iblk)) + enddo + !$OMP END PARALLEL DO + hessenberg(nextit,initer) = sqrt(global_sum(sum(norm_squared), distrb_info)) + + ! Watch out for happy breakdown + if (.not. almost_zero( hessenberg(nextit,initer) ) ) then + ! Normalize next Arnoldi vector + inverse_norm = c1 / hessenberg(nextit,initer) + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + + arnoldi_basis_x(i, j, iblk, nextit) = arnoldi_basis_x(i, j, iblk, nextit)*inverse_norm + arnoldi_basis_y(i, j, iblk, nextit) = arnoldi_basis_y(i, j, iblk, nextit)*inverse_norm + enddo ! ij + enddo + !$OMP END PARALLEL DO + end if + + ! Apply previous Givens rotation to the last column of the Hessenberg matrix + if (initer > 1) then + do k = 2, initer + t = hessenberg(k-1, initer) + hessenberg(k-1, initer) = rot_cos(k-1)*t + rot_sin(k-1)*hessenberg(k, initer) + hessenberg(k, initer) = -rot_sin(k-1)*t + rot_cos(k-1)*hessenberg(k, initer) + end do + end if + + ! Compute and apply new Givens rotation + nu = sqrt(hessenberg(initer,initer)**2 + hessenberg(nextit,initer)**2) + if (.not. almost_zero(nu)) then + rot_cos(initer) = hessenberg(initer,initer) / nu + rot_sin(initer) = hessenberg(nextit,initer) / nu + + rhs_hess(nextit) = -rot_sin(initer) * rhs_hess(initer) + rhs_hess(initer) = rot_cos(initer) * rhs_hess(initer) + + hessenberg(initer,initer) = rot_cos(initer) * hessenberg(initer,initer) + rot_sin(initer) * hessenberg(nextit,initer) + end if + + ! Check for convergence + norm_residual = abs(rhs_hess(nextit)) + + if (my_task == master_task .and. monitor_fgmres) then + write(nu_diag, '(a,i4,a,d26.16)') "monitor_fgmres: iter_fgmres= ", nbiter, & + " fgmres_L2norm= ", norm_residual + endif + + if ((initer >= maxinner) .or. (norm_residual <= relative_tolerance)) then + exit + endif + + end do ! end of inner (Arnoldi) loop + + ! At this point either the maximum number of inner iterations + ! 
was reached or the absolute residual is below the scaled tolerance.
+
+         ! Solve the (now upper triangular) system "hessenberg * sol_hess = rhs_hess"
+         ! (sol_hess is stored in rhs_hess)
+         rhs_hess(initer) = rhs_hess(initer) / hessenberg(initer,initer)
+         do ii = 2, initer
+            k = initer - ii + 1
+            t = rhs_hess(k)
+            do j = k + 1, initer
+               t = t - hessenberg(k,j) * rhs_hess(j)
+            end do
+            rhs_hess(k) = t / hessenberg(k,k)
+         end do
+
+         ! Form linear combination to get new solution iterate
+         do it = 1, initer
+            t = rhs_hess(it)
+            !$OMP PARALLEL DO PRIVATE(iblk)
+            do iblk = 1, nblocks
+               do ij = 1, icellu(iblk)
+                  i = indxui(ij, iblk)
+                  j = indxuj(ij, iblk)
+
+                  solx(i, j, iblk) = solx(i, j, iblk) + t * orig_basis_x(i, j, iblk, it)
+                  soly(i, j, iblk) = soly(i, j, iblk) + t * orig_basis_y(i, j, iblk, it)
+               enddo ! ij
+            enddo
+            !$OMP END PARALLEL DO
+         end do
+
+         ! Increment outer loop counter and check for convergence
+         outiter = outiter + 1
+         if (norm_residual <= relative_tolerance .or. outiter >= maxouter) then
+            return
+         end if
+
+         ! Solution has not converged: compute residual vector and continue.
+
+         ! The residual vector is computed here using (see Saad p. 177):
+         ! \begin{equation}
+         !    r = V_{m+1} * Q_m^T * (\gamma_{m+1} * e_{m+1})
+         ! \end{equation}
+         ! where:
+         ! $r$ is the residual
+         ! $V_{m+1}$ is a matrix whose columns are the Arnoldi vectors from 1 to nextit (m+1)
+         ! $Q_m$ is the product of the Givens rotations: Q_m = G_m G_{m-1} ... G_1
+         ! $\gamma_{m+1}$ is the last element of rhs_hess
+         ! $e_{m+1}$ is the unit vector (0, 0, ..., 1)^T \in \reals^{m+1}
+
+         ! Apply the Givens rotations in reverse order to g := \gamma_{m+1} * e_{m+1},
+         ! store the result in rhs_hess
+         do it = 1, initer
+            jj = nextit - it + 1
+            rhs_hess(jj-1) = -rot_sin(jj-1) * rhs_hess(jj) ! + rot_cos(jj-1) * g(jj-1) (== 0)
+            rhs_hess(jj)   =  rot_cos(jj-1) * rhs_hess(jj) ! + rot_sin(jj-1) * g(jj-1) (== 0)
+         end do
+
+         ! Compute the residual by multiplying V_{m+1} and rhs_hess
+         workspace_x = c0
+         workspace_y = c0
+         do it = 1, nextit
+            !$OMP PARALLEL DO PRIVATE(iblk)
+            do iblk = 1, nblocks
+               do ij = 1, icellu(iblk)
+                  i = indxui(ij, iblk)
+                  j = indxuj(ij, iblk)
+
+                  workspace_x(i, j, iblk) = workspace_x(i, j, iblk) + rhs_hess(it) * arnoldi_basis_x(i, j, iblk, it)
+                  workspace_y(i, j, iblk) = workspace_y(i, j, iblk) + rhs_hess(it) * arnoldi_basis_y(i, j, iblk, it)
+               enddo ! ij
+            enddo
+            !$OMP END PARALLEL DO
+            arnoldi_basis_x(:,:,:,1) = workspace_x
+            arnoldi_basis_y(:,:,:,1) = workspace_y
+         end do
+      end do ! end of outer (restarts) loop
+
+      end subroutine fgmres
+
+!=======================================================================
+
+! PGMRES: Right-preconditioned generalized minimum residual method (with restarts).
+! Solves the linear system A x = b using GMRES with a right preconditioner
+!
+! authors: Stéphane Gaudreault, Abdessamad Qaddouri, Philippe Blain, ECCC
+
+      subroutine pgmres (zetaD    ,            &
+                         Cb       , vrel     , &
+                         umassdti ,            &
+                         bx       , by       , &
+                         diagx    , diagy    , &
+                         tolerance, maxinner , &
+                         maxouter ,            &
+                         solx     , soly     , &
+                         nbiter)
+
+      real (kind=dbl_kind), dimension(nx_block,ny_block,max_blocks,4), intent(in) :: &
+         zetaD       ! zetaD = 2*zeta (viscous coefficient)
+
+      real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks), intent(in) :: &
+         vrel     , & ! coefficient for tauw
+         Cb       , & ! seabed stress coefficient
+         umassdti     ! mass of U-cell/dte (kg/m^2 s)
+
+      real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks), intent(in) :: &
+         bx       , & ! Right hand side of the linear system (x components)
+         by           ! 
Right hand side of the linear system (y components) + + real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks), intent(in) :: & + diagx , & ! Diagonal of the system matrix (x components) + diagy ! Diagonal of the system matrix (y components) + + real (kind=dbl_kind), intent(in) :: & + tolerance ! Tolerance to achieve. The algorithm terminates when the relative + ! residual is below tolerance + + integer (kind=int_kind), intent(in) :: & + maxinner, & ! Restart the method every maxinner inner (Arnoldi) iterations + maxouter ! Maximum number of outer (restarts) iterations + ! Iteration will stop after maxinner*maxouter Arnoldi steps + ! even if the specified tolerance has not been achieved + + real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks), intent(inout) :: & + solx , & ! Initial guess on input, approximate solution on output (x components) + soly ! Initial guess on input, approximate solution on output (y components) + + integer (kind=int_kind), intent(out) :: & + nbiter ! Total number of Arnoldi iterations performed + + ! local variables + + integer (kind=int_kind) :: & + iblk , & ! block index + ij , & ! index for indx[t|u][i|j] + i, j ! grid indices + + real (kind=dbl_kind), dimension (nx_block,ny_block,max_blocks) :: & + workspace_x , & ! work vector (x components) + workspace_y ! work vector (y components) + + real (kind=dbl_kind), dimension (max_blocks) :: & + norm_squared ! array to accumulate squared norm of grid function over blocks + + real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks, maxinner+1) :: & + arnoldi_basis_x , & ! Arnoldi basis (x components) + arnoldi_basis_y ! Arnoldi basis (y components) + + real (kind=dbl_kind) :: & + norm_residual , & ! current L^2 norm of residual vector + inverse_norm , & ! inverse of the norm of a vector + nu, t ! local temporary values + + integer (kind=int_kind) :: & + initer , & ! inner (Arnoldi) loop counter + outiter , & ! outer (restarts) loop counter + nextit , & ! nextit == initer+1 + it, k, ii, jj ! reusable loop counters + + real (kind=dbl_kind), dimension(maxinner+1) :: & + rot_cos , & ! cosine elements of Givens rotations + rot_sin , & ! sine elements of Givens rotations + rhs_hess ! right hand side vector of the Hessenberg (least squares) system + + real (kind=dbl_kind), dimension(maxinner+1, maxinner) :: & + hessenberg ! system matrix of the Hessenberg (least squares) system + + character(len=char_len) :: & + precond_type , & ! type of preconditioner + ortho_type ! type of orthogonalization + + real (kind=dbl_kind) :: & + relative_tolerance ! relative_tolerance, i.e. tolerance*norm(initial residual) + + character(len=*), parameter :: subname = '(pgmres)' + + ! Here we go ! + + ! Initialize + outiter = 0 + nbiter = 0 + + norm_squared = c0 + precond_type = 'diag' ! Jacobi preconditioner + ortho_type = 'cgs' ! classical gram-schmidt TODO: try with MGS + + ! Cells with no ice should be zero-initialized + workspace_x = c0 + workspace_y = c0 + arnoldi_basis_x = c0 + arnoldi_basis_y = c0 + + ! 
Residual of the initial iterate + + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call matvec (nx_block , ny_block , & + icellu (iblk) , icellt (iblk), & + indxui (:,iblk) , indxuj (:,iblk), & + indxti (:,iblk) , indxtj (:,iblk), & + dxt (:,:,iblk) , dyt (:,:,iblk), & + dxhy (:,:,iblk) , dyhx (:,:,iblk), & + cxp (:,:,iblk) , cyp (:,:,iblk), & + cxm (:,:,iblk) , cym (:,:,iblk), & + solx (:,:,iblk) , soly (:,:,iblk), & + vrel (:,:,iblk) , Cb (:,:,iblk), & + zetaD (:,:,iblk,:), & + umassdti (:,:,iblk) , fm (:,:,iblk), & + uarear (:,:,iblk) , & + workspace_x(:,:,iblk) , workspace_y(:,:,iblk)) + call residual_vec (nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj (:,iblk), & + bx (:,:,iblk), by (:,:,iblk), & + workspace_x(:,:,iblk), workspace_y(:,:,iblk), & + arnoldi_basis_x (:,:,iblk, 1), & + arnoldi_basis_y (:,:,iblk, 1)) + enddo + !$OMP END PARALLEL DO + + ! Start outer (restarts) loop + do + ! Compute norm of initial residual + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call calc_L2norm_squared(nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj(:, iblk), & + arnoldi_basis_x(:,:,iblk, 1), & + arnoldi_basis_y(:,:,iblk, 1), & + norm_squared(iblk)) + + enddo + !$OMP END PARALLEL DO + norm_residual = sqrt(global_sum(sum(norm_squared), distrb_info)) + + if (my_task == master_task .and. monitor_pgmres) then + write(nu_diag, '(a,i4,a,d26.16)') "monitor_pgmres: iter_pgmres= ", nbiter, & + " pgmres_L2norm= ", norm_residual + endif + + ! Current guess is a good enough solution + ! if (norm_residual < tolerance) then + ! return + ! end if + + ! Normalize the first Arnoldi vector + inverse_norm = c1 / norm_residual + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + + arnoldi_basis_x(i, j, iblk, 1) = arnoldi_basis_x(i, j, iblk, 1) * inverse_norm + arnoldi_basis_y(i, j, iblk, 1) = arnoldi_basis_y(i, j, iblk, 1) * inverse_norm + enddo ! ij + enddo + !$OMP END PARALLEL DO + + if (outiter == 0) then + relative_tolerance = tolerance * norm_residual + end if + + ! Initialize 1-st term of RHS of Hessenberg system + rhs_hess(1) = norm_residual + rhs_hess(2:) = c0 + + initer = 0 + + ! Start of inner (Arnoldi) loop + do + + nbiter = nbiter + 1 + initer = initer + 1 + nextit = initer + 1 + + ! precondition the current Arnoldi vector + call precondition(zetaD , & + Cb , vrel , & + umassdti , & + arnoldi_basis_x(:,:,:,initer), & + arnoldi_basis_y(:,:,:,initer), & + diagx , diagy , & + precond_type, & + workspace_x , workspace_y) + + ! NOTE: halo updates for (workspace_x, workspace_y) + ! are skipped here for efficiency since this is just a preconditioner + + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call matvec (nx_block , ny_block , & + icellu (iblk) , icellt (iblk), & + indxui (:,iblk) , indxuj (:,iblk), & + indxti (:,iblk) , indxtj (:,iblk), & + dxt (:,:,iblk) , dyt (:,:,iblk), & + dxhy (:,:,iblk) , dyhx (:,:,iblk), & + cxp (:,:,iblk) , cyp (:,:,iblk), & + cxm (:,:,iblk) , cym (:,:,iblk), & + workspace_x(:,:,iblk) , workspace_y(:,:,iblk), & + vrel (:,:,iblk) , Cb (:,:,iblk), & + zetaD (:,:,iblk,:), & + umassdti (:,:,iblk) , fm (:,:,iblk), & + uarear (:,:,iblk) , & + arnoldi_basis_x(:,:,iblk,nextit), & + arnoldi_basis_y(:,:,iblk,nextit)) + enddo + !$OMP END PARALLEL DO + + ! Orthogonalize the new vector + call orthogonalize(ortho_type , initer , & + nextit , maxinner , & + arnoldi_basis_x, arnoldi_basis_y, & + hessenberg) + + ! 
Compute norm of new Arnoldi vector and update Hessenberg matrix + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + call calc_L2norm_squared(nx_block , ny_block , & + icellu (iblk), & + indxui (:,iblk), indxuj(:, iblk) , & + arnoldi_basis_x(:,:,iblk, nextit), & + arnoldi_basis_y(:,:,iblk, nextit), & + norm_squared(iblk)) + enddo + !$OMP END PARALLEL DO + hessenberg(nextit,initer) = sqrt(global_sum(sum(norm_squared), distrb_info)) + + ! Watch out for happy breakdown + if (.not. almost_zero( hessenberg(nextit,initer) ) ) then + ! Normalize next Arnoldi vector + inverse_norm = c1 / hessenberg(nextit,initer) + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + + arnoldi_basis_x(i, j, iblk, nextit) = arnoldi_basis_x(i, j, iblk, nextit)*inverse_norm + arnoldi_basis_y(i, j, iblk, nextit) = arnoldi_basis_y(i, j, iblk, nextit)*inverse_norm + enddo ! ij + enddo + !$OMP END PARALLEL DO + end if + + ! Apply previous Givens rotation to the last column of the Hessenberg matrix + if (initer > 1) then + do k = 2, initer + t = hessenberg(k-1, initer) + hessenberg(k-1, initer) = rot_cos(k-1)*t + rot_sin(k-1)*hessenberg(k, initer) + hessenberg(k, initer) = -rot_sin(k-1)*t + rot_cos(k-1)*hessenberg(k, initer) + end do + end if + + ! Compute and apply new Givens rotation + nu = sqrt(hessenberg(initer,initer)**2 + hessenberg(nextit,initer)**2) + if (.not. almost_zero(nu)) then + rot_cos(initer) = hessenberg(initer,initer) / nu + rot_sin(initer) = hessenberg(nextit,initer) / nu + + rhs_hess(nextit) = -rot_sin(initer) * rhs_hess(initer) + rhs_hess(initer) = rot_cos(initer) * rhs_hess(initer) + + hessenberg(initer,initer) = rot_cos(initer) * hessenberg(initer,initer) + rot_sin(initer) * hessenberg(nextit,initer) + end if + + ! Check for convergence + norm_residual = abs(rhs_hess(nextit)) + + if (my_task == master_task .and. monitor_pgmres) then + write(nu_diag, '(a,i4,a,d26.16)') "monitor_pgmres: iter_pgmres= ", nbiter, & + " pgmres_L2norm= ", norm_residual + endif + + if ((initer >= maxinner) .or. (norm_residual <= relative_tolerance)) then + exit + endif + + end do ! end of inner (Arnoldi) loop + + ! At this point either the maximum number of inner iterations + ! was reached or the absolute residual is below the scaled tolerance. + + ! Solve the (now upper triangular) system "hessenberg * sol_hess = rhs_hess" + ! (sol_hess is stored in rhs_hess) + rhs_hess(initer) = rhs_hess(initer) / hessenberg(initer,initer) + do ii = 2, initer + k = initer - ii + 1 + t = rhs_hess(k) + do j = k + 1, initer + t = t - hessenberg(k,j) * rhs_hess(j) + end do + rhs_hess(k) = t / hessenberg(k,k) + end do + + ! Form linear combination to get new solution iterate + workspace_x = c0 + workspace_y = c0 + do it = 1, initer + t = rhs_hess(it) + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + + workspace_x(i, j, iblk) = workspace_x(i, j, iblk) + t * arnoldi_basis_x(i, j, iblk, it) + workspace_y(i, j, iblk) = workspace_y(i, j, iblk) + t * arnoldi_basis_y(i, j, iblk, it) + enddo ! ij + enddo + !$OMP END PARALLEL DO + end do + + ! Call preconditioner + call precondition(zetaD , & + Cb , vrel , & + umassdti , & + workspace_x , workspace_y, & + diagx , diagy , & + precond_type, & + workspace_x , workspace_y) + + solx = solx + workspace_x + soly = soly + workspace_y + + ! 
Increment outer loop counter and check for convergence
+         outiter = outiter + 1
+         if (norm_residual <= relative_tolerance .or. outiter >= maxouter) then
+            return
+         end if
+
+         ! Solution has not converged: compute residual vector and continue.
+
+         ! The residual vector is computed here using (see Saad p. 177):
+         ! \begin{equation}
+         !    r = V_{m+1} * Q_m^T * (\gamma_{m+1} * e_{m+1})
+         ! \end{equation}
+         ! where:
+         ! $r$ is the residual
+         ! $V_{m+1}$ is a matrix whose columns are the Arnoldi vectors from 1 to nextit (m+1)
+         ! $Q_m$ is the product of the Givens rotations: Q_m = G_m G_{m-1} ... G_1
+         ! $\gamma_{m+1}$ is the last element of rhs_hess
+         ! $e_{m+1}$ is the unit vector (0, 0, ..., 1)^T \in \reals^{m+1}
+
+         ! Apply the Givens rotations in reverse order to g := \gamma_{m+1} * e_{m+1},
+         ! store the result in rhs_hess
+         do it = 1, initer
+            jj = nextit - it + 1
+            rhs_hess(jj-1) = -rot_sin(jj-1) * rhs_hess(jj) ! + rot_cos(jj-1) * g(jj-1) (== 0)
+            rhs_hess(jj)   =  rot_cos(jj-1) * rhs_hess(jj) ! + rot_sin(jj-1) * g(jj-1) (== 0)
+         end do
+
+         ! Compute the residual by multiplying V_{m+1} and rhs_hess
+         workspace_x = c0
+         workspace_y = c0
+         do it = 1, nextit
+            !$OMP PARALLEL DO PRIVATE(iblk)
+            do iblk = 1, nblocks
+               do ij = 1, icellu(iblk)
+                  i = indxui(ij, iblk)
+                  j = indxuj(ij, iblk)
+
+                  workspace_x(i, j, iblk) = workspace_x(i, j, iblk) + rhs_hess(it) * arnoldi_basis_x(i, j, iblk, it)
+                  workspace_y(i, j, iblk) = workspace_y(i, j, iblk) + rhs_hess(it) * arnoldi_basis_y(i, j, iblk, it)
+               enddo ! ij
+            enddo
+            !$OMP END PARALLEL DO
+            arnoldi_basis_x(:,:,:,1) = workspace_x
+            arnoldi_basis_y(:,:,:,1) = workspace_y
+         end do
+      end do ! end of outer (restarts) loop
+
+      end subroutine pgmres
+
+!=======================================================================
+
+! Generic routine to precondition a vector
+!
+! authors: Philippe Blain, ECCC
+
+      subroutine precondition(zetaD       ,        &
+                              Cb          , vrel , &
+                              umassdti    ,        &
+                              vx          , vy   , &
+                              diagx       , diagy, &
+                              precond_type,        &
+                              wx          , wy)
+
+      real (kind=dbl_kind), dimension(nx_block,ny_block,max_blocks,4), intent(in) :: &
+         zetaD       ! zetaD = 2*zeta (viscous coefficient)
+
+      real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks), intent(in) :: &
+         vrel     , & ! coefficient for tauw
+         Cb       , & ! seabed stress coefficient
+         umassdti     ! mass of U-cell/dte (kg/m^2 s)
+
+      real (kind=dbl_kind), dimension (nx_block,ny_block,max_blocks), intent(in) :: &
+         vx       , & ! input vector (x components)
+         vy           ! input vector (y components)
+
+      real (kind=dbl_kind), dimension (nx_block,ny_block,max_blocks), intent(in) :: &
+         diagx    , & ! diagonal of the system matrix (x components)
+         diagy        ! diagonal of the system matrix (y components)
+
+      character (len=char_len), intent(in) :: &
+         precond_type ! type of preconditioner
+
+      real (kind=dbl_kind), dimension (nx_block,ny_block,max_blocks), intent(inout) :: &
+         wx       , & ! preconditioned vector (x components)
+         wy           ! preconditioned vector (y components)
+
+      ! local variables
+
+      integer (kind=int_kind) :: &
+         iblk     , & ! block index
+         ij       , & ! compressed index
+         i, j         ! grid indices
+
+      real (kind=dbl_kind) :: &
+         tolerance    ! Tolerance for PGMRES
+
+      integer (kind=int_kind) :: &
+         maxinner     ! Restart parameter for PGMRES
+
+      integer (kind=int_kind) :: &
+         maxouter     ! Maximum number of outer iterations for PGMRES
+
+      integer (kind=int_kind) :: &
+         nbiter       ! Total number of PGMRES iterations performed
+
+      character(len=*), parameter :: subname = '(precondition)'
+
+      if (precond_type == 'ident') then ! 
identity (no preconditioner) + wx = vx + wy = vy + elseif (precond_type == 'diag') then ! Jacobi preconditioner (diagonal) + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + + wx(i,j,iblk) = vx(i,j,iblk) / diagx(i,j,iblk) + wy(i,j,iblk) = vy(i,j,iblk) / diagy(i,j,iblk) + enddo ! ij + enddo + !$OMP END PARALLEL DO + elseif (precond_type == 'pgmres') then ! PGMRES (Jacobi-preconditioned GMRES) + ! Initialize preconditioned vector to 0 ! TODO: try with wx = vx or vx/diagx + wx = c0 + wy = c0 + tolerance = reltol_pgmres + maxinner = dim_pgmres + maxouter = maxits_pgmres + call pgmres (zetaD, & + Cb , vrel , & + umassdti , & + vx , vy , & + diagx , diagy , & + tolerance, maxinner, & + maxouter , & + wx , wy , & + nbiter) + else + call abort_ice(error_message='wrong preconditioner in ' // subname, & + file=__FILE__, line=__LINE__) + endif + end subroutine precondition + +!======================================================================= + +! Generic routine to orthogonalize a vector (arnoldi_basis_[xy](:, :, :, nextit)) +! against a set of vectors (arnoldi_basis_[xy](:, :, :, 1:initer)) +! +! authors: Philippe Blain, ECCC + + subroutine orthogonalize(ortho_type , initer , & + nextit , maxinner , & + arnoldi_basis_x, arnoldi_basis_y, & + hessenberg) + + character(len=*), intent(in) :: & + ortho_type ! type of orthogonalization + + integer (kind=int_kind), intent(in) :: & + initer , & ! inner (Arnoldi) loop counter + nextit , & ! nextit == initer+1 + maxinner ! Restart the method every maxinner inner iterations + + real (kind=dbl_kind), dimension(nx_block, ny_block, max_blocks, maxinner+1), intent(inout) :: & + arnoldi_basis_x , & ! arnoldi basis (x components) + arnoldi_basis_y ! arnoldi basis (y components) + + real (kind=dbl_kind), dimension(maxinner+1, maxinner), intent(inout) :: & + hessenberg ! system matrix of the Hessenberg (least squares) system + + ! local variables + + integer (kind=int_kind) :: & + it , & ! reusable loop counter + iblk , & ! block index + ij , & ! compressed index + i, j ! grid indices + + real (kind=dbl_kind), dimension (max_blocks) :: & + local_dot ! local array value to accumulate dot product of grid function over blocks + + real (kind=dbl_kind), dimension(maxinner) :: & + dotprod_local ! local array to accumulate several dot product computations + + character(len=*), parameter :: subname = '(orthogonalize)' + + if (trim(ortho_type) == 'cgs') then ! Classical Gram-Schmidt + ! Classical Gram-Schmidt orthogonalisation process + ! First loop of Gram-Schmidt (compute coefficients) + dotprod_local = c0 + do it = 1, initer + local_dot = c0 + + !$OMP PARALLEL DO PRIVATE(iblk, ij, i, j) + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + + local_dot(iblk) = local_dot(iblk) + & + (arnoldi_basis_x(i, j, iblk, it) * arnoldi_basis_x(i, j, iblk, nextit)) + & + (arnoldi_basis_y(i, j, iblk, it) * arnoldi_basis_y(i, j, iblk, nextit)) + enddo ! ij + enddo + !$OMP END PARALLEL DO + + dotprod_local(it) = sum(local_dot) + end do + + hessenberg(1:initer, initer) = global_allreduce_sum(dotprod_local(1:initer), distrb_info) + + ! 
Second loop of Gram-Schmidt (orthonormalize) + do it = 1, initer + !$OMP PARALLEL DO PRIVATE(iblk) + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + + arnoldi_basis_x(i, j, iblk, nextit) = arnoldi_basis_x(i, j, iblk, nextit) & + - hessenberg(it,initer) * arnoldi_basis_x(i, j, iblk, it) + arnoldi_basis_y(i, j, iblk, nextit) = arnoldi_basis_y(i, j, iblk, nextit) & + - hessenberg(it,initer) * arnoldi_basis_y(i, j, iblk, it) + enddo ! ij + enddo + !$OMP END PARALLEL DO + end do + elseif (trim(ortho_type) == 'mgs') then ! Modified Gram-Schmidt + ! Modified Gram-Schmidt orthogonalisation process + do it = 1, initer + local_dot = c0 + + !$OMP PARALLEL DO PRIVATE(iblk, ij, i, j) + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + + local_dot(iblk) = local_dot(iblk) + & + (arnoldi_basis_x(i, j, iblk, it) * arnoldi_basis_x(i, j, iblk, nextit)) + & + (arnoldi_basis_y(i, j, iblk, it) * arnoldi_basis_y(i, j, iblk, nextit)) + enddo ! ij + enddo + !$OMP END PARALLEL DO + + hessenberg(it,initer) = global_sum(sum(local_dot), distrb_info) + + !$OMP PARALLEL DO PRIVATE(iblk, ij, i, j) + do iblk = 1, nblocks + do ij = 1, icellu(iblk) + i = indxui(ij, iblk) + j = indxuj(ij, iblk) + + arnoldi_basis_x(i, j, iblk, nextit) = arnoldi_basis_x(i, j, iblk, nextit) & + - hessenberg(it,initer) * arnoldi_basis_x(i, j, iblk, it) + arnoldi_basis_y(i, j, iblk, nextit) = arnoldi_basis_y(i, j, iblk, nextit) & + - hessenberg(it,initer) * arnoldi_basis_y(i, j, iblk, it) + enddo ! ij + enddo + !$OMP END PARALLEL DO + end do + else + call abort_ice(error_message='wrong orthonalization in ' // subname, & + file=__FILE__, line=__LINE__) + endif + + end subroutine orthogonalize + +!======================================================================= + +! Check if value A is close to zero, up to machine precision +! +!author +! Stéphane Gaudreault, ECCC -- June 2014 +! +!revision +! v4-80 - Gaudreault S. - gfortran compatibility +! 2019 - Philippe Blain, ECCC - converted to CICE standards + + logical function almost_zero(A) result(retval) + + real (kind=dbl_kind), intent(in) :: A + + ! local variables + + character(len=*), parameter :: subname = '(almost_zero)' + + integer (kind=int8_kind) :: aBit + integer (kind=int8_kind), parameter :: two_complement = int(Z'80000000', kind=int8_kind) + aBit = 0 + aBit = transfer(A, aBit) + if (aBit < 0) then + aBit = two_complement - aBit + end if + ! 
lexicographic order test with a tolerance of 1 adjacent float + retval = (abs(aBit) <= 1) + + end function almost_zero + +!======================================================================= + + end module ice_dyn_vp + +!======================================================================= diff --git a/cicecore/cicedynB/general/ice_init.F90 b/cicecore/cicedynB/general/ice_init.F90 index f2eaae17d..fb9c45978 100644 --- a/cicecore/cicedynB/general/ice_init.F90 +++ b/cicecore/cicedynB/general/ice_init.F90 @@ -100,6 +100,11 @@ subroutine input_data basalstress, k1, k2, alphab, threshold_hw, & Ktens, e_ratio, coriolis, ssh_stress, & kridge, ktransport, brlx, arlx + use ice_dyn_vp, only: maxits_nonlin, precond, dim_fgmres, dim_pgmres, maxits_fgmres, & + maxits_pgmres, monitor_nonlin, monitor_fgmres, & + monitor_pgmres, reltol_nonlin, reltol_fgmres, reltol_pgmres, & + algo_nonlin, fpfunc_andacc, dim_andacc, reltol_andacc, & + damping_andacc, start_andacc, use_mean_vrel, ortho_type use ice_transport_driver, only: advection, conserv_check use ice_restoring, only: restore_ice #ifdef CESMCOUPLED @@ -194,7 +199,13 @@ subroutine input_data advection, coriolis, kridge, ktransport, & kstrength, krdg_partic, krdg_redist, mu_rdg, & e_ratio, Ktens, Cf, basalstress, & - k1, k2, alphab, threshold_hw, & + k1, maxits_nonlin, precond, dim_fgmres, & + dim_pgmres, maxits_fgmres, maxits_pgmres, monitor_nonlin, & + monitor_fgmres, monitor_pgmres, reltol_nonlin, reltol_fgmres, & + reltol_pgmres, algo_nonlin, dim_andacc, reltol_andacc, & + damping_andacc, start_andacc, fpfunc_andacc, use_mean_vrel, & + ortho_type, & + k2, alphab, threshold_hw, & Pstar, Cstar namelist /shortwave_nml/ & @@ -322,7 +333,27 @@ subroutine input_data alphab = 20.0_dbl_kind ! alphab=Cb factor in Lemieux et al 2015 threshold_hw = 30.0_dbl_kind ! max water depth for grounding Ktens = 0.0_dbl_kind ! T=Ktens*P (tensile strength: see Konig and Holland, 2010) - e_ratio = 2.0_dbl_kind ! EVP ellipse aspect ratio + e_ratio = 2.0_dbl_kind ! VP ellipse aspect ratio + maxits_nonlin = 4 ! max nb of iteration for nonlinear solver + precond = 'pgmres' ! preconditioner for fgmres: 'ident' (identity), 'diag' (diagonal), 'pgmres' (Jacobi-preconditioned GMRES) + dim_fgmres = 50 ! size of fgmres Krylov subspace + dim_pgmres = 5 ! size of pgmres Krylov subspace + maxits_fgmres = 50 ! max nb of iteration for fgmres + maxits_pgmres = 5 ! max nb of iteration for pgmres + monitor_nonlin = .false. ! print nonlinear residual norm + monitor_fgmres = .false. ! print fgmres residual norm + monitor_pgmres = .false. ! print pgmres residual norm + ortho_type = 'mgs' ! orthogonalization procedure 'cgs' or 'mgs' + reltol_nonlin = 1e-8_dbl_kind ! nonlinear stopping criterion: reltol_nonlin*res(k=0) + reltol_fgmres = 1e-2_dbl_kind ! fgmres stopping criterion: reltol_fgmres*res(k) + reltol_pgmres = 1e-6_dbl_kind ! pgmres stopping criterion: reltol_pgmres*res(k) + algo_nonlin = 'picard' ! nonlinear algorithm: 'picard' (Picard iteration), 'anderson' (Anderson acceleration) + fpfunc_andacc = 1 ! fixed point function for Anderson acceleration: 1: g(x) = FMGRES(A(x),b(x)), 2: g(x) = x - A(x)x + b(x) + dim_andacc = 5 ! size of Anderson minimization matrix (number of saved previous residuals) + reltol_andacc = 1e-6_dbl_kind ! relative tolerance for Anderson acceleration + damping_andacc = 0 ! damping factor for Anderson acceleration + start_andacc = 0 ! acceleration delay factor (acceleration starts at this iteration) + use_mean_vrel = .true. ! 
use mean of previous 2 iterates to compute vrel advection = 'remap' ! incremental remapping transport scheme conserv_check = .false.! tracer conservation check shortwave = 'ccsm3' ! 'ccsm3' or 'dEdd' (delta-Eddington) @@ -628,6 +659,26 @@ subroutine input_data call broadcast_scalar(ssh_stress, master_task) call broadcast_scalar(kridge, master_task) call broadcast_scalar(ktransport, master_task) + call broadcast_scalar(maxits_nonlin, master_task) + call broadcast_scalar(precond, master_task) + call broadcast_scalar(dim_fgmres, master_task) + call broadcast_scalar(dim_pgmres, master_task) + call broadcast_scalar(maxits_fgmres, master_task) + call broadcast_scalar(maxits_pgmres, master_task) + call broadcast_scalar(monitor_nonlin, master_task) + call broadcast_scalar(monitor_fgmres, master_task) + call broadcast_scalar(monitor_pgmres, master_task) + call broadcast_scalar(ortho_type, master_task) + call broadcast_scalar(reltol_nonlin, master_task) + call broadcast_scalar(reltol_fgmres, master_task) + call broadcast_scalar(reltol_pgmres, master_task) + call broadcast_scalar(algo_nonlin, master_task) + call broadcast_scalar(fpfunc_andacc, master_task) + call broadcast_scalar(dim_andacc, master_task) + call broadcast_scalar(reltol_andacc, master_task) + call broadcast_scalar(damping_andacc, master_task) + call broadcast_scalar(start_andacc, master_task) + call broadcast_scalar(use_mean_vrel, master_task) call broadcast_scalar(conduct, master_task) call broadcast_scalar(R_ice, master_task) call broadcast_scalar(R_pnd, master_task) @@ -831,7 +882,7 @@ subroutine input_data revised_evp = .false. endif - if (kdyn > 2) then + if (kdyn > 3) then if (my_task == master_task) then write(nu_diag,*) subname//' WARNING: kdyn out of range' endif @@ -1037,6 +1088,38 @@ subroutine input_data endif endif + ! Implicit solver input validation + if (kdyn == 3) then + if (.not. (trim(algo_nonlin) == 'picard' .or. trim(algo_nonlin) == 'anderson')) then + if (my_task == master_task) then + write(nu_diag,*) subname//' ERROR: unknown algo_nonlin: '//algo_nonlin + write(nu_diag,*) subname//' ERROR: allowed values: ''picard'', ''anderson''' + endif + abort_list = trim(abort_list)//":60" + endif + + if (trim(algo_nonlin) == 'picard') then + ! Picard solver is implemented in the Anderson solver; reset number of saved residuals to zero + dim_andacc = 0 + endif + + if (.not. (trim(precond) == 'ident' .or. trim(precond) == 'diag' .or. trim(precond) == 'pgmres')) then + if (my_task == master_task) then + write(nu_diag,*) subname//' ERROR: unknown precond: '//precond + write(nu_diag,*) subname//' ERROR: allowed values: ''ident'', ''diag'', ''pgmres''' + endif + abort_list = trim(abort_list)//":61" + endif + + if (.not. (trim(ortho_type) == 'cgs' .or. 
trim(ortho_type) == 'mgs')) then + if (my_task == master_task) then + write(nu_diag,*) subname//' ERROR: unknown ortho_type: '//ortho_type + write(nu_diag,*) subname//' ERROR: allowed values: ''cgs'', ''mgs''' + endif + abort_list = trim(abort_list)//":62" + endif + endif + ice_IOUnitsMinUnit = numin ice_IOUnitsMaxUnit = numax @@ -1139,28 +1222,35 @@ subroutine input_data write(nu_diag,*) '--------------------------------' if (kdyn == 1) then tmpstr2 = ' elastic-viscous-plastic dynamics' - write(nu_diag,*) 'yield_curve = ', trim(yield_curve) - if (trim(yield_curve) == 'ellipse') & - write(nu_diag,1007) ' e_ratio = ', e_ratio, ' aspect ratio of ellipse' elseif (kdyn == 2) then tmpstr2 = ' elastic-anisotropic-plastic dynamics' + elseif (kdyn == 3) then + tmpstr2 = ' viscous-plastic dynamics' elseif (kdyn < 1) then tmpstr2 = ' dynamics disabled' endif write(nu_diag,1022) ' kdyn = ', kdyn,trim(tmpstr2) if (kdyn >= 1) then - if (revised_evp) then - tmpstr2 = ' revised EVP formulation used' - else - tmpstr2 = ' revised EVP formulation not used' - endif - write(nu_diag,1012) ' revised_evp = ', revised_evp,trim(tmpstr2) - write(nu_diag,1022) ' kevp_kernel = ', kevp_kernel,' EVP solver' + if (kdyn == 1 .or. kdyn == 2) then + if (revised_evp) then + tmpstr2 = ' revised EVP formulation used' + write(nu_diag,1007) ' arlx = ', arlx, ' stress equation factor alpha' + write(nu_diag,1007) ' brlx = ', brlx, ' stress equation factor beta' + else + tmpstr2 = ' revised EVP formulation not used' + endif + write(nu_diag,1012) ' revised_evp = ', revised_evp,trim(tmpstr2) + write(nu_diag,1022) ' kevp_kernel = ', kevp_kernel,' EVP solver' + + write(nu_diag,1022) ' ndtd = ', ndtd, ' number of dynamics/advection/ridging/steps per thermo timestep' + write(nu_diag,1022) ' ndte = ', ndte, ' number of EVP or EAP subcycles' + endif - write(nu_diag,1022) ' ndtd = ', ndtd, ' number of dynamics/advection/ridging/steps per thermo timestep' - write(nu_diag,1022) ' ndte = ', ndte, ' number of EVP or EAP subcycles' - write(nu_diag,1007) ' arlx = ', arlx, ' stress equation factor alpha' - write(nu_diag,1007) ' brlx = ', brlx, ' stress equation factor beta' + if (kdyn == 1 .or. 
kdyn == 3) then + write(nu_diag,*) 'yield_curve = ', trim(yield_curve) + if (trim(yield_curve) == 'ellipse') & + write(nu_diag,1007) ' e_ratio = ', e_ratio, ' aspect ratio of ellipse' + endif if (trim(coriolis) == 'latitude') then tmpstr2 = ': latitude-dependent Coriolis parameter' @@ -1524,6 +1614,31 @@ subroutine input_data write(nu_diag,1010) ' orca_halogrid = ', & orca_halogrid + if (kdyn == 3) then + write(nu_diag,1020) ' maxits_nonlin = ', maxits_nonlin + write(nu_diag,1030) ' precond = ', precond + write(nu_diag,1020) ' dim_fgmres = ', dim_fgmres + write(nu_diag,1020) ' dim_pgmres = ', dim_pgmres + write(nu_diag,1020) ' maxits_fgmres = ', maxits_fgmres + write(nu_diag,1020) ' maxits_pgmres = ', maxits_pgmres + write(nu_diag,1010) ' monitor_nonlin = ', monitor_nonlin + write(nu_diag,1010) ' monitor_fgmres = ', monitor_fgmres + write(nu_diag,1010) ' monitor_pgmres = ', monitor_pgmres + write(nu_diag,1030) ' ortho_type = ', ortho_type + write(nu_diag,1008) ' reltol_nonlin = ', reltol_nonlin + write(nu_diag,1008) ' reltol_fgmres = ', reltol_fgmres + write(nu_diag,1008) ' reltol_pgmres = ', reltol_pgmres + write(nu_diag,1030) ' algo_nonlin = ', algo_nonlin + write(nu_diag,1010) ' use_mean_vrel = ', use_mean_vrel + if (algo_nonlin == 'anderson') then + write(nu_diag,1020) ' fpfunc_andacc = ', fpfunc_andacc + write(nu_diag,1020) ' dim_andacc = ', dim_andacc + write(nu_diag,1008) ' reltol_andacc = ', reltol_andacc + write(nu_diag,1005) ' damping_andacc = ', damping_andacc + write(nu_diag,1020) ' start_andacc = ', start_andacc + endif + endif + write(nu_diag,1010) ' conserv_check = ', conserv_check write(nu_diag,1020) ' fyear_init = ', & @@ -1675,6 +1790,7 @@ subroutine input_data 1005 format (a30,2x,f12.6) ! float 1006 format (a20,2x,f10.6,a) 1007 format (a20,2x,f6.2,a) + 1008 format (a30,2x,d13.6) ! float, exponential notation 1009 format (a20,2x,d13.6,a) ! float, exponential notation 1010 format (a30,2x,l6) ! logical 1012 format (a20,2x,l3,1x,a) ! logical diff --git a/cicecore/cicedynB/general/ice_step_mod.F90 b/cicecore/cicedynB/general/ice_step_mod.F90 index 77d0ad492..4b92c2a42 100644 --- a/cicecore/cicedynB/general/ice_step_mod.F90 +++ b/cicecore/cicedynB/general/ice_step_mod.F90 @@ -850,6 +850,7 @@ subroutine step_dyn_horiz (dt) use ice_dyn_evp, only: evp use ice_dyn_eap, only: eap + use ice_dyn_vp, only: implicit_solver use ice_dyn_shared, only: kdyn, ktransport use ice_flux, only: init_history_dyn !deprecate upwind use ice_transport_driver, only: advection, transport_upwind, transport_remap @@ -863,11 +864,12 @@ subroutine step_dyn_horiz (dt) call init_history_dyn ! initialize dynamic history variables !----------------------------------------------------------------- - ! Elastic-viscous-plastic ice dynamics + ! Ice dynamics (momentum equation) !----------------------------------------------------------------- if (kdyn == 1) call evp (dt) if (kdyn == 2) call eap (dt) + if (kdyn == 3) call implicit_solver (dt) !----------------------------------------------------------------- ! 
Horizontal ice transport diff --git a/cicecore/cicedynB/infrastructure/comm/mpi/ice_global_reductions.F90 b/cicecore/cicedynB/infrastructure/comm/mpi/ice_global_reductions.F90 index 2b4172d81..1d724fb39 100644 --- a/cicecore/cicedynB/infrastructure/comm/mpi/ice_global_reductions.F90 +++ b/cicecore/cicedynB/infrastructure/comm/mpi/ice_global_reductions.F90 @@ -22,7 +22,7 @@ module ice_global_reductions #else use ice_communicate, only: my_task, mpiR16, mpiR8, mpiR4, master_task #endif - use ice_constants, only: field_loc_Nface, field_loc_NEcorner + use ice_constants, only: field_loc_Nface, field_loc_NEcorner, c0 use ice_fileunits, only: bfbflag use ice_exit, only: abort_ice use ice_distribution, only: distrb, ice_distributionGet, & @@ -36,6 +36,7 @@ module ice_global_reductions private public :: global_sum, & + global_allreduce_sum, & global_sum_prod, & global_maxval, & global_minval @@ -55,6 +56,12 @@ module ice_global_reductions global_sum_scalar_int end interface + interface global_allreduce_sum + module procedure global_allreduce_sum_vector_dbl!, & + ! module procedure global_allreduce_sum_vector_real, & ! not yet implemented + ! module procedure global_allreduce_sum_vector_int ! not yet implemented + end interface + interface global_sum_prod module procedure global_sum_prod_dbl, & global_sum_prod_real, & @@ -700,6 +707,69 @@ function global_sum_scalar_int(scalar, dist) & end function global_sum_scalar_int +!*********************************************************************** + + function global_allreduce_sum_vector_dbl(vector, dist) & + result(globalSums) + +! Computes the global sums of sets of scalars (elements of 'vector') +! distributed across a parallel machine. +! +! This is actually the specific interface for the generic global_allreduce_sum +! function corresponding to double precision vectors. The generic +! interface is identical but will handle real and integer vectors. + + real (dbl_kind), dimension(:), intent(in) :: & + vector ! vector whose components are to be summed + + type (distrb), intent(in) :: & + dist ! block distribution + + real (dbl_kind), dimension(size(vector)) :: & + globalSums ! resulting array of global sums + +!----------------------------------------------------------------------- +! +! local variables +! +!----------------------------------------------------------------------- + + integer (int_kind) :: & + ierr, &! mpi error flag + numProcs, &! number of processor participating + numBlocks, &! number of local blocks + communicator, &! communicator for this distribution + numElem ! number of elements in vector + + real (dbl_kind), dimension(:,:), allocatable :: & + work ! temporary local array + + character(len=*), parameter :: subname = '(global_allreduce_sum_vector_dbl)' + +!----------------------------------------------------------------------- +! +! get communicator for MPI calls +! 
+!----------------------------------------------------------------------- + + call ice_distributionGet(dist, & + numLocalBlocks = numBlocks, & + nprocs = numProcs, & + communicator = communicator) + + numElem = size(vector) + allocate(work(1,numElem)) + work(1,:) = vector + globalSums = c0 + + call compute_sums_dbl(work,globalSums,communicator,numProcs) + + deallocate(work) + +!----------------------------------------------------------------------- + + end function global_allreduce_sum_vector_dbl + !*********************************************************************** function global_sum_prod_dbl (array1, array2, dist, field_loc, & diff --git a/cicecore/cicedynB/infrastructure/comm/serial/ice_global_reductions.F90 b/cicecore/cicedynB/infrastructure/comm/serial/ice_global_reductions.F90 index 1517bd73b..4d53e873e 100644 --- a/cicecore/cicedynB/infrastructure/comm/serial/ice_global_reductions.F90 +++ b/cicecore/cicedynB/infrastructure/comm/serial/ice_global_reductions.F90 @@ -23,7 +23,7 @@ module ice_global_reductions #else use ice_communicate, only: my_task, mpiR16, mpiR8, mpiR4, master_task #endif - use ice_constants, only: field_loc_Nface, field_loc_NEcorner + use ice_constants, only: field_loc_Nface, field_loc_NEcorner, c0 use ice_fileunits, only: bfbflag use ice_exit, only: abort_ice use ice_distribution, only: distrb, ice_distributionGet, & @@ -37,6 +37,7 @@ module ice_global_reductions private public :: global_sum, & + global_allreduce_sum, & global_sum_prod, & global_maxval, & global_minval @@ -56,6 +57,12 @@ module ice_global_reductions global_sum_scalar_int end interface + interface global_allreduce_sum + module procedure global_allreduce_sum_vector_dbl!, & + ! module procedure global_allreduce_sum_vector_real, & ! not yet implemented + ! module procedure global_allreduce_sum_vector_int ! not yet implemented + end interface + interface global_sum_prod module procedure global_sum_prod_dbl, & global_sum_prod_real, & @@ -701,6 +708,69 @@ function global_sum_scalar_int(scalar, dist) & end function global_sum_scalar_int +!*********************************************************************** + + function global_allreduce_sum_vector_dbl(vector, dist) & + result(globalSums) + +! Computes the global sums of sets of scalars (elements of 'vector') +! distributed across a parallel machine. +! +! This is actually the specific interface for the generic global_allreduce_sum +! function corresponding to double precision vectors. The generic +! interface is identical but will handle real and integer vectors. + + real (dbl_kind), dimension(:), intent(in) :: & + vector ! vector whose components are to be summed + + type (distrb), intent(in) :: & + dist ! block distribution + + real (dbl_kind), dimension(size(vector)) :: & + globalSums ! resulting array of global sums + +!----------------------------------------------------------------------- +! +! local variables +! +!----------------------------------------------------------------------- + + integer (int_kind) :: & + ierr, &! mpi error flag + numProcs, &! number of processor participating + numBlocks, &! number of local blocks + communicator, &! communicator for this distribution + numElem ! number of elements in vector + + real (dbl_kind), dimension(:,:), allocatable :: & + work ! temporary local array + + character(len=*), parameter :: subname = '(global_allreduce_sum_vector_dbl)' + +!----------------------------------------------------------------------- +! +! get communicator for MPI calls +! 
+!----------------------------------------------------------------------- + + call ice_distributionGet(dist, & + numLocalBlocks = numBlocks, & + nprocs = numProcs, & + communicator = communicator) + + numElem = size(vector) + allocate(work(1,numElem)) + work(1,:) = vector + globalSums = c0 + + call compute_sums_dbl(work,globalSums,communicator,numProcs) + + deallocate(work) + +!----------------------------------------------------------------------- + + end function global_allreduce_sum_vector_dbl + !*********************************************************************** function global_sum_prod_dbl (array1, array2, dist, field_loc, & diff --git a/cicecore/cicedynB/infrastructure/ice_grid.F90 b/cicecore/cicedynB/infrastructure/ice_grid.F90 index 34b37cf29..67129c911 100644 --- a/cicecore/cicedynB/infrastructure/ice_grid.F90 +++ b/cicecore/cicedynB/infrastructure/ice_grid.F90 @@ -340,6 +340,7 @@ subroutine init_grid2 real (kind=dbl_kind) :: & angle_0, angle_w, angle_s, angle_sw, & pi, pi2, puny + logical (kind=log_kind), dimension(nx_block,ny_block,max_blocks):: & out_of_range diff --git a/cicecore/drivers/direct/hadgem3/CICE_InitMod.F90 b/cicecore/drivers/direct/hadgem3/CICE_InitMod.F90 index dc41ff9fd..49cf12ce1 100644 --- a/cicecore/drivers/direct/hadgem3/CICE_InitMod.F90 +++ b/cicecore/drivers/direct/hadgem3/CICE_InitMod.F90 @@ -71,7 +71,8 @@ subroutine cice_init use ice_domain, only: init_domain_blocks use ice_domain_size, only: ncat, nfsd use ice_dyn_eap, only: init_eap, alloc_dyn_eap - use ice_dyn_shared, only: kdyn, init_evp, basalstress, alloc_dyn_shared + use ice_dyn_shared, only: kdyn, init_dyn, basalstress, alloc_dyn_shared + use ice_dyn_vp, only: init_vp use ice_flux, only: init_coupler_flux, init_history_therm, & init_history_dyn, init_flux_atm, init_flux_ocn, alloc_flux use ice_forcing, only: init_forcing_ocn, init_forcing_atmo, & @@ -120,11 +121,12 @@ subroutine cice_init call init_calendar ! initialize some calendar stuff call init_hist (dt) ! initialize output history file + call init_dyn (dt_dyn) ! define dynamics parameters, variables if (kdyn == 2) then call alloc_dyn_eap ! allocate dyn_eap arrays - call init_eap (dt_dyn) ! define eap dynamics parameters, variables - else ! for both kdyn = 0 or 1 - call init_evp (dt_dyn) ! define evp dynamics parameters, variables + call init_eap ! define eap dynamics parameters, variables + else if (kdyn == 3) then + call init_vp ! define vp dynamics parameters, variables endif call init_coupler_flux ! initialize fluxes exchanged with coupler diff --git a/cicecore/drivers/mct/cesm1/CICE_InitMod.F90 b/cicecore/drivers/mct/cesm1/CICE_InitMod.F90 index 80bb2570e..da745d965 100644 --- a/cicecore/drivers/mct/cesm1/CICE_InitMod.F90 +++ b/cicecore/drivers/mct/cesm1/CICE_InitMod.F90 @@ -71,7 +71,8 @@ subroutine cice_init(mpicom_ice) use ice_domain, only: init_domain_blocks use ice_domain_size, only: ncat, nfsd use ice_dyn_eap, only: init_eap, alloc_dyn_eap - use ice_dyn_shared, only: kdyn, init_evp, alloc_dyn_shared + use ice_dyn_shared, only: kdyn, init_dyn, alloc_dyn_shared + use ice_dyn_vp, only: init_vp use ice_flux, only: init_coupler_flux, init_history_therm, & init_history_dyn, init_flux_atm, init_flux_ocn, alloc_flux use ice_forcing, only: init_forcing_ocn, init_forcing_atmo, & @@ -122,11 +123,12 @@ subroutine cice_init(mpicom_ice) call init_calendar ! initialize some calendar stuff call init_hist (dt) ! initialize output history file + call init_dyn (dt_dyn) ! 
define dynamics parameters, variables if (kdyn == 2) then call alloc_dyn_eap ! allocate dyn_eap arrays - call init_eap (dt_dyn) ! define eap dynamics parameters, variables - else ! for both kdyn = 0 or 1 - call init_evp (dt_dyn) ! define evp dynamics parameters, variables + call init_eap ! define eap dynamics parameters, variables + else if (kdyn == 3) then + call init_vp ! define vp dynamics parameters, variables endif call init_coupler_flux ! initialize fluxes exchanged with coupler diff --git a/cicecore/drivers/nuopc/cmeps/CICE_InitMod.F90 b/cicecore/drivers/nuopc/cmeps/CICE_InitMod.F90 index 917774908..cb70c9b4a 100644 --- a/cicecore/drivers/nuopc/cmeps/CICE_InitMod.F90 +++ b/cicecore/drivers/nuopc/cmeps/CICE_InitMod.F90 @@ -52,7 +52,7 @@ subroutine cice_init use ice_domain, only: init_domain_blocks use ice_domain_size, only: ncat, nfsd use ice_dyn_eap, only: init_eap, alloc_dyn_eap - use ice_dyn_shared, only: kdyn, init_evp, alloc_dyn_shared + use ice_dyn_shared, only: kdyn, init_dyn, alloc_dyn_shared use ice_flux, only: init_coupler_flux, init_history_therm, & init_history_dyn, init_flux_atm, init_flux_ocn, alloc_flux use ice_forcing, only: init_forcing_ocn @@ -98,11 +98,12 @@ subroutine cice_init call init_calendar ! initialize some calendar stuff call init_hist (dt) ! initialize output history file + call init_dyn (dt_dyn) ! define dynamics parameters, variables if (kdyn == 2) then call alloc_dyn_eap ! allocate dyn_eap arrays - call init_eap (dt_dyn) ! define eap dynamics parameters, variables - else ! for both kdyn = 0 or 1 - call init_evp (dt_dyn) ! define evp dynamics parameters, variables + call init_eap ! define eap dynamics parameters, variables + else if (kdyn == 3) then + call init_vp ! define vp dynamics parameters, variables endif call init_coupler_flux ! initialize fluxes exchanged with coupler diff --git a/cicecore/drivers/nuopc/dmi/CICE_InitMod.F90 b/cicecore/drivers/nuopc/dmi/CICE_InitMod.F90 index 4e236bb11..70ef5f895 100644 --- a/cicecore/drivers/nuopc/dmi/CICE_InitMod.F90 +++ b/cicecore/drivers/nuopc/dmi/CICE_InitMod.F90 @@ -76,7 +76,7 @@ subroutine cice_init(mpi_comm) use ice_domain, only: init_domain_blocks use ice_domain_size, only: ncat, nfsd use ice_dyn_eap, only: init_eap, alloc_dyn_eap - use ice_dyn_shared, only: kdyn, init_evp, alloc_dyn_shared + use ice_dyn_shared, only: kdyn, init_dyn, alloc_dyn_shared use ice_flux, only: init_coupler_flux, init_history_therm, & init_history_dyn, init_flux_atm, init_flux_ocn, alloc_flux use ice_forcing, only: init_forcing_ocn, init_forcing_atmo, & @@ -134,11 +134,12 @@ subroutine cice_init(mpi_comm) call init_calendar ! initialize some calendar stuff call init_hist (dt) ! initialize output history file + call init_dyn (dt_dyn) ! define dynamics parameters, variables if (kdyn == 2) then call alloc_dyn_eap ! allocate dyn_eap arrays - call init_eap (dt_dyn) ! define eap dynamics parameters, variables - else ! for both kdyn = 0 or 1 - call init_evp (dt_dyn) ! define evp dynamics parameters, variables + call init_eap ! define eap dynamics parameters, variables + else if (kdyn == 3) then + call init_vp ! define vp dynamics parameters, variables endif call init_coupler_flux ! 
initialize fluxes exchanged with coupler diff --git a/cicecore/drivers/standalone/cice/CICE_InitMod.F90 b/cicecore/drivers/standalone/cice/CICE_InitMod.F90 index 0a8614eb2..8b507740d 100644 --- a/cicecore/drivers/standalone/cice/CICE_InitMod.F90 +++ b/cicecore/drivers/standalone/cice/CICE_InitMod.F90 @@ -71,7 +71,8 @@ subroutine cice_init use ice_domain, only: init_domain_blocks use ice_domain_size, only: ncat, nfsd use ice_dyn_eap, only: init_eap, alloc_dyn_eap - use ice_dyn_shared, only: kdyn, init_evp, alloc_dyn_shared + use ice_dyn_shared, only: kdyn, init_dyn, alloc_dyn_shared + use ice_dyn_vp, only: init_vp use ice_flux, only: init_coupler_flux, init_history_therm, & init_history_dyn, init_flux_atm, init_flux_ocn, alloc_flux use ice_forcing, only: init_forcing_ocn, init_forcing_atmo, & @@ -122,11 +123,12 @@ subroutine cice_init call init_calendar ! initialize some calendar stuff call init_hist (dt) ! initialize output history file + call init_dyn (dt_dyn) ! define dynamics parameters, variables if (kdyn == 2) then call alloc_dyn_eap ! allocate dyn_eap arrays - call init_eap (dt_dyn) ! define eap dynamics parameters, variables - else ! for both kdyn = 0 or 1 - call init_evp (dt_dyn) ! define evp dynamics parameters, variables + call init_eap ! define eap dynamics parameters, variables + else if (kdyn == 3) then + call init_vp ! define vp dynamics parameters, variables endif call init_coupler_flux ! initialize fluxes exchanged with coupler diff --git a/cicecore/shared/ice_fileunits.F90 b/cicecore/shared/ice_fileunits.F90 index 4c91fdb2a..b6b30d47a 100644 --- a/cicecore/shared/ice_fileunits.F90 +++ b/cicecore/shared/ice_fileunits.F90 @@ -1,4 +1,3 @@ -! SVN:$Id: ice_fileunits.F90 1228 2017-05-23 21:33:34Z tcraig $ !======================================================================= ! ! This module contains an I/O unit manager for tracking, assigning diff --git a/configuration/scripts/ice_in b/configuration/scripts/ice_in index a26579df1..3139726f5 100644 --- a/configuration/scripts/ice_in +++ b/configuration/scripts/ice_in @@ -139,6 +139,21 @@ kridge = 1 ktransport = 1 ssh_stress = 'geostrophic' + maxits_nonlin = 4 + precond = 'pgmres' + dim_fgmres = 50 + dim_pgmres = 5 + maxits_fgmres = 1 + maxits_pgmres = 1 + monitor_nonlin = .false. + monitor_fgmres = .false. + monitor_pgmres = .false. + ortho_type = 'mgs' + reltol_nonlin = 1e-8 + reltol_fgmres = 1e-2 + reltol_pgmres = 1e-6 + algo_nonlin = 'picard' + use_mean_vrel = .true. 
/ &shortwave_nml diff --git a/configuration/scripts/machines/Macros.cesium_intel b/configuration/scripts/machines/Macros.cesium_intel index 1bca1ddac..ae112780d 100644 --- a/configuration/scripts/machines/Macros.cesium_intel +++ b/configuration/scripts/machines/Macros.cesium_intel @@ -50,7 +50,7 @@ LIB_PNETCDF := $(PNETCDF_PATH)/lib LIB_MPI := $(IMPILIBDIR) #SLIBS := -L$(LIB_NETCDF) -lnetcdff -lnetcdf -L$(LIB_PNETCDF) -lpnetcdf -lgptl -SLIBS := -L$(LIB_NETCDF) -lnetcdff -lnetcdf +SLIBS := -L$(LIB_NETCDF) -lnetcdff -lnetcdf -llapack -lblas ifeq ($(ICE_THREADED), true) LDFLAGS += -openmp diff --git a/configuration/scripts/machines/Macros.conda_linux b/configuration/scripts/machines/Macros.conda_linux index 32c5ae012..c821a4592 100644 --- a/configuration/scripts/machines/Macros.conda_linux +++ b/configuration/scripts/machines/Macros.conda_linux @@ -40,7 +40,7 @@ LD:= $(FC) MODDIR += -I$(CONDA_PREFIX)/include # Libraries to be passed to the linker -SLIBS := -L$(CONDA_PREFIX)/lib -lnetcdf -lnetcdff +SLIBS := -L$(CONDA_PREFIX)/lib -lnetcdf -lnetcdff -llapack # Necessary flag to compile with OpenMP support ifeq ($(ICE_THREADED), true) diff --git a/configuration/scripts/machines/Macros.conda_macos b/configuration/scripts/machines/Macros.conda_macos index 0d866d9a2..4acc4d3ba 100644 --- a/configuration/scripts/machines/Macros.conda_macos +++ b/configuration/scripts/machines/Macros.conda_macos @@ -48,7 +48,7 @@ else endif # Libraries to be passed to the linker -SLIBS := -L$(CONDA_PREFIX)/lib -lnetcdf -lnetcdff +SLIBS := -L$(CONDA_PREFIX)/lib -lnetcdf -lnetcdff -llapack # Necessary flag to compile with OpenMP support ifeq ($(ICE_THREADED), true) diff --git a/configuration/scripts/machines/environment.yml b/configuration/scripts/machines/environment.yml index aab90d23c..57bdacfec 100644 --- a/configuration/scripts/machines/environment.yml +++ b/configuration/scripts/machines/environment.yml @@ -8,6 +8,7 @@ dependencies: - netcdf-fortran - openmpi - make + - liblapack # Python dependencies for plotting scripts - numpy - matplotlib-base diff --git a/configuration/scripts/options/set_env.lapack b/configuration/scripts/options/set_env.lapack new file mode 100644 index 000000000..cf52ad1b0 --- /dev/null +++ b/configuration/scripts/options/set_env.lapack @@ -0,0 +1 @@ +setenv ICE_CPPDEFS -DUSE_LAPACK diff --git a/configuration/scripts/options/set_nml.diagimp b/configuration/scripts/options/set_nml.diagimp new file mode 100644 index 000000000..940754157 --- /dev/null +++ b/configuration/scripts/options/set_nml.diagimp @@ -0,0 +1,3 @@ +monitor_nonlin = .true. +monitor_fgmres = .true. +monitor_pgmres = .true. diff --git a/configuration/scripts/options/set_nml.dynanderson b/configuration/scripts/options/set_nml.dynanderson new file mode 100644 index 000000000..566c53a09 --- /dev/null +++ b/configuration/scripts/options/set_nml.dynanderson @@ -0,0 +1,3 @@ +kdyn = 3 +algo_nonlin = 'anderson' +use_mean_vrel = .false. diff --git a/configuration/scripts/options/set_nml.dynpicard b/configuration/scripts/options/set_nml.dynpicard new file mode 100644 index 000000000..b81f4d4e6 --- /dev/null +++ b/configuration/scripts/options/set_nml.dynpicard @@ -0,0 +1,3 @@ +kdyn = 3 +algo_nonlin = 'picard' +use_mean_vrel = .true. 
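For reference, a minimal sketch of how these options combine in ice_in to select the implicit solver, using the defaults set in ice_init.F90 together with the values from set_nml.dynpicard; the &dynamics_nml group name is assumed here (it is not shown in the hunks above), and any entry left out keeps its default:

    &dynamics_nml
      kdyn          = 3
      algo_nonlin   = 'picard'
      use_mean_vrel = .true.
      maxits_nonlin = 4
      precond       = 'pgmres'
      dim_fgmres    = 50
      reltol_nonlin = 1e-8
    /

The smoke test added to base_suite.ts ("smoke gx3 4x1 dynpicard,medium") exercises this configuration on the gx3 grid.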
diff --git a/configuration/scripts/options/set_nml.nonlin5000 b/configuration/scripts/options/set_nml.nonlin5000 new file mode 100644 index 000000000..f767a3d0d --- /dev/null +++ b/configuration/scripts/options/set_nml.nonlin5000 @@ -0,0 +1 @@ +maxits_nonlin = 5000 diff --git a/configuration/scripts/options/set_nml.run3dt b/configuration/scripts/options/set_nml.run3dt new file mode 100644 index 000000000..102a19d80 --- /dev/null +++ b/configuration/scripts/options/set_nml.run3dt @@ -0,0 +1,6 @@ +npt = 3 +dump_last = .true. +histfreq = '1','x','x','x','x' +hist_avg = .false. +f_uvel = '1' +f_vvel = '1' diff --git a/configuration/scripts/tests/base_suite.ts b/configuration/scripts/tests/base_suite.ts index e96b07622..386c29e41 100755 --- a/configuration/scripts/tests/base_suite.ts +++ b/configuration/scripts/tests/base_suite.ts @@ -50,3 +50,4 @@ restart gx3 4x4 iobinary restart gx3 4x4 histall,precision8,cdf64 smoke gx3 30x1 bgcz,histall smoke gx3 14x2 fsd12,histall +smoke gx3 4x1 dynpicard,medium diff --git a/doc/source/cice_index.rst b/doc/source/cice_index.rst index 1fb73c2d7..8ea16261d 100644 --- a/doc/source/cice_index.rst +++ b/doc/source/cice_index.rst @@ -336,7 +336,7 @@ either Celsius or Kelvin units). "kalg", ":math:`\bullet` absorption coefficient for algae", "" "kappav", "visible extinction coefficient in ice, wavelength\ :math:`<`\ 700nm", "1.4 m\ :math:`^{-1}`" "kcatbound", ":math:`\bullet` category boundary formula", "" - "kdyn", ":math:`\bullet` type of dynamics (1 = EVP, 0 = off)", "1" + "kdyn", ":math:`\bullet` type of dynamics (1 = EVP, 2 = EAP, 3 = VP, 0,-1 = off)", "1" "kg_to_g", "kg to g conversion factor", "1000." "kice", "thermal conductivity of fresh ice (:cite:`Bitz99`)", "2.03 W/m/deg" "kitd", ":math:`\bullet` type of itd conversions (0 = delta function, 1 = linear remap)", "1" diff --git a/doc/source/developer_guide/dg_driver.rst b/doc/source/developer_guide/dg_driver.rst index dd560a17c..a10cb319a 100644 --- a/doc/source/developer_guide/dg_driver.rst +++ b/doc/source/developer_guide/dg_driver.rst @@ -55,10 +55,11 @@ The initialize calling sequence looks something like:: call init_zbgc ! vertical biogeochemistry initialization call init_calendar ! initialize some calendar stuff call init_hist (dt) ! initialize output history file + call init_dyn (dt_dyn) ! define dynamics parameters, variables if (kdyn == 2) then - call init_eap (dt_dyn) ! define eap dynamics parameters, variables - else ! for both kdyn = 0 or 1 - call init_evp (dt_dyn) ! define evp dynamics parameters, variables + call init_eap ! define eap dynamics parameters, variables + else if (kdyn == 3) then + call init_vp ! define vp dynamics parameters, variables endif call init_coupler_flux ! initialize fluxes exchanged with coupler call init_thermo_vertical ! initialize vertical thermodynamics diff --git a/doc/source/developer_guide/dg_dynamics.rst b/doc/source/developer_guide/dg_dynamics.rst index 3551763b5..eac19b1f6 100644 --- a/doc/source/developer_guide/dg_dynamics.rst +++ b/doc/source/developer_guide/dg_dynamics.rst @@ -30,13 +30,13 @@ Dynamical Solvers -------------------- The dynamics solvers are found in **cicecore/cicedynB/dynamics/**. A couple of different solvers are -available including EVP, revised EVP, and EAP. The dynamics solver is specified in namelist with the -``kdyn`` variable. ``kdyn=1`` is evp, ``kdyn=2`` is eap, and revised evp requires the ``revised_evp`` -namelist flag be set to true. +available including EVP, revised EVP, EAP and VP. 
The dynamics solver is specified in namelist with the +``kdyn`` variable. ``kdyn=1`` is evp, ``kdyn=2`` is eap, ``kdyn=3`` is VP and revised EVP requires +the ``revised_evp`` namelist flag be set to true. -Multiple evp solvers are supported thru the namelist flag ``kevp_kernel``. The standard implementation +Multiple EVP solvers are supported thru the namelist flag ``kevp_kernel``. The standard implementation and current default is ``kevp_kernel=0``. In this case, the stress is solved on the regular decomposition -via subcycling and calls to subroutine stress and subroutine stepu with MPI global sums required in each +via subcycling and calls to subroutine ``stress`` and subroutine ``stepu`` with MPI global sums required in each subcycling call. With ``kevp_kernel=2``, the data required to compute the stress is gathered to the root MPI process and the stress calculation is performed on the root task without any MPI global sums. OpenMP parallelism is supported in ``kevp_kernel=2``. The solutions with ``kevp_kernel`` set to 0 or 2 will diff --git a/doc/source/master_list.bib b/doc/source/master_list.bib index caa93ec06..0b928d012 100644 --- a/doc/source/master_list.bib +++ b/doc/source/master_list.bib @@ -59,6 +59,8 @@ @string{GMD @string{CRST = {Cold Reg. Sci. Technol.}} @string{IJHPCA={Int. J High Perform. Comput. Appl}} @string{PTRSA={Philos. Trans. Royal Soc. A}} +@string{SIAMJCP={SIAM J. Sci. Comput.}} + % ********************************************** @@ -977,6 +979,31 @@ @Article{Tsujino18 pages = {79-139}, url = {http://dx.doi.org/10.1016/j.ocemod.2018.07.002} } +@Article{Lemieux08, + author = "J.-F. Lemieux and B. Tremblay and S. Thomas and J. Sedláček and L. A. Mysak", + title = "{Using the preconditioned Generalized Minimum RESidual (GMRES) method to solve the sea-ice momentum equation}", + journal = JGRO, + volume = {113}, + number = {C10}, + pages = {}, + keywords = {Sea ice, GMRES, Krylov subspace}, + doi = {10.1029/2007JC004680}, + url = {https://agupubs.onlinelibrary.wiley.com/doi/abs/10.1029/2007JC004680}, + eprint = {https://agupubs.onlinelibrary.wiley.com/doi/pdf/10.1029/2007JC004680}, + year = {2008} +} +@Article{Saad93, + author = "Y. Saad", + title = "{A Flexible Inner-Outer Preconditioned GMRES Algorithm}", + journal = SIAMJCP, + volume = {14}, + number = {2}, + year = {1993}, + pages = {461-469}, + doi = {10.1137/0914028}, + URL = {https://doi.org/10.1137/0914028} +} + % ********************************************** % For new entries, see example entry in BIB_TEMPLATE.txt % ********************************************** diff --git a/doc/source/science_guide/sg_dynamics.rst b/doc/source/science_guide/sg_dynamics.rst index 4c9b6d502..e7f214ff7 100644 --- a/doc/source/science_guide/sg_dynamics.rst +++ b/doc/source/science_guide/sg_dynamics.rst @@ -5,15 +5,19 @@ Dynamics ======== -There are now different rheologies available in the CICE code. The +There are different approaches in the CICE code for representing sea ice +rheology and for solving the sea ice momentum equation. The elastic-viscous-plastic (EVP) model represents a modification of the standard viscous-plastic (VP) model for sea ice dynamics :cite:`Hibler79`. The elastic-anisotropic-plastic (EAP) model, on the other hand, explicitly accounts for the observed sub-continuum anisotropy of the sea ice cover :cite:`Wilchinsky06,Weiss09`. If -`kdyn` = 1 in the namelist then the EVP rheology is used (module -**ice\_dyn\_evp.F90**), while `kdyn` = 2 is associated with the EAP -rheology (**ice\_dyn\_eap.F90**). 
At times scales associated with the +``kdyn`` = 1 in the namelist then the EVP model is used (module +**ice\_dyn\_evp.F90**), while ``kdyn`` = 2 is associated with the EAP +model (**ice\_dyn\_eap.F90**), and ``kdyn`` = 3 is associated with the +VP model (**ice\_dyn\_vp.F90**). + +At time scales associated with the wind forcing, the EVP model reduces to the VP model while the EAP model reduces to the anisotropic rheology described in detail in :cite:`Wilchinsky06,Tsamados13`. At shorter time scales the @@ -29,14 +33,23 @@ dynamics in :cite:`Tsamados13`. Simulation results and performance of the EVP and EAP models have been compared with the VP model and with each other in realistic simulations of the Arctic respectively in :cite:`Hunke99` and -:cite:`Tsamados13`. Here we summarize the equations and -direct the reader to the above references for details. The numerical +:cite:`Tsamados13`. + +The EVP numerical implementation in this code release is that of :cite:`Hunke02` and :cite:`Hunke03`, with revisions to the numerical solver as in :cite:`Bouillon13`. The implementation of the EAP sea ice dynamics into CICE is described in detail in :cite:`Tsamados13`. +The VP solver implementation mostly follows :cite:`Lemieux08`, with +FGMRES :cite:`Saad93` as the linear solver and GMRES as the preconditioner. +Note that the VP solver has not yet been tested on the ``tx1`` grid or with +threading enabled. + +Here we summarize the equations and +direct the reader to the above references for details. + .. _momentum: ******** @@ -67,20 +80,36 @@ concentration regions. A careful explanation of the issue and its continuum solution is provided in :cite:`Hunke03` and :cite:`Connolley04`. -The momentum equation is discretized in time as follows, for the classic -EVP approach. First, for clarity, the two components of Equation :eq:`vpmom` are +For clarity, the two components of Equation :eq:`vpmom` are .. math:: \begin{aligned} - m{\partial u\over\partial t} &=& {\partial\sigma_{1j}\over\partial x_j} + \tau_{ax} + + m{\partial u\over\partial t} &= {\partial\sigma_{1j}\over\partial x_j} + \tau_{ax} + a_i c_w \rho_w \left|{\bf U}_w - {\bf u}\right| \left[\left(U_w-u\right)\cos\theta - \left(V_w-v\right)\sin\theta\right] -C_bu +mfv - mg{\partial H_\circ\over\partial x}, \\ - m{\partial v\over\partial t} &=& {\partial\sigma_{2j}\over\partial x_j} + \tau_{ay} + + m{\partial v\over\partial t} &= {\partial\sigma_{2j}\over\partial x_j} + \tau_{ay} + a_i c_w \rho_w \left|{\bf U}_w - {\bf u}\right| \left[\left(U_w-u\right)\sin\theta + \left(V_w-v\right)\cos\theta\right] -C_bv-mfu - mg{\partial H_\circ\over\partial y}. \end{aligned} + :label: momsys + + +A bilinear discretization is used for the stress terms +:math:`\partial\sigma_{ij}/\partial x_j`, +which enables the discrete equations to be derived from the +continuous equations written in curvilinear coordinates. In this +manner, metric terms associated with the curvature of the grid are +incorporated into the discretization explicitly. Details pertaining to +the spatial discretization are found in :cite:`Hunke02`. + +.. _evp-momentum: + +Elastic-Viscous-Plastic +~~~~~~~~~~~~~~~~~~~~~~~ +The momentum equation is discretized in time as follows, for the classic +EVP approach. In the code, :math:`{\tt vrel}=a_i c_w \rho_w\left|{\bf U}_w - {\bf u}^k\right|` and :math:`C_b=T_b \left( \sqrt{(u^k)^2+(v^k)^2}+u_0 \right)^{-1}`, @@ -91,20 +120,20 @@ variables used in the code. ..
math:: \underbrace{\left({m\over\Delta t_e}+{\tt vrel} \cos\theta\ + C_b \right)}_{\tt cca} u^{k+1} - \underbrace{\left(mf+{\tt vrel}\sin\theta\right)}_{\tt ccb}v^{k+1} - = \underbrace{{\partial\sigma_{1j}^{k+1}\over\partial x_j}}_{\tt strintx} - + \underbrace{\tau_{ax} - mg{\partial H_\circ\over\partial x} }_{\tt forcex} - + {\tt vrel}\underbrace{\left(U_w\cos\theta-V_w\sin\theta\right)}_{\tt waterx} + {m\over\Delta t_e}u^k, + = &\underbrace{{\partial\sigma_{1j}^{k+1}\over\partial x_j}}_{\tt strintx} + + \underbrace{\tau_{ax} - mg{\partial H_\circ\over\partial x} }_{\tt forcex} \\ + &+ {\tt vrel}\underbrace{\left(U_w\cos\theta-V_w\sin\theta\right)}_{\tt waterx} + {m\over\Delta t_e}u^k, :label: umom .. math:: \underbrace{\left(mf+{\tt vrel}\sin\theta\right)}_{\tt ccb} u^{k+1} + \underbrace{\left({m\over\Delta t_e}+{\tt vrel} \cos\theta + C_b \right)}_{\tt cca}v^{k+1} - = \underbrace{{\partial\sigma_{2j}^{k+1}\over\partial x_j}}_{\tt strinty} - + \underbrace{\tau_{ay} - mg{\partial H_\circ\over\partial y} }_{\tt forcey} - + {\tt vrel}\underbrace{\left(U_w\sin\theta+V_w\cos\theta\right)}_{\tt watery} + {m\over\Delta t_e}v^k, + = &\underbrace{{\partial\sigma_{2j}^{k+1}\over\partial x_j}}_{\tt strinty} + + \underbrace{\tau_{ay} - mg{\partial H_\circ\over\partial y} }_{\tt forcey} \\ + &+ {\tt vrel}\underbrace{\left(U_w\sin\theta+V_w\cos\theta\right)}_{\tt watery} + {m\over\Delta t_e}v^k, :label: vmom -and vrel\ :math:`\cdot`\ waterx(y) = taux(y). +and :math:`{\tt vrel}\ \cdot\ {\tt waterx(y)}= {\tt taux(y)}`. We solve this system of equations analytically for :math:`u^{k+1}` and :math:`v^{k+1}`. Define @@ -121,8 +150,8 @@ where :math:`{\bf F} = \nabla\cdot\sigma^{k+1}`. Then .. math:: \begin{aligned} - \left({m\over\Delta t_e} +{\tt vrel}\cos\theta\ + C_b \right)u^{k+1} - \left(mf + {\tt vrel}\sin\theta\right) v^{k+1} &=& \hat{u} \\ - \left(mf + {\tt vrel}\sin\theta\right) u^{k+1} + \left({m\over\Delta t_e} +{\tt vrel}\cos\theta + C_b \right)v^{k+1} &=& \hat{v}.\end{aligned} + \left({m\over\Delta t_e} +{\tt vrel}\cos\theta\ + C_b \right)u^{k+1} - \left(mf + {\tt vrel}\sin\theta\right) v^{k+1} &= \hat{u} \\ + \left(mf + {\tt vrel}\sin\theta\right) u^{k+1} + \left({m\over\Delta t_e} +{\tt vrel}\cos\theta + C_b \right)v^{k+1} &= \hat{v}.\end{aligned} Solving simultaneously for :math:`u^{k+1}` and :math:`v^{k+1}`, @@ -140,10 +169,62 @@ where .. math:: b = mf + {\tt vrel}\sin\theta. :label: cevpb + +.. _vp-momentum: + +Viscous-Plastic +~~~~~~~~~~~~~~~ + +In the VP approach, equation :eq:`momsys` is discretized implicitly using a Backward Euler approach, +and stresses are not computed explicitly: -When the subcycling is finished for each (thermodynamic) time step, the -ice–ocean stress must be constructed from `taux(y)` and the terms -containing `vrel` on the left hand side of the equations. +.. math:: + \begin{align} + m\frac{(u^{n}-u^{n-1})}{\Delta t} &= \frac{\partial \sigma_{1j}^n}{\partial x_j} + - \tau_{w,x}^n + \tau_{b,x}^n + mfv^n + + r_{x}^n, + \\ + m\frac{(v^{n}-v^{n-1})}{\Delta t} &= \frac{\partial \sigma^{n} _{2j}}{\partial x_j} + - \tau_{w,y}^n + \tau_{b,y}^n -mfu^{n} + + r_{y}^n + \end{align} + :label: u_sit + +where :math:`r = (r_x,r_y)` contains all terms that do not depend on the velocities :math:`u^n, v^n` (namely the sea surface tilt and the wind stress). +As the water drag, seabed stress and rheology term depend on the velocity field, the only unknowns in equation :eq:`u_sit` are :math:`u^n` and :math:`v^n`. 
+ +Once discretized in space, equation :eq:`u_sit` leads to a system of :math:`N` nonlinear equations with :math:`N` unknowns that can be concisely written as + +.. math:: + \mathbf{A}(\mathbf{u})\mathbf{u} = \mathbf{b}(\mathbf{u}), + :label: nonlin_sys + +where :math:`\mathbf{A}` is an :math:`N\times N` matrix and :math:`\mathbf{u}` and :math:`\mathbf{b}` are vectors of size :math:`N`. +Note that we have dropped the time level index :math:`n`. +The vector :math:`\mathbf{u}` is formed by stacking first the :math:`u` components, followed by the :math:`v` components of the discretized ice velocity. +The vector :math:`\mathbf{b}` is a function of the velocity vector :math:`\mathbf{u}` because of the water and seabed stress terms as well as parts of the rheology term that depend non-linearly on :math:`\mathbf{u}`. + +The nonlinear system :eq:`nonlin_sys` is solved using a Picard iteration method. +Starting from a previous iterate :math:`\mathbf{u}_{k-1}`, the nonlinear system is linearized by substituting :math:`\mathbf{u}_{k-1}` in the expression of the matrix :math:`\mathbf{A}` and the vector :math:`\mathbf{b}`: + +.. math:: + \mathbf{A}(\mathbf{u}_{k-1})\mathbf{u}_{k} = \mathbf{b}(\mathbf{u}_{k-1}) + :label: picard + +The resulting linear system is solved using the Flexible Generalized Minimum RESidual (FGMRES, :cite:`Saad93`) method and this process is repeated iteratively. + +The maximum number of Picard iterations can be set using the namelist flag ``maxits_nonlin``. +The relative tolerance for the Picard solver can be set using the namelist flag ``reltol_nonlin``. +The Picard iterative process stops when :math:`\left\lVert \mathbf{u}_{k} \right\rVert_2 < {\tt reltol\_nonlin} \cdot \left\lVert\mathbf{u}_{0}\right\rVert_2` or when ``maxits_nonlin`` is reached. + +Parameters for the FGMRES linear solver and the preconditioner can be controlled using additional namelist flags (see :ref:`dynamics_nml`). + +Ice-Ocean stress +~~~~~~~~~~~~~~~~ + +At the end of each (thermodynamic) time step, the +ice–ocean stress must be constructed from :math:`{\tt taux(y)}` and the terms +containing :math:`{\tt vrel}` on the left hand side of the equations. The Hibler-Bryan form for the ice-ocean stress :cite:`Hibler87` is included in **ice\_dyn\_shared.F90** but is currently commented out, @@ -185,7 +266,7 @@ where the :math:`a_i` and :math:`v_i` are the total ice concentrations and ice v ridge(s) reaches the seafloor for a water depth :math:`h_{wu}=\min[h_w(i,j),h_w(i+1,j),h_w(i,j+1),h_w(i+1,j+1)]`. Given the formulation of :math:`C_b` in equation :eq:`Cb`, the seabed stress components are non-zero only when :math:`h_u > h_{cu}`. -The maximum seabed stress depends on the weigth of the ridge +The maximum seabed stress depends on the weight of the ridge above hydrostatic balance and the value of :math:`k_2`. It is, however, the parameter :math:`k_1` that has the most notable impact on the simulated extent of landfast ice. The value of :math:`k_1` can be changed at runtime using the namelist variable ``k1``. The grounding scheme can be turned on or off using the namelist logical basalstress. @@ -207,47 +288,44 @@ For convenience we formulate the stress tensor :math:`\bf \sigma` in terms of :math:`\sigma_1=\sigma_{11}+\sigma_{22}`, :math:`\sigma_2=\sigma_{11}-\sigma_{22}`, and introduce the divergence, :math:`D_D`, and the horizontal tension and shearing -strain rates, :math:`D_T` and :math:`D_S` respectively. 
- -CICE now outputs the internal ice pressure which is an important field to support navigation in ice-infested water. -The internal ice pressure :math:`(sigP)` is the average of the normal stresses multiplied by :math:`-1` and -is therefore simply equal to :math:`-\sigma_1/2`. - -*Elastic-Viscous-Plastic* - -In the EVP model the internal stress tensor is determined from a -regularized version of the VP constitutive law. Following the approach of :cite:`Konig10` (see also :cite:`Lemieux16`), the -elliptical yield curve can be modified such that the ice has isotropic tensile strength. -The tensile strength :math:`T_p` is expressed as a fraction of the ice strength :math:`P`, that is :math:`T_p=k_t P` -where :math:`k_t` should be set to a value between 0 and 1 (this can be changed at runtime with the namelist parameter ``Ktens``). The constitutive law is therefore +strain rates, :math:`D_T` and :math:`D_S` respectively: .. math:: - {1\over E}{\partial\sigma_1\over\partial t} + {\sigma_1\over 2\zeta} - + {P_R(1-k_t)\over 2\zeta} = D_D, \\ - :label: sig1 + D_D = \dot{\epsilon}_{11} + \dot{\epsilon}_{22}, .. math:: - {1\over E}{\partial\sigma_2\over\partial t} + {\sigma_2\over 2\eta} = D_T, - :label: sig2 + D_T = \dot{\epsilon}_{11} - \dot{\epsilon}_{22}, .. math:: - {1\over E}{\partial\sigma_{12}\over\partial t} + {\sigma_{12}\over - 2\eta} = {1\over 2}D_S, - :label: sig12 + D_S = 2\dot{\epsilon}_{12}, where .. math:: - D_D = \dot{\epsilon}_{11} + \dot{\epsilon}_{22}, + \dot{\epsilon}_{ij} = {1\over 2}\left({{\partial u_i}\over{\partial x_j}} + {{\partial u_j}\over{\partial x_i}}\right) -.. math:: - D_T = \dot{\epsilon}_{11} - \dot{\epsilon}_{22}, +CICE can output the internal ice pressure which is an important field to support navigation in ice-infested water. +The internal ice pressure (``sigP``) is the average of the normal stresses multiplied by :math:`-1` and +is therefore simply equal to :math:`-\sigma_1/2`. -.. math:: - D_S = 2\dot{\epsilon}_{12}, +Following the approach of :cite:`Konig10` (see also :cite:`Lemieux16`), the +elliptical yield curve can be modified such that the ice has isotropic tensile strength. +The tensile strength :math:`T_p` is expressed as a fraction of the ice strength :math:`P`, that is :math:`T_p=k_t P` +where :math:`k_t` should be set to a value between 0 and 1 (this can be changed at runtime with the namelist parameter ``Ktens``). + +.. _stress-vp: + +Viscous-Plastic +~~~~~~~~~~~~~~~ + +The VP constitutive law is given by .. math:: - \dot{\epsilon}_{ij} = {1\over 2}\left({{\partial u_i}\over{\partial x_j}} + {{\partial u_j}\over{\partial x_i}}\right), + \sigma_{ij} = 2 \eta \dot{\epsilon}_{ij} + (\zeta - \eta) D_D \delta_{ij} - P_R(1 - k_t)\frac{\delta_{ij}}{2} + :label: vp-const + +where :math:`\eta` and :math:`\zeta` are the bulk and shear viscosities. +An elliptical yield curve is used, with the viscosities given by .. math:: \zeta = {P(1+k_t)\over 2\Delta}, @@ -255,14 +333,41 @@ where .. math:: \eta = {P(1+k_t)\over {2\Delta e^2}}, +where + .. math:: - \Delta = \left[D_D^2 + {1\over e^2}\left(D_T^2 + D_S^2\right)\right]^{1/2}, + \Delta = \left[D_D^2 + {1\over e^2}\left(D_T^2 + D_S^2\right)\right]^{1/2} and :math:`P_R` is a “replacement pressure” (see :cite:`Geiger98`, for example), which serves to prevent residual ice motion due to spatial -variations of :math:`P` when the rates of strain are exactly zero.
+ +The ice strength :math:`P` is a function of the ice thickness and concentration -as it is described in the `Icepack Documentation `_. The parameteter :math:`e` is the ratio of the major and minor axes of the elliptical yield curve, also called the ellipse aspect ratio. It can be changed using the namelist parameter ``e_ratio``. +as described in the `Icepack Documentation `_. The parameter :math:`e` is the ratio of the major and minor axes of the elliptical yield curve, also called the ellipse aspect ratio. It can be changed using the namelist parameter ``e_ratio``. + +.. _stress-evp: + +Elastic-Viscous-Plastic +~~~~~~~~~~~~~~~~~~~~~~~ + +In the EVP model the internal stress tensor is determined from a +regularized version of the VP constitutive law :eq:`vp-const`. The constitutive law is therefore + +.. math:: + {1\over E}{\partial\sigma_1\over\partial t} + {\sigma_1\over 2\zeta} + + {P_R(1-k_t)\over 2\zeta} = D_D, \\ + :label: sig1 + +.. math:: + {1\over E}{\partial\sigma_2\over\partial t} + {\sigma_2\over 2\eta} = D_T, + :label: sig2 + +.. math:: + {1\over E}{\partial\sigma_{12}\over\partial t} + {\sigma_{12}\over + 2\eta} = {1\over 2}D_S, + :label: sig12 + Viscosities are updated during the subcycling, so that the entire dynamics component is subcycled within the time step, and the elastic @@ -304,15 +409,10 @@ appear explicitly.) Choices of the parameters used to define :math:`E`, :math:`T` and :math:`\Delta t_e` are discussed in Sections :ref:`revp` and :ref:`parameters`. -The bilinear discretization used for the stress terms -:math:`\partial\sigma_{ij}/\partial x_j` in the momentum equation is -now used, which enabled the discrete equations to be derived from the -continuous equations written in curvilinear coordinates. In this -manner, metric terms associated with the curvature of the grid are -incorporated into the discretization explicitly. Details pertaining to -the spatial discretization are found in :cite:`Hunke02`. +.. _stress-eap: -*Elastic-Anisotropic-Plastic* +Elastic-Anisotropic-Plastic +~~~~~~~~~~~~~~~~~~~~~~~~~~~ In the EAP model the internal stress tensor is related to the geometrical properties and orientation of underlying virtual diamond @@ -558,6 +658,6 @@ Introducing another numerical parameter :math:`\alpha=2T \Delta t_e ^{-1}` :cite where as opposed to the classic EVP, the second term in each equation is at iteration :math:`k` :cite:`Bouillon13`. Also, as opposed to the classic EVP, :math:`\Delta t_e` times the number of subcycles (or iterations) does not need to be equal to the advective time step :math:`\Delta t`. Finally, as with the classic EVP approach, the stresses are initialized using the previous time level values. -The revised EVP is activated by setting the namelist parameter `revised\_evp` = true. -In the code :math:`\alpha = arlx` and :math:`\beta = brlx`. The values of :math:`arlx` and :math:`brlx` can be set in the namelist. -It is recommended to use large values of these parameters and to set :math:`arlx=brlx` :cite:`Kimmritz15`. +The revised EVP is activated by setting the namelist parameter ``revised_evp = true``. +In the code :math:`\alpha` is ``arlx`` and :math:`\beta` is ``brlx``. The values of ``arlx`` and ``brlx`` can be set in the namelist. +It is recommended to use large values of these parameters and to set :math:`\alpha=\beta` :cite:`Kimmritz15`. 
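As a worked illustration of the EVP momentum update described in the science guide changes above, the analytic solve for :math:`u^{k+1}` and :math:`v^{k+1}` at a single velocity point amounts to inverting a 2x2 system with coefficients ``cca`` and ``ccb``. The following minimal sketch uses illustrative values only (none are taken from CICE), and ``uhat`` and ``vhat`` stand in for the assembled right-hand sides::

   program evp_point_solve
      implicit none
      integer, parameter :: dbl_kind = selected_real_kind(13)
      real (kind=dbl_kind) :: mass, dte, vrel, theta, Cb, fcor
      real (kind=dbl_kind) :: uhat, vhat, cca, ccb, det, unew, vnew

      ! illustrative values only (not taken from CICE)
      mass  = 900.0_dbl_kind      ! ice mass per unit area (kg/m^2)
      dte   = 30.0_dbl_kind       ! elastic subcycling time step (s)
      vrel  = 0.1_dbl_kind        ! a_i*c_w*rho_w*|Uw - u^k|
      theta = 0.4363_dbl_kind     ! ocean turning angle (rad)
      Cb    = 0.0_dbl_kind        ! seabed stress coefficient
      fcor  = 1.4e-4_dbl_kind     ! Coriolis parameter (1/s)
      uhat  = 5.0_dbl_kind        ! assembled right-hand side, u equation
      vhat  = 2.0_dbl_kind        ! assembled right-hand side, v equation

      cca = mass/dte + vrel*cos(theta) + Cb   ! coefficient of u^{k+1}
      ccb = mass*fcor + vrel*sin(theta)       ! coefficient of v^{k+1}
      det = cca*cca + ccb*ccb

      ! invert the 2x2 system  cca*u - ccb*v = uhat,  ccb*u + cca*v = vhat
      unew = ( cca*uhat + ccb*vhat) / det
      vnew = (-ccb*uhat + cca*vhat) / det

      print *, 'u^(k+1) =', unew, '  v^(k+1) =', vnew
   end program evp_point_solve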
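The Picard procedure of the VP solver can be summarized as: evaluate :math:`\mathbf{A}` and :math:`\mathbf{b}` at the previous iterate, solve the resulting linear system, and repeat until convergence or until ``maxits_nonlin`` is reached. The sketch below illustrates only that outer loop on a hypothetical two-unknown system, with a direct 2x2 solve standing in for the preconditioned FGMRES solve and a simplified convergence test; it is not the scheme coded in **ice\_dyn\_vp.F90**::

   program picard_sketch
      implicit none
      integer, parameter :: dbl_kind = selected_real_kind(13)
      integer, parameter :: maxits = 100                       ! toy analogue of maxits_nonlin
      real (kind=dbl_kind), parameter :: tol = 1.0e-8_dbl_kind ! toy analogue of reltol_nonlin
      real (kind=dbl_kind) :: u(2), u_old(2), A(2,2), b(2), det
      integer :: k

      u = 0.0_dbl_kind
      do k = 1, maxits
         u_old = u
         call build_system (u_old, A, b)  ! linearize A(u)u = b(u) about the previous iterate
         ! direct 2x2 solve; the CICE VP solver uses preconditioned FGMRES here instead
         det  = A(1,1)*A(2,2) - A(1,2)*A(2,1)
         u(1) = ( A(2,2)*b(1) - A(1,2)*b(2)) / det
         u(2) = (-A(2,1)*b(1) + A(1,1)*b(2)) / det
         ! simplified stand-in convergence check on the change between iterates
         if (norm2(u - u_old) < tol * max(norm2(u), 1.0_dbl_kind)) exit
      enddo
      print *, 'velocity iterate:', u, '  Picard iterations:', min(k, maxits)

   contains

      subroutine build_system (v, A, b)
         ! toy |u|-dependent "drag" makes the system nonlinear
         real (kind=dbl_kind), intent(in)  :: v(2)
         real (kind=dbl_kind), intent(out) :: A(2,2), b(2)
         real (kind=dbl_kind) :: drag
         drag   = sqrt(v(1)**2 + v(2)**2)
         A(1,1) = 2.0_dbl_kind + drag ;  A(1,2) = -0.3_dbl_kind
         A(2,1) = 0.3_dbl_kind        ;  A(2,2) = 2.0_dbl_kind + drag
         b(1)   = 1.0_dbl_kind
         b(2)   = 0.5_dbl_kind
      end subroutine build_system

   end program picard_sketch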
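The viscosities of the elliptical yield curve are pointwise functions of the strain rates. A minimal sketch of evaluating :math:`\Delta`, :math:`\zeta` and :math:`\eta` from the expressions above follows, with illustrative inputs and without any regularization of small :math:`\Delta`::

   program viscosities_sketch
      implicit none
      integer, parameter :: dbl_kind = selected_real_kind(13)
      real (kind=dbl_kind) :: DD, DT, DS, P, kt, e, Delta, zeta, eta

      ! illustrative strain rates (1/s), ice strength (N/m) and parameters
      DD = 1.0e-7_dbl_kind ;  DT = 5.0e-8_dbl_kind ;  DS = 2.0e-8_dbl_kind
      P  = 2.0e4_dbl_kind  ;  kt = 0.0_dbl_kind    ;  e  = 2.0_dbl_kind

      Delta = sqrt(DD**2 + (DT**2 + DS**2)/e**2)
      zeta  = P*(1.0_dbl_kind + kt) / (2.0_dbl_kind*Delta)   ! bulk viscosity
      eta   = zeta / e**2                                    ! shear viscosity

      print *, 'Delta =', Delta, '  zeta =', zeta, '  eta =', eta
   end program viscosities_sketch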
diff --git a/doc/source/user_guide/ug_case_settings.rst b/doc/source/user_guide/ug_case_settings.rst index 032c8b529..227a63663 100644 --- a/doc/source/user_guide/ug_case_settings.rst +++ b/doc/source/user_guide/ug_case_settings.rst @@ -349,6 +349,8 @@ thermo_nml "``sw_dtemp``", "real", "temperature difference from melt to start redistributing", "0.02" "", "", "", "" +.. _dynamics_nml: + dynamics_nml ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -369,10 +371,13 @@ dynamics_nml "", "``zero``", "zero coriolis", "" "``Cstar``", "real", "constant in Hibler strength formula", "20" "``e_ratio``", "real", "EVP ellipse aspect ratio", "2.0" + "``dim_fgmres``", "integer", "maximum number of Arnoldi iterations for FGMRES solver", "50" + "``dim_pgmres``", "integer", "maximum number of Arnoldi iterations for PGMRES preconditioner", "5" "``kdyn``", "``-1``", "dynamics algorithm OFF", "1" "", "``0``", "dynamics OFF", "" "", "``1``", "EVP dynamics", "" "", "``2``", "EAP dynamics", "" + "", "``3``", "VP dynamics", "" "``kevp_kernel``", "``0``", "standard 2D EVP memory parallel solver", "0" "", "``2``", "1D shared memory solver (not fully validated)", "" "``kstrength``", "``0``", "ice strength formulation :cite:`Hibler79`", "1" @@ -388,9 +393,23 @@ dynamics_nml "``Ktens``", "real", "Tensile strength factor (see :cite:`Konig10`)", "0.0" "``k1``", "real", "1st free parameter for landfast parameterization", "8.0" "``k2``", "real", "2nd free parameter (N/m\ :math:`^3`) for landfast parameterization", "15.0" + "``maxits_nonlin``", "integer", "maximum number of nonlinear iterations for VP solver", "1000" + "``maxits_fgmres``", "integer", "maximum number of restarts for FGMRES solver", "1" + "``maxits_pgmres``", "integer", "maximum number of restarts for PGMRES preconditioner", "1" + "``monitor_nonlin``", "logical", "write velocity norm at each nonlinear iteration", "``.false.``" + "``monitor_fgmres``", "logical", "write velocity norm at each FGMRES iteration", "``.false.``" + "``monitor_pgmres``", "logical", "write velocity norm at each PGMRES iteration", "``.false.``" "``mu_rdg``", "real", "e-folding scale of ridged ice for ``krdg_partic`` = 1 in m^0.5", "3.0" "``ndte``", "integer", "number of EVP subcycles", "120" + "``ortho_type``", "``mgs``", "Use modified Gram-Schmidt in FGMRES solver", "``mgs``" + "", "``cgs``", "Use classical Gram-Schmidt in FGMRES solver", "" + "``precond``", "``pgmres``", "Use GMRES as preconditioner for FGMRES solver", "``pgmres``" + "", "``diag``", "Use Jacobi preconditioner for the FGMRES solver", "" + "", "``ident``", "Don't use a preconditioner for the FGMRES solver", "" "``Pstar``", "real", "constant in Hibler strength formula (N/m\ :math:`^2`)", "2.75e4" + "``reltol_nonlin``", "real", "relative tolerance for nonlinear solver", "1e-8" + "``reltol_fgmres``", "real", "relative tolerance for FGMRES solver", "1e-2" + "``reltol_pgmres``", "real", "relative tolerance for PGMRES preconditioner", "1e-6" "``revised_evp``", "logical", "use revised EVP formulation", "``.false.``" "``ssh_stress``", "``coupled``", "computed from coupled sea surface height gradient", "``geostrophic``" "", "``geostropic``", "computed from ocean velocity", ""
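As a usage illustration, a hypothetical ``dynamics_nml`` excerpt selecting the implicit VP solver with the default values listed in the table above might read as follows (all other ``dynamics_nml`` entries omitted)::

   &dynamics_nml
       kdyn            = 3           ! implicit VP dynamics
       maxits_nonlin   = 1000        ! outer (Picard) iterations
       reltol_nonlin   = 1e-8
       precond         = 'pgmres'    ! GMRES preconditioner for FGMRES
       dim_fgmres      = 50
       maxits_fgmres   = 1
       reltol_fgmres   = 1e-2
       dim_pgmres      = 5
       maxits_pgmres   = 1
       reltol_pgmres   = 1e-6
       ortho_type      = 'mgs'       ! modified Gram-Schmidt orthogonalization
       monitor_nonlin  = .false.
   /

The new ``smoke gx3 4x1 dynpicard,medium`` entry added to ``base_suite.ts`` provides a standing test of this solver configuration.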