diff --git a/.github/workflows/config/md_link_check_config.json b/.github/workflows/config/md_link_check_config.json
index c7ba606bd3..15776ee69f 100644
--- a/.github/workflows/config/md_link_check_config.json
+++ b/.github/workflows/config/md_link_check_config.json
@@ -21,6 +21,12 @@
       {
         "pattern": "^https://epubs.siam.org/doi/10.1137/S0097539796300921"
       },
+      {
+        "pattern": "^https://epubs.siam.org/doi/10.1137/090774999"
+      },
+      {
+        "pattern": "^https://epubs.siam.org/doi/10.1137/090771806"
+      },
       {
         "pattern": "^https://vscode.dev/"
       },
diff --git a/docs/sphinx/using/backends/simulators.rst b/docs/sphinx/using/backends/simulators.rst
index 9d4c552653..3824027c86 100644
--- a/docs/sphinx/using/backends/simulators.rst
+++ b/docs/sphinx/using/backends/simulators.rst
@@ -286,6 +286,7 @@ Specific aspects of the simulation can be configured by defining the following e
 * **`CUDAQ_MPS_MAX_BOND=X`**: The maximum number of singular values to keep (fixed extent truncation). Default: 64.
 * **`CUDAQ_MPS_ABS_CUTOFF=X`**: The cutoff for the largest singular value during truncation. Eigenvalues that are smaller will be trimmed out. Default: 1e-5.
 * **`CUDAQ_MPS_RELATIVE_CUTOFF=X`**: The cutoff for the maximal singular value relative to the largest eigenvalue. Eigenvalues that are smaller than this fraction of the largest singular value will be trimmed out. Default: 1e-5
+* **`CUDAQ_MPS_SVD_ALGO=X`**: The SVD algorithm to use. Valid values are: `GESVD` (QR algorithm), `GESVDJ` (Jacobi method), `GESVDP` (`polar decomposition <https://epubs.siam.org/doi/10.1137/090774999>`__), `GESVDR` (`randomized methods <https://epubs.siam.org/doi/10.1137/090771806>`__). Default: `GESVDJ`.
 
 .. note:: 
 
@@ -297,6 +298,9 @@ Specific aspects of the simulation can be configured by defining the following e
 
   Setting random seed, via :code:`cudaq::set_random_seed`, is not supported for this backend due to a limitation of the :code:`cuTensorNet` library. This will be fixed in future release once this feature becomes available.
 
+.. note::
+    The parallelism of Jacobi method (the default `CUDAQ_MPS_SVD_ALGO` setting) gives GPU better performance on small and medium size matrices.
+    If you expect the a large number of singular values (e.g., increasing the `CUDAQ_MPS_MAX_BOND` setting), please adjust the `CUDAQ_MPS_SVD_ALGO` setting accordingly.  
 
 Default Simulator
 ==================================
diff --git a/runtime/nvqir/cutensornet/mps_simulation_state.cpp b/runtime/nvqir/cutensornet/mps_simulation_state.cpp
index 0a277dd3d8..5f88017152 100644
--- a/runtime/nvqir/cutensornet/mps_simulation_state.cpp
+++ b/runtime/nvqir/cutensornet/mps_simulation_state.cpp
@@ -256,9 +256,9 @@ MPSSimulationState::getAmplitude(const std::vector<int> &basisState) {
     TensorNetState basisTensorNetState(basisState, scratchPad,
                                        state->getInternalContext());
     // Note: this is a basis state, hence bond dim == 1
-    std::vector<MPSTensor> basisStateTensors =
-        basisTensorNetState.factorizeMPS(1, std::numeric_limits<double>::min(),
-                                         std::numeric_limits<double>::min());
+    std::vector<MPSTensor> basisStateTensors = basisTensorNetState.factorizeMPS(
+        1, std::numeric_limits<double>::min(),
+        std::numeric_limits<double>::min(), MPSSettings().svdAlgo);
     const auto overlap = computeOverlap(m_mpsTensors, basisStateTensors);
     for (auto &mpsTensor : basisStateTensors) {
       HANDLE_CUDA_ERROR(cudaFree(mpsTensor.deviceData));
@@ -523,6 +523,33 @@ MPSSettings::MPSSettings() {
 
     cudaq::info("Setting MPS relative cutoff to {}.", relCutoff);
   }
+  using namespace std::literals::string_view_literals;
+  using SvdPair = std::pair<std::string_view, cutensornetTensorSVDAlgo_t>;
+  constexpr std::array<SvdPair, 4> g_stringToAlgoEnum = {
+      SvdPair{"GESVD"sv, CUTENSORNET_TENSOR_SVD_ALGO_GESVD},
+      SvdPair{"GESVDJ"sv, CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ},
+      SvdPair{"GESVDP"sv, CUTENSORNET_TENSOR_SVD_ALGO_GESVDP},
+      SvdPair{"GESVDR"sv, CUTENSORNET_TENSOR_SVD_ALGO_GESVDR}};
+  if (auto *svdAlgoEnvVar = std::getenv("CUDAQ_MPS_SVD_ALGO")) {
+    std::string svdAlgoStr(svdAlgoEnvVar);
+    std::transform(svdAlgoStr.begin(), svdAlgoStr.end(), svdAlgoStr.begin(),
+                   ::toupper);
+    const auto iter = std::lower_bound(
+        g_stringToAlgoEnum.begin(), g_stringToAlgoEnum.end(), svdAlgoStr,
+        [](const SvdPair &pair, const std::string &key) {
+          return pair.first < key;
+        });
+    if (iter == g_stringToAlgoEnum.end() || iter->first != svdAlgoStr) {
+      std::stringstream errorMsg;
+      errorMsg << "Unknown CUDAQ_MPS_SVD_ALGO value ('" << svdAlgoEnvVar
+               << "').\nValid values are:\n";
+      for (const auto &[configStr, _] : g_stringToAlgoEnum)
+        errorMsg << "  - " << configStr << "\n";
+      throw std::runtime_error(errorMsg.str());
+    }
+    svdAlgo = iter->second;
+    cudaq::info("Setting MPS SVD algorithm to {} ({}).", svdAlgo, iter->first);
+  }
 }
 
 void MPSSimulationState::toHost(std::complex<double> *clientAllocatedData,
diff --git a/runtime/nvqir/cutensornet/mps_simulation_state.h b/runtime/nvqir/cutensornet/mps_simulation_state.h
index 54ad4155f5..227058c28b 100644
--- a/runtime/nvqir/cutensornet/mps_simulation_state.h
+++ b/runtime/nvqir/cutensornet/mps_simulation_state.h
@@ -24,6 +24,8 @@ struct MPSSettings {
   double absCutoff = 1e-5;
   // Default relative cutoff
   double relCutoff = 1e-5;
+  // Default SVD algorithm (Jacobi)
+  cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ;
   MPSSettings();
 };
 
diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
index 6bb8c1f5c8..9e232262d7 100644
--- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp
+++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
@@ -27,8 +27,9 @@ class SimulatorMPS : public SimulatorTensorNetBase {
     m_mpsTensors_d.clear();
     // Factorize the state:
     if (m_state->getNumQubits() > 1)
-      m_mpsTensors_d = m_state->factorizeMPS(
-          m_settings.maxBond, m_settings.absCutoff, m_settings.relCutoff);
+      m_mpsTensors_d =
+          m_state->factorizeMPS(m_settings.maxBond, m_settings.absCutoff,
+                                m_settings.relCutoff, m_settings.svdAlgo);
   }
 
   virtual std::size_t calculateStateDim(const std::size_t numQubits) override {
@@ -49,8 +50,9 @@ class SimulatorMPS : public SimulatorTensorNetBase {
     } else {
       // Expand an existing state: Append MPS tensors
       // Factor the existing state
-      auto tensors = m_state->factorizeMPS(
-          m_settings.maxBond, m_settings.absCutoff, m_settings.relCutoff);
+      auto tensors =
+          m_state->factorizeMPS(m_settings.maxBond, m_settings.absCutoff,
+                                m_settings.relCutoff, m_settings.svdAlgo);
       // The right most MPS tensor needs to have one more extra leg (no longer
       // the boundary tensor).
       tensors.back().extents.emplace_back(1);
@@ -170,10 +172,10 @@ class SimulatorMPS : public SimulatorTensorNetBase {
         m_state = std::move(state);
       }
     } else {
-      // FIXME: expand the MPS tensors to the max extent
       if (!ptr) {
-        auto tensors = m_state->factorizeMPS(
-            m_settings.maxBond, m_settings.absCutoff, m_settings.relCutoff);
+        auto tensors =
+            m_state->factorizeMPS(m_settings.maxBond, m_settings.absCutoff,
+                                  m_settings.relCutoff, m_settings.svdAlgo);
         // The right most MPS tensor needs to have one more extra leg (no longer
         // the boundary tensor).
         tensors.back().extents.emplace_back(1);
@@ -198,8 +200,9 @@ class SimulatorMPS : public SimulatorTensorNetBase {
             m_cutnHandle, scratchPad, 1ULL << numQubits,
             reinterpret_cast<std::complex<double> *>(const_cast<void *>(ptr)),
             m_settings.maxBond);
-        auto tensors = m_state->factorizeMPS(
-            m_settings.maxBond, m_settings.absCutoff, m_settings.relCutoff);
+        auto tensors =
+            m_state->factorizeMPS(m_settings.maxBond, m_settings.absCutoff,
+                                  m_settings.relCutoff, m_settings.svdAlgo);
         // Adjust the extents of the last tensor in the original state
         tensors.back().extents.emplace_back(1);
 
@@ -224,8 +227,9 @@ class SimulatorMPS : public SimulatorTensorNetBase {
                                                   scratchPad, m_cutnHandle);
 
     if (m_state->getNumQubits() > 1) {
-      std::vector<MPSTensor> tensors = m_state->factorizeMPS(
-          m_settings.maxBond, m_settings.absCutoff, m_settings.relCutoff);
+      std::vector<MPSTensor> tensors =
+          m_state->factorizeMPS(m_settings.maxBond, m_settings.absCutoff,
+                                m_settings.relCutoff, m_settings.svdAlgo);
       return std::make_unique<MPSSimulationState>(std::move(m_state), tensors,
                                                   scratchPad, m_cutnHandle);
     }
diff --git a/runtime/nvqir/cutensornet/tensornet_state.cpp b/runtime/nvqir/cutensornet/tensornet_state.cpp
index 7b2c61c6a1..d060a4f64e 100644
--- a/runtime/nvqir/cutensornet/tensornet_state.cpp
+++ b/runtime/nvqir/cutensornet/tensornet_state.cpp
@@ -468,6 +468,11 @@ TensorNetState::factorizeMPS(int64_t maxExtent, double absCutoff,
   void *hostWork = nullptr;
   if (hostWorkspaceSize > 0) {
     hostWork = malloc(hostWorkspaceSize);
+    if (!hostWork) {
+      throw std::runtime_error("Unable to allocate " +
+                               std::to_string(hostWorkspaceSize) +
+                               " bytes for cuTensorNet host workspace.");
+    }
   }
 
   HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(
diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h
index 186a40a211..4388a4ed9b 100644
--- a/runtime/nvqir/cutensornet/tensornet_state.h
+++ b/runtime/nvqir/cutensornet/tensornet_state.h
@@ -134,9 +134,9 @@ class TensorNetState {
   /// Returns MPS tensors in GPU device memory.
   /// Note: the caller assumes the ownership of these pointers, thus needs to
   /// clean them up properly (with cudaFree).
-  std::vector<MPSTensor> factorizeMPS(
-      int64_t maxExtent, double absCutoff, double relCutoff,
-      cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ);
+  std::vector<MPSTensor> factorizeMPS(int64_t maxExtent, double absCutoff,
+                                      double relCutoff,
+                                      cutensornetTensorSVDAlgo_t algo);
 
   /// @brief  Compute the expectation value w.r.t. a
   /// `cutensornetNetworkOperator_t`