NVIDIA · 1tnguyen · Jul 2, 2024 · Jun 5, 2024 · Jun 5, 2024 · Jun 5, 2024
@@ -18,6 +18,12 @@
       {
         "pattern": "^https://epubs.siam.org/doi/10.1137/S0097539796300921"
       },
+      {
+        "pattern": "^https://epubs.siam.org/doi/10.1137/090774999"
+      },
+      {
+        "pattern": "^https://epubs.siam.org/doi/10.1137/090771806"
+      },
       {
         "pattern": "^https://vscode.dev/"
       },

diff --git a/docs/sphinx/using/backends/simulators.rst b/docs/sphinx/using/backends/simulators.rst
@@ -286,6 +286,7 @@ Specific aspects of the simulation can be configured by defining the following e
 * **`CUDAQ_MPS_MAX_BOND=X`**: The maximum number of singular values to keep (fixed extent truncation). Default: 64.
 * **`CUDAQ_MPS_ABS_CUTOFF=X`**: The cutoff for the largest singular value during truncation. Eigenvalues that are smaller will be trimmed out. Default: 1e-5.
 * **`CUDAQ_MPS_RELATIVE_CUTOFF=X`**: The cutoff for the maximal singular value relative to the largest eigenvalue. Eigenvalues that are smaller than this fraction of the largest singular value will be trimmed out. Default: 1e-5
+* **`CUDAQ_MPS_SVD_ALGO=X`**: The SVD algorithm to use. Valid values are: `GESVD` (QR algorithm), `GESVDJ` (Jacobi method), `GESVDP` (`polar decomposition <https://epubs.siam.org/doi/10.1137/090774999>`__), `GESVDR` (`randomized methods <https://epubs.siam.org/doi/10.1137/090771806>`__). Default: `GESVDJ`.
 
 .. note:: 
 
@@ -297,6 +298,9 @@ Specific aspects of the simulation can be configured by defining the following e
 
   Setting random seed, via :code:`cudaq::set_random_seed`, is not supported for this backend due to a limitation of the :code:`cuTensorNet` library. This will be fixed in future release once this feature becomes available.
 
+.. note::
+    The parallelism of the Jacobi method (the default `CUDAQ_MPS_SVD_ALGO` setting) gives better GPU performance on small and medium-sized matrices.
+    If you expect a large number of singular values (e.g., after increasing the `CUDAQ_MPS_MAX_BOND` setting), please adjust the `CUDAQ_MPS_SVD_ALGO` setting accordingly.
 
 Default Simulator
 ==================================

diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp
@@ -17,6 +17,8 @@ class SimulatorMPS : public SimulatorTensorNetBase {
   double m_absCutoff = 1e-5;
   // Default relative cutoff
   double m_relCutoff = 1e-5;
+  // Default SVD algorithm (Jacobi)
+  cutensornetTensorSVDAlgo_t m_svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ;
   std::vector<void *> m_mpsTensors_d;
 
 public:
@@ -65,6 +67,29 @@ class SimulatorMPS : public SimulatorTensorNetBase {
       m_relCutoff = relCutoff;
       cudaq::info("Setting MPS relative cutoff to {}.", m_relCutoff);
     }
+    static const std::unordered_map<std::string, cutensornetTensorSVDAlgo_t>
+        g_stringToAlgoEnum{{"GESVD", CUTENSORNET_TENSOR_SVD_ALGO_GESVD},
+                           {"GESVDJ", CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ},
+                           {"GESVDP", CUTENSORNET_TENSOR_SVD_ALGO_GESVDP},
+                           {"GESVDR", CUTENSORNET_TENSOR_SVD_ALGO_GESVDR}};
+
+    if (auto *svdAlgoEnvVar = std::getenv("CUDAQ_MPS_SVD_ALGO")) {
+      std::string svdAlgo(svdAlgoEnvVar);
+      std::transform(svdAlgo.begin(), svdAlgo.end(), svdAlgo.begin(),
+                     ::toupper);
+      const auto iter = g_stringToAlgoEnum.find(svdAlgo);
+      if (iter == g_stringToAlgoEnum.end()) {
+        std::stringstream errorMsg;
+        errorMsg << "Unknown CUDAQ_MPS_SVD_ALGO value ('" << svdAlgoEnvVar
+                 << "').\nValid values are:\n";
+        for (const auto &[configStr, _] : g_stringToAlgoEnum)
+          errorMsg << "  - " << configStr << "\n";
+        throw std::runtime_error(errorMsg.str());
+      }
+
+      m_svdAlgo = iter->second;
+      cudaq::info("Setting MPS SVD algorithm to {}.", svdAlgo);
+    }
   }
 
   virtual void prepareQubitTensorState() override {
@@ -77,7 +102,7 @@ class SimulatorMPS : public SimulatorTensorNetBase {
     // Factorize the state:
     if (m_state->getNumQubits() > 1)
       m_mpsTensors_d =
-          m_state->factorizeMPS(m_maxBond, m_absCutoff, m_relCutoff);
+          m_state->factorizeMPS(m_maxBond, m_absCutoff, m_relCutoff, m_svdAlgo);
   }
 
   virtual std::size_t calculateStateDim(const std::size_t numQubits) override {

diff --git a/runtime/nvqir/cutensornet/tensornet_state.cpp b/runtime/nvqir/cutensornet/tensornet_state.cpp
@@ -321,10 +321,34 @@ TensorNetState::factorizeMPS(int64_t maxExtent, double absCutoff,
     throw std::runtime_error("ERROR: Insufficient workspace size on Device!");
   }
 
+  // Check whether we need host memory workspace
+  int64_t hostWorkspaceSize{0};
+  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(
+      m_cutnHandle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
+      CUTENSORNET_MEMSPACE_HOST, CUTENSORNET_WORKSPACE_SCRATCH,
+      &hostWorkspaceSize));
+
+  void *hostWork = nullptr;
+  if (hostWorkspaceSize > 0) {
+    hostWork = malloc(hostWorkspaceSize);
+    if (!hostWork) {
+      throw std::runtime_error("Unable to allocate " +
+                               std::to_string(hostWorkspaceSize) +
+                               " bytes for cuTensorNet host workspace.");
+    }
+  }
+
+  HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(
+      m_cutnHandle, workDesc, CUTENSORNET_MEMSPACE_HOST,
+      CUTENSORNET_WORKSPACE_SCRATCH, hostWork, hostWorkspaceSize));
+
   // Execute MPS computation
   HANDLE_CUTN_ERROR(cutensornetStateCompute(
       m_cutnHandle, m_quantumState, workDesc, extentsPtr.data(),
       /*strides=*/nullptr, d_mpsTensors.data(), 0));
+  if (hostWork) {
+    free(hostWork);
+  }
   return d_mpsTensors;
 }
 

diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h
@@ -67,9 +67,9 @@ class TensorNetState {
   // Returns MPS tensors in GPU device memory.
   // Note: the caller assumes the ownership of these pointers, thus needs to
   // clean them up properly (with cudaFree).
-  std::vector<void *> factorizeMPS(
-      int64_t maxExtent, double absCutoff, double relCutoff,
-      cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ);
+  std::vector<void *> factorizeMPS(int64_t maxExtent, double absCutoff,
+                                   double relCutoff,
+                                   cutensornetTensorSVDAlgo_t algo);
 
   /// @brief  Compute the expectation value w.r.t. a
   /// `cutensornetNetworkOperator_t`