From b18b02a70a1efeaeb90a6c858d9fb725d14f87ee Mon Sep 17 00:00:00 2001
From: Jun Doi <doichan@jp.ibm.com>
Date: Thu, 28 Sep 2023 15:55:45 +0900
Subject: [PATCH] Fix nested parallel performance

---
 ...p_nested_performance-a3d55f3e85366a5b.yaml |  7 +++++
 src/controllers/aer_controller.hpp            |  2 +-
 src/simulators/statevector/qubitvector.hpp    |  9 +++---
 src/simulators/unitary/unitarymatrix.hpp      | 30 +++++++++----------
 src/transpile/fusion.hpp                      | 20 +++++++++----
 5 files changed, 42 insertions(+), 26 deletions(-)
 create mode 100644 releasenotes/notes/fix_omp_nested_performance-a3d55f3e85366a5b.yaml
diff --git a/releasenotes/notes/fix_omp_nested_performance-a3d55f3e85366a5b.yaml b/releasenotes/notes/fix_omp_nested_performance-a3d55f3e85366a5b.yaml
new file mode 100644
index 0000000000..50a19f6be9
--- /dev/null
+++ b/releasenotes/notes/fix_omp_nested_performance-a3d55f3e85366a5b.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    OpenMP nested parallel simulation (parallel experiments + parallel state
+    update) was very slow because gate fusion uses the unitary simulator
+    internally and it used an OpenMP parallel region. This fix removes the
+    parallel region in gate fusion and improves nested parallel performance.
diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp
index f42ae64ef8..e6005b9a62 100755
--- a/src/controllers/aer_controller.hpp
+++ b/src/controllers/aer_controller.hpp
@@ -541,7 +541,7 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,
 
       // nested should be set to zero if num_threads clause will be used
 #if _OPENMP >= 200805
-      omp_set_max_active_levels(2);
+      omp_set_max_active_levels(1);
 #else
       omp_set_nested(1);
 #endif
diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp
index 3cc84d8a79..a686899358 100755
--- a/src/simulators/statevector/qubitvector.hpp
+++ b/src/simulators/statevector/qubitvector.hpp
@@ -890,11 +890,10 @@ template <typename data_t>
 void QubitVector<data_t>::zero() {
   const int_t END = data_size_; // end for k loop
 
-#pragma omp parallel for if (num_qubits_ > omp_threshold_ && omp_threads_ > 1) \
-    num_threads(omp_threads_)
-  for (int_t k = 0; k < END; ++k) {
-    data_[k] = 0.0;
-  }
+  auto zero_proc = [this](int_t i) { data_[i] = 0.0; };
+  Utils::apply_omp_parallel_for(
+      (num_qubits_ > omp_threshold_ && omp_threads_ > 1), 0, END, zero_proc,
+      omp_threads_);
 }
 
 template <typename data_t>
diff --git a/src/simulators/unitary/unitarymatrix.hpp b/src/simulators/unitary/unitarymatrix.hpp
index 494d57e84e..f406091662 100644
--- a/src/simulators/unitary/unitarymatrix.hpp
+++ b/src/simulators/unitary/unitarymatrix.hpp
@@ -238,13 +238,13 @@ void UnitaryMatrix<data_t>::initialize() {
   BaseVector::zero();
   // Set to be identity matrix
   const int_t nrows = rows_; // end for k loop
-#pragma omp parallel if (BaseVector::num_qubits_ >                             \
-                             BaseVector::omp_threshold_ &&                     \
-                         BaseVector::omp_threads_ > 1)                         \
-    num_threads(BaseVector::omp_threads_)
-  for (int_t k = 0; k < nrows; ++k) {
-    BaseVector::data_[k * (nrows + 1)] = 1.0;
-  }
+  auto initialize_proc = [this](int_t i) {
+    BaseVector::data_[i * (rows_ + 1)] = 1.0;
+  };
+  Utils::apply_omp_parallel_for(
+      (BaseVector::num_qubits_ > BaseVector::omp_threshold_ &&
+       BaseVector::omp_threads_ > 1),
+      0, rows_, initialize_proc, BaseVector::omp_threads_);
 }
 
 template <class data_t>
@@ -260,15 +260,15 @@ void UnitaryMatrix<data_t>::initialize_from_matrix(
         std::to_string(mat.GetRows()) + "," + std::to_string(mat.GetColumns()) +
         ").");
   }
-
-#pragma omp parallel if (BaseVector::num_qubits_ >                             \
-                             BaseVector::omp_threshold_ &&                     \
-                         BaseVector::omp_threads_ > 1)                         \
-    num_threads(BaseVector::omp_threads_)
-  for (int_t row = 0; row < nrows; ++row)
-    for (int_t col = 0; col < nrows; ++col) {
-      BaseVector::data_[row + nrows * col] = mat(row, col);
+  auto initialize_proc = [this, &mat](int_t row) {
+    for (int_t col = 0; col < rows_; ++col) {
+      BaseVector::data_[row + rows_ * col] = mat(row, col);
     }
+  };
+  Utils::apply_omp_parallel_for(
+      (BaseVector::num_qubits_ > BaseVector::omp_threshold_ &&
+       BaseVector::omp_threads_ > 1),
+      0, rows_, initialize_proc, BaseVector::omp_threads_);
 }
 
 template <class data_t>
diff --git a/src/transpile/fusion.hpp b/src/transpile/fusion.hpp
index d7c14ec8b6..a3a1c8b59d 100644
--- a/src/transpile/fusion.hpp
+++ b/src/transpile/fusion.hpp
@@ -851,11 +851,21 @@ void Fusion::optimize_circuit(Circuit &circ, Noise::NoiseModel &noise,
       if (circ.ops.size() % parallelization_)
         ++unit;
 
-#pragma omp parallel for if (parallelization_ > 1) num_threads(parallelization_)
-      for (int_t i = 0; i < parallelization_; i++) {
-        int_t start = unit * i;
-        int_t end = std::min(start + unit, (int_t)circ.ops.size());
-        optimize_circuit(circ, noise, allowed_opset, start, end, fuser, method);
+      if (parallelization_ > 1) {
+#pragma omp parallel for num_threads(parallelization_)
+        for (int_t i = 0; i < parallelization_; i++) {
+          int_t start = unit * i;
+          int_t end = std::min(start + unit, (int_t)circ.ops.size());
+          optimize_circuit(circ, noise, allowed_opset, start, end, fuser,
+                           method);
+        }
+      } else {
+        for (int_t i = 0; i < parallelization_; i++) {
+          int_t start = unit * i;
+          int_t end = std::min(start + unit, (int_t)circ.ops.size());
+          optimize_circuit(circ, noise, allowed_opset, start, end, fuser,
+                           method);
+        }
       }
       result.metadata.add(parallelization_, "fusion", "parallelization");
     }