diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ac3951fc7..517ce982e7 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -512,6 +512,7 @@ endif()
 
 if(AER_DEBUG)
 	set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_DEBUG)
+	set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -g")
 endif()
 
 if(TEST_JSON)
diff --git a/qiskit_aer/backends/aer_compiler.py b/qiskit_aer/backends/aer_compiler.py
index 4909f73537..e4a3a4e9b6 100644
--- a/qiskit_aer/backends/aer_compiler.py
+++ b/qiskit_aer/backends/aer_compiler.py
@@ -491,6 +491,7 @@ def compile_circuit(circuits, basis_gates=None, optypes=None):
     "parameterizations": (list),
     "fusion_parallelization_threshold": (int, np.integer),
     "target_gpus": (list),
+    "runtime_parameter_bind_enable": (bool, np.bool_),
 }
 
 
diff --git a/qiskit_aer/backends/aer_simulator.py b/qiskit_aer/backends/aer_simulator.py
index d34cf1ef27..f845ecd6f0 100644
--- a/qiskit_aer/backends/aer_simulator.py
+++ b/qiskit_aer/backends/aer_simulator.py
@@ -318,6 +318,12 @@ class AerSimulator(AerBackend):
     * ``accept_distributed_results`` (bool): This option enables storing
       results independently in each process (Default: None).
 
+    * ``runtime_parameter_bind_enable`` (bool): If this option is True
+      parameters are bound at runtime by using multi-shots without constructing
+      circuits for each parameter set. For GPU this option can be used with
+      ``batched_shots_gpu`` to run with multiple parameters in a batch.
+      (Default: False).
+
     These backend options only apply when using the ``"statevector"``
     simulation method:
 
@@ -765,6 +771,8 @@ def _default_options(cls):
             # tensor network options
             tensor_network_num_sampling_qubits=10,
             use_cuTensorNet_autotuning=False,
+            # parameter binding
+            runtime_parameter_bind_enable=False,
         )
 
     def __repr__(self):
diff --git a/qiskit_aer/backends/wrappers/aer_controller_binding.hpp b/qiskit_aer/backends/wrappers/aer_controller_binding.hpp
index 67e057c74f..f614e4483d 100644
--- a/qiskit_aer/backends/wrappers/aer_controller_binding.hpp
+++ b/qiskit_aer/backends/wrappers/aer_controller_binding.hpp
@@ -412,6 +412,14 @@ void bind_aer_controller(MODULE m) {
       "target_gpus",
       [](const Config &config) { return config.target_gpus.val; },
       [](Config &config, reg_t val) { config.target_gpus.value(val); });
+  aer_config.def_property(
+      "runtime_parameter_bind_enable",
+      [](const Config &config) {
+        return config.runtime_parameter_bind_enable.val;
+      },
+      [](Config &config, bool val) {
+        config.runtime_parameter_bind_enable.value(val);
+      });
 
   aer_config.def(py::pickle(
       [](const AER::Config &config) {
@@ -500,11 +508,12 @@ void bind_aer_controller(MODULE m) {
                 79, config.extended_stabilizer_norm_estimation_default_samples),
             write_value(80, config.shot_branching_enable),
             write_value(81, config.shot_branching_sampling_enable),
-            write_value(82, config.target_gpus));
+            write_value(82, config.target_gpus),
+            write_value(83, config.runtime_parameter_bind_enable));
       },
       [](py::tuple t) {
         AER::Config config;
-        if (t.size() != 82)
+        if (t.size() != 84)
           throw std::runtime_error("Invalid serialization format.");
 
         read_value(t, 0, config.shots);
@@ -594,6 +603,7 @@ void bind_aer_controller(MODULE m) {
         read_value(t, 80, config.shot_branching_enable);
         read_value(t, 81, config.shot_branching_sampling_enable);
         read_value(t, 82, config.target_gpus);
+        read_value(t, 83, config.runtime_parameter_bind_enable);
         return config;
       }));
 }
diff --git a/releasenotes/notes/add_executor-ba4870f86ed5d8ec.yaml b/releasenotes/notes/add_executor-ba4870f86ed5d8ec.yaml
new file mode 100644
index 0000000000..e1088061fa
--- /dev/null
+++ b/releasenotes/notes/add_executor-ba4870f86ed5d8ec.yaml
@@ -0,0 +1,30 @@
+---
+features:
+  - |
+    This release restructures ``State`` classes.
+    It adds circuit executor classes that run a circuit and manage multiple
+    states for multi-shots simulations or multi-chunk simulations for a large
+    number of qubits.
+    Previously ``StateChunk`` class manages multiple chunks for multi-shots or
+    multi-chunk simulations but now ``State`` class only has one state
+    and all the parallelization codes are moved to ``Executor`` classes.
+    Now all ``State`` classes are independent from parallelization.
+    Also some of the functions in ``Aer::Controller`` class are moved to
+    ``CircuitExecutor::Executor`` class.
+  - |
+    Shot-branching technique that accelerates dynamic circuits simulations
+    is implemented with restructured ``Executor`` classes.
+    Shot-branching is currently applicable to statevector, density_matrix
+    and tensor_network methods.
+    Shot-branching provides dynamic distribution of multi-shots
+    by branching states when applying dynamic operations
+    (measure, reset, initialize, noises).
+    By default ``shot_branching_enable`` is disabled.
+    And by setting ``shot_branching_sampling_enable``, final measures will be
+    done by sampling measure that will speed up to get counts for multiple shots
+    sharing the same state.
+  - |
+    New option for GPU simulation ``target_gpus`` is added.
+    A list of GPUs used for the simulation can be set by this option.
+    Without this option, all the available GPUs are used.
+    For example, if there are 4 GPUs, ``target_gpus=[0, 2]`` will use 2 GPUs.
diff --git a/releasenotes/notes/runtime_parameter_binding-d2c57255f02729a1.yaml b/releasenotes/notes/runtime_parameter_binding-d2c57255f02729a1.yaml
new file mode 100644
index 0000000000..04573f93f7
--- /dev/null
+++ b/releasenotes/notes/runtime_parameter_binding-d2c57255f02729a1.yaml
@@ -0,0 +1,8 @@
+---
+features:
+  - |
+    A runtime parameter binding option is implemented to bind parameters at
+    runtime to a single circuit instead of running multiple circuits as input.
+    An option ``runtime_parameter_bind_enable=True`` enables this feature and
+    for GPU, ``batched_shots_gpu=True`` should be also set to speed up
+    simulating parameterized circuit.
diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp
index e6005b9a62..d216b4ff9e 100755
--- a/src/controllers/aer_controller.hpp
+++ b/src/controllers/aer_controller.hpp
@@ -194,6 +194,9 @@ class Controller {
   int myrank_ = 0;
   int num_processes_ = 1;
   int num_process_per_experiment_ = 1;
+
+  // runtime parameter binding
+  bool runtime_parameter_bind_ = false;
 };
 
 //=========================================================================
@@ -329,6 +332,10 @@ void Controller::set_config(const Config &config) {
     throw std::runtime_error(std::string("Invalid simulation precision (") +
                              precision + std::string(")."));
   }
+
+  // check if runtime binding is enabled
+  if (config.runtime_parameter_bind_enable.has_value())
+    runtime_parameter_bind_ = config.runtime_parameter_bind_enable.value();
 }
 
 void Controller::clear_config() {
@@ -502,7 +509,14 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,
   auto methods = simulation_methods(config, circuits, noise_model);
 
   // Initialize Result object for the given number of experiments
-  Result result(circuits.size());
+  uint_t result_size;
+  reg_t result_offset(circuits.size());
+  result_size = 0;
+  for (int_t i = 0; i < circuits.size(); i++) {
+    result_offset[i] = result_size;
+    result_size += circuits[i]->num_bind_params;
+  }
+  Result result(result_size);
   // Initialize circuit executors for each circuit
   std::vector<std::shared_ptr<CircuitExecutor::Base>> executors(
       circuits.size());
@@ -514,12 +528,15 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,
 
     // set parallelization for experiments
     try {
+      uint_t res_pos = 0;
       for (int i = 0; i < circuits.size(); i++) {
         executors[i] = make_circuit_executor(methods[i]);
         required_memory_mb_list[i] =
             executors[i]->required_memory_mb(config, *circuits[i], noise_model);
-        result.results[i].metadata.add(required_memory_mb_list[i],
-                                       "required_memory_mb");
+        for (int j = 0; j < circuits[i]->num_bind_params; j++) {
+          result.results[res_pos++].metadata.add(required_memory_mb_list[i],
+                                                 "required_memory_mb");
+        }
       }
       set_parallelization_experiments(required_memory_mb_list);
     } catch (std::exception &e) {
@@ -565,33 +582,40 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,
     // average random seed to set the same seed to each process (when
     // seed_simulator is not set)
     if (num_processes_ > 1) {
-      reg_t seeds(circuits.size());
-      reg_t avg_seeds(circuits.size());
-      for (int_t i = 0; i < circuits.size(); i++)
-        seeds[i] = circuits[i]->seed;
-      MPI_Allreduce(seeds.data(), avg_seeds.data(), circuits.size(),
-                    MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
-      for (int_t i = 0; i < circuits.size(); i++)
-        circuits[i]->seed = avg_seeds[i] / num_processes_;
-    }
-#endif
-
-    const int NUM_RESULTS = result.results.size();
-    // following looks very similar but we have to separate them to avoid omp
-    // nested loops that causes performance degradation (DO NOT use if statement
-    // in #pragma omp)
-    if (parallel_experiments_ == 1) {
-      for (int i = 0; i < NUM_RESULTS; i++) {
-        executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i],
-                                  sim_device_, result.results[i]);
+      reg_t seeds(result_size);
+      reg_t avg_seeds(result_size);
+      int_t iseed = 0;
+      for (int_t i = 0; i < circuits.size(); i++) {
+        if (circuits[i]->num_bind_params > 1) { // one seed per parameter set
+          for (int_t j = 0; j < circuits[i]->num_bind_params; j++)
+            seeds[iseed++] = circuits[i]->seed_for_params[j];
+        } else
+          seeds[iseed++] = circuits[i]->seed;
       }
-    } else {
-#pragma omp parallel for num_threads(parallel_experiments_)
-      for (int i = 0; i < NUM_RESULTS; i++) {
-        executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i],
-                                  sim_device_, result.results[i]);
+      MPI_Allreduce(seeds.data(), avg_seeds.data(), result_size, MPI_UINT64_T,
+                    MPI_SUM, MPI_COMM_WORLD);
+      iseed = 0;
+      for (int_t i = 0; i < circuits.size(); i++) {
+        if (circuits[i]->num_bind_params > 1) { // write back in the same order
+          for (int_t j = 0; j < circuits[i]->num_bind_params; j++)
+            circuits[i]->seed_for_params[j] =
+                avg_seeds[iseed++] / num_processes_;
+        } else
+          circuits[i]->seed = avg_seeds[iseed++] / num_processes_;
       }
     }
+#endif
+
+    auto run_circuits = [this, &executors, &circuits, &noise_model, &config,
+                         &methods, &result, &result_offset](int_t i) {
+      executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i],
+                                sim_device_,
+                                result.results.begin() + result_offset[i]);
+    };
+    Utils::apply_omp_parallel_for((parallel_experiments_ > 1), 0,
+                                  circuits.size(), run_circuits,
+                                  parallel_experiments_);
+
     executors.clear();
 
     // Check each experiment result for completed status.
@@ -599,7 +623,7 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,
 
     bool all_failed = true;
     result.status = Result::Status::completed;
-    for (int i = 0; i < NUM_RESULTS; ++i) {
+    for (int i = 0; i < result.results.size(); ++i) {
       auto &experiment = result.results[i];
       if (experiment.status == ExperimentResult::Status::completed) {
         all_failed = false;
diff --git a/src/controllers/controller_execute.hpp b/src/controllers/controller_execute.hpp
index 4c2015461f..f3128a7739 100644
--- a/src/controllers/controller_execute.hpp
+++ b/src/controllers/controller_execute.hpp
@@ -67,7 +67,12 @@ Result controller_execute(std::vector<std::shared_ptr<Circuit>> &input_circs,
   //    pars = [par0, par1, ...] is a list of different parameterizations
   using pos_t = std::pair<int_t, int_t>;
   using exp_params_t = std::vector<std::pair<pos_t, std::vector<double>>>;
-  std::vector<exp_params_t> param_table = config.param_table;
+  std::vector<exp_params_t> &param_table = config.param_table;
+
+  // check if runtime binding is enabled
+  bool runtime_parameter_bind = false;
+  if (config.runtime_parameter_bind_enable.has_value())
+    runtime_parameter_bind = config.runtime_parameter_bind_enable.value();
 
   // Validate parameterizations for number of circuis
   if (!param_table.empty() && param_table.size() != num_circs) {
@@ -78,6 +83,8 @@ Result controller_execute(std::vector<std::shared_ptr<Circuit>> &input_circs,
   std::vector<std::shared_ptr<Circuit>> circs;
   std::vector<std::shared_ptr<Circuit>> template_circs;
 
+  using myclock_t = std::chrono::high_resolution_clock;
+  auto timer_start = myclock_t::now();
   try {
     // Load circuits
     for (size_t i = 0; i < num_circs; i++) {
@@ -93,39 +100,41 @@ Result controller_execute(std::vector<std::shared_ptr<Circuit>> &input_circs,
         circ->set_params(false);
         circ->set_metadata(config, truncate);
         // Load different parameterizations of the initial circuit
-        const auto circ_params = param_table[i];
+        const auto &circ_params = param_table[i];
         const size_t num_params = circ_params[0].second.size();
         const size_t num_instr = circ->ops.size();
-        for (size_t j = 0; j < num_params; j++) {
+
+        if (runtime_parameter_bind && num_params > 1) {
           // Make a copy of the initial circuit
           auto param_circ = std::make_shared<Circuit>(*circ);
+          param_circ->num_bind_params = num_params;
+
           for (const auto &params : circ_params) {
             const auto instr_pos = params.first.first;
             const auto param_pos = params.first.second;
             // Validation
             if (instr_pos == AER::Config::GLOBAL_PHASE_POS) {
               // negative position is for global phase
-              param_circ->global_phase_angle = params.second[j];
-            } else {
-              if (instr_pos >= num_instr) {
-                std::cout << "Invalid parameterization: instruction position "
-                             "out of range: "
-                          << instr_pos << std::endl;
-                throw std::invalid_argument(
-                    R"(Invalid parameterization: instruction position out of range)");
-              }
-              auto &op = param_circ->ops[instr_pos];
+              param_circ->global_phase_for_params.assign(
+                  params.second.begin(), params.second.begin() + num_params);
+              continue; // the global phase has no entry in ops to bind
+            } else if (instr_pos >= num_instr) {
+              throw std::invalid_argument(
+                  R"(Invalid parameterized qobj: instruction position out of range)");
+            }
+            auto &op = param_circ->ops[instr_pos];
+            if (!op.has_bind_params) {
               if (param_pos >= op.params.size()) {
                 throw std::invalid_argument(
-                    R"(Invalid parameterization: instruction param position out of range)");
-              }
-              if (j >= params.second.size()) {
-                throw std::invalid_argument(
-                    R"(Invalid parameterization: parameterization value out of range)");
+                    R"(Invalid parameterized qobj: instruction param position out of range)");
               }
-              // Update the param
-              op.params[param_pos] = params.second[j];
+              // resize parameter array
+              op.params.resize(op.params.size() * num_params);
+              op.has_bind_params = true;
             }
+            uint_t stride = op.params.size() / num_params;
+            for (size_t j = 0; j < num_params; j++)
+              op.params[param_pos + stride * j] = params.second[j];
           }
           // Run truncation.
           // TODO: Truncation should be performed and parameters should be
@@ -137,7 +146,53 @@ Result controller_execute(std::vector<std::shared_ptr<Circuit>> &input_circs,
             param_circ->set_metadata(config, true);
           }
           circs.push_back(param_circ);
-          template_circs.push_back(circ);
+          for (size_t j = 0; j < num_params; j++)
+            template_circs.push_back(circ);
+        } else {
+          for (size_t j = 0; j < num_params; j++) {
+            // Make a copy of the initial circuit
+            auto param_circ = std::make_shared<Circuit>(*circ);
+            for (const auto &params : circ_params) {
+              const auto instr_pos = params.first.first;
+              const auto param_pos = params.first.second;
+              // Validation
+              if (instr_pos == AER::Config::GLOBAL_PHASE_POS) {
+                // negative position is for global phase (set on the copy)
+                param_circ->global_phase_angle = params.second[j];
+              } else {
+                if (instr_pos >= num_instr) {
+                  std::cout << "Invalid parameterization: instruction position "
+                               "out of range: "
+                            << instr_pos << std::endl;
+                  throw std::invalid_argument(
+                      R"(Invalid parameterization: instruction position out of range)");
+                }
+                auto &op = param_circ->ops[instr_pos];
+                if (param_pos >= op.params.size()) {
+                  throw std::invalid_argument(
+                      R"(Invalid parameterization: instruction param position out of range)");
+                }
+                if (j >= params.second.size()) {
+                  throw std::invalid_argument(
+                      R"(Invalid parameterization: parameterization value out of range)");
+                }
+                // Update the param
+                op.params[param_pos] = params.second[j];
+              }
+            }
+            // Run truncation.
+            // TODO: Truncation should be performed and parameters should be
+            // resolved after it. However, parameters are associated with
+            // indices of instructions, which can be changed in truncation.
+            // Therefore, current implementation performs truncation for each
+            // parameter set.
+            if (truncate) {
+              param_circ->set_params(true);
+              param_circ->set_metadata(config, true);
+            }
+            circs.push_back(param_circ);
+            template_circs.push_back(circ);
+          }
         }
       }
     }
@@ -148,7 +203,6 @@ Result controller_execute(std::vector<std::shared_ptr<Circuit>> &input_circs,
     result.message = std::string("Failed to load circuits: ") + e.what();
     return result;
   }
-
   int_t seed = -1;
   uint_t seed_shift = 0;
 
@@ -157,10 +211,23 @@ Result controller_execute(std::vector<std::shared_ptr<Circuit>> &input_circs,
   else
     seed = circs[0]->seed;
 
-  for (auto &circ : circs) {
-    circ->seed = seed + seed_shift;
-    seed_shift += 2113;
+  if (runtime_parameter_bind) {
+    for (auto &circ : circs) {
+      circ->seed = seed + seed_shift;
+      circ->seed_for_params.resize(circ->num_bind_params);
+      for (int_t i = 0; i < circ->num_bind_params; i++) {
+        circ->seed_for_params[i] = seed + seed_shift;
+        seed_shift += 2113;
+      }
+    }
+  } else {
+    for (auto &circ : circs) {
+      circ->seed = seed + seed_shift;
+      seed_shift += 2113;
+    }
   }
+  auto time_taken =
+      std::chrono::duration<double>(myclock_t::now() - timer_start).count();
 
   // Fix for MacOS and OpenMP library double initialization crash.
   // Issue: https://github.com/Qiskit/qiskit-aer/issues/1
@@ -170,6 +237,7 @@ Result controller_execute(std::vector<std::shared_ptr<Circuit>> &input_circs,
 
   for (size_t i = 0; i < ret.results.size(); ++i)
     ret.results[i].circ_id = template_circs[i]->circ_id;
+  ret.metadata.add(time_taken, "time_taken_parameter_binding");
 
   return ret;
 }
diff --git a/src/framework/circuit.hpp b/src/framework/circuit.hpp
index bc7645d694..a21a7fbd8c 100644
--- a/src/framework/circuit.hpp
+++ b/src/framework/circuit.hpp
@@ -63,6 +63,11 @@ class Circuit {
   double global_phase_angle = 0;
   bool remapped_qubits = false; // True if qubits have been remapped
 
+  // for runtime parameter bind, number of parameters per circuit
+  uint_t num_bind_params = 1;
+  reg_t seed_for_params;             // random seed for each parameter
+  rvector_t global_phase_for_params; // global phase angles for each param
+
   // Constructor
   // The constructor automatically calculates the num_qubits, num_memory,
   // num_registers parameters by scanning the input list of ops.
diff --git a/src/framework/config.hpp b/src/framework/config.hpp
index 60a5d7c313..1074f7acdf 100644
--- a/src/framework/config.hpp
+++ b/src/framework/config.hpp
@@ -171,6 +171,7 @@ struct Config {
   optional<uint_t> memory_blocking_bits;
   optional<uint_t> extended_stabilizer_norm_estimation_default_samples;
   optional<reg_t> target_gpus;
+  optional<bool> runtime_parameter_bind_enable;
 
   void clear() {
     shots = 1024;
@@ -270,7 +271,9 @@ struct Config {
     unitary_parallel_threshold.clear();
     memory_blocking_bits.clear();
     extended_stabilizer_norm_estimation_default_samples.clear();
+
     target_gpus.clear();
+    runtime_parameter_bind_enable.clear();
   }
 
   void merge(const Config &other) {
@@ -412,8 +415,12 @@ struct Config {
     if (other.extended_stabilizer_norm_estimation_default_samples.has_value())
       extended_stabilizer_norm_estimation_default_samples.value(
           other.extended_stabilizer_norm_estimation_default_samples.value());
+
     if (other.target_gpus.has_value())
       target_gpus.value(other.target_gpus.value());
+    if (other.runtime_parameter_bind_enable.has_value())
+      runtime_parameter_bind_enable.value(
+          other.runtime_parameter_bind_enable.value());
   }
 };
 
@@ -529,6 +536,8 @@ inline void from_json(const json_t &js, Config &config) {
   get_value(config.extended_stabilizer_norm_estimation_default_samples,
             "extended_stabilizer_norm_estimation_default_samples", js);
   get_value(config.target_gpus, "target_gpus", js);
+  get_value(config.runtime_parameter_bind_enable,
+            "runtime_parameter_bind_enable", js);
 }
 
 } // namespace AER
diff --git a/src/framework/operations.hpp b/src/framework/operations.hpp
index 4ec55757ff..335528de59 100755
--- a/src/framework/operations.hpp
+++ b/src/framework/operations.hpp
@@ -308,6 +308,9 @@ struct Op {
 
   // Save
   DataSubType save_type = DataSubType::single;
+
+  // runtime parameter bind
+  bool has_bind_params = false;
 };
 
 inline std::ostream &operator<<(std::ostream &s, const Op &op) {
@@ -940,6 +943,30 @@ inline Op make_qerror_loc(const reg_t &qubits, const std::string &label,
   return op;
 }
 
+// Make the op for one parameter set of a runtime-bound op: copy the header
+// fields of src and slice out the iparam-th of num_params equal chunks of
+// src.params or, when src has no params, of src.mats
+inline Op bind_parameter(const Op &src, const uint_t iparam,
+                         const uint_t num_params) {
+  Op bound;
+  bound.type = src.type;
+  bound.name = src.name;
+  bound.qubits = src.qubits;
+  bound.conditional = src.conditional;
+  bound.conditional_reg = src.conditional_reg;
+
+  if (!src.params.empty()) {
+    const uint_t stride = src.params.size() / num_params;
+    const auto first = src.params.begin() + iparam * stride;
+    bound.params.assign(first, first + stride);
+  } else if (!src.mats.empty()) {
+    const uint_t stride = src.mats.size() / num_params;
+    const auto first = src.mats.begin() + iparam * stride;
+    bound.mats.assign(first, first + stride);
+  }
+  return bound;
+}
+
 //------------------------------------------------------------------------------
 // JSON conversion
 //------------------------------------------------------------------------------
diff --git a/src/framework/results/data/metadata.hpp b/src/framework/results/data/metadata.hpp
index cf7cb39bb1..789906b903 100644
--- a/src/framework/results/data/metadata.hpp
+++ b/src/framework/results/data/metadata.hpp
@@ -64,6 +64,8 @@ struct Metadata : public DataMap<SingleData, json_t, 1>,
 
   // Combine stored data
   Metadata &combine(Metadata &&other);
+
+  Metadata &copy(Metadata &other);
 };
 
 //------------------------------------------------------------------------------
@@ -77,6 +79,13 @@ Metadata &Metadata::combine(Metadata &&other) {
   return *this;
 }
 
+Metadata &Metadata::copy(Metadata &other) {
+  DataMap<SingleData, json_t, 1>::copy(other);
+  DataMap<SingleData, json_t, 2>::copy(other);
+  DataMap<SingleData, json_t, 3>::copy(other);
+  return *this;
+}
+
 json_t Metadata::to_json() {
   json_t result = json_t::object();
   DataMap<SingleData, json_t, 1>::add_to_json(result);
diff --git a/src/framework/results/data/subtypes/data_map.hpp b/src/framework/results/data/subtypes/data_map.hpp
index 8c942ae0ac..2d46bd19f9 100644
--- a/src/framework/results/data/subtypes/data_map.hpp
+++ b/src/framework/results/data/subtypes/data_map.hpp
@@ -43,6 +43,9 @@ class DataMap {
   // Combine with another data object
   void combine(DataMap<Data, T, N> &&other);
 
+  // copy from another data object
+  void copy(DataMap<Data, T, N> &other);
+
   // Clear all stored data
   void clear();
 
@@ -75,6 +78,9 @@ class DataMap<Data, T, 1> {
   // Combine with another data object
   void combine(DataMap<Data, T, 1> &&other);
 
+  // copy from another data object
+  void copy(DataMap<Data, T, 1> &other);
+
   // Clear all stored data
   void clear();
 
@@ -128,6 +134,22 @@ void DataMap<Data, T, N>::combine(DataMap<Data, T, N> &&other) {
   }
 }
 
+template <template <class> class Data, class T, size_t N>
+void DataMap<Data, T, N>::copy(DataMap<Data, T, N> &other) {
+  if (!enabled)
+    return;
+  for (auto &entry : other.data_) {
+    if (data_.find(entry.first) != data_.end()) {
+      // key already stored: combine with a duplicate of other's value
+      auto duplicate = entry.second;
+      data_[entry.first].combine(std::move(duplicate));
+    } else {
+      // new key: copy the value without accumulating
+      data_[entry.first] = entry.second;
+    }
+  }
+}
+
 template <template <class> class Data, class T, size_t N>
 void DataMap<Data, T, N>::clear() {
   data_.clear();
@@ -186,6 +208,22 @@ void DataMap<Data, T, 1>::combine(DataMap<Data, T, 1> &&other) {
   }
 }
 
+template <template <class> class Data, class T>
+void DataMap<Data, T, 1>::copy(DataMap<Data, T, 1> &other) {
+  if (!enabled)
+    return;
+  for (auto &entry : other.data_) {
+    if (data_.find(entry.first) != data_.end()) {
+      // key already stored: combine with a duplicate of other's value
+      auto duplicate = entry.second;
+      data_[entry.first].combine(std::move(duplicate));
+    } else {
+      // new key: copy the value without accumulating
+      data_[entry.first] = entry.second;
+    }
+  }
+}
+
 template <template <class> class Data, class T>
 void DataMap<Data, T, 1>::clear() {
   data_.clear();
diff --git a/src/noise/noise_model.hpp b/src/noise/noise_model.hpp
index feff38054e..ef3df0a4ce 100644
--- a/src/noise/noise_model.hpp
+++ b/src/noise/noise_model.hpp
@@ -307,6 +307,8 @@ Circuit NoiseModel::sample_noise_circuit(const Circuit &circ, RngEngine &rng,
   noisy_circ.seed = circ.seed;
   noisy_circ.shots = circ.shots;
   noisy_circ.header = circ.header;
+  noisy_circ.num_bind_params = circ.num_bind_params;
+  noisy_circ.seed_for_params = circ.seed_for_params;
 
   // Reserve double length of ops just to be safe
   noisy_circ.ops.reserve(2 * circ.ops.size());
diff --git a/src/simulators/batch_shots_executor.hpp b/src/simulators/batch_shots_executor.hpp
index bc991b2a0c..0c4a67cb56 100644
--- a/src/simulators/batch_shots_executor.hpp
+++ b/src/simulators/batch_shots_executor.hpp
@@ -16,6 +16,7 @@
 #define _batch_shots_executor_hpp_
 
 #include "simulators/parallel_state_executor.hpp"
+#include "transpile/batch_converter.hpp"
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -29,6 +30,9 @@ namespace AER {
 
 namespace CircuitExecutor {
 
+using OpItr = std::vector<Operations::Op>::const_iterator;
+using ResultItr = std::vector<ExperimentResult>::iterator;
+
 //-------------------------------------------------------------------------
 // batched-shots executor class implementation
 //-------------------------------------------------------------------------
@@ -43,8 +47,7 @@ class BatchShotsExecutor : public virtual MultiStateExecutor<state_t> {
       16; // multi-shot parallelization is applied if qubits is less than max
           // qubits
   bool enable_batch_multi_shots_ =
-      false;                 // multi-shot parallelization can be applied
-  uint_t local_state_index_; // local shot ID of current loop
+      false; // multi-shot parallelization can be applied
 public:
   BatchShotsExecutor();
   virtual ~BatchShotsExecutor();
@@ -54,33 +57,48 @@ class BatchShotsExecutor : public virtual MultiStateExecutor<state_t> {
   void set_parallelization(const Config &config, const Circuit &circ,
                            const Noise::NoiseModel &noise) override;
 
+  void run_circuit_with_sampling(Circuit &circ, const Config &config,
+                                 RngEngine &init_rng,
+                                 ResultItr result) override;
+
   void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
                          const Config &config, RngEngine &init_rng,
-                         ExperimentResult &result, bool sample_noise) override;
+                         ResultItr result_it, bool sample_noise) override;
 
   // apply ops for multi-shots to one group
   template <typename InputIterator>
   void apply_ops_batched_shots_for_group(int_t i_group, InputIterator first,
                                          InputIterator last,
                                          const Noise::NoiseModel &noise,
-                                         ExperimentResult &result,
-                                         RngEngine &init_rng, uint_t rng_seed,
+                                         ResultItr result,
+                                         std::vector<RngEngine> &rng,
                                          bool final_ops);
 
   // apply op to multiple shots , return flase if op is not supported to execute
   // in a batch
   virtual bool apply_batched_op(const int_t istate, const Operations::Op &op,
-                                ExperimentResult &result,
-                                std::vector<RngEngine> &rng,
+                                ResultItr result, std::vector<RngEngine> &rng,
                                 bool final_op = false) {
     return false;
   }
 
   // apply sampled noise to multiple-shots (this is used for ops contains
   // non-Pauli operators)
-  void apply_batched_noise_ops(
-      const int_t i_group, const std::vector<std::vector<Operations::Op>> &ops,
-      ExperimentResult &result, std::vector<RngEngine> &rng);
+  void
+  apply_batched_noise_ops(const int_t i_group,
+                          const std::vector<std::vector<Operations::Op>> &ops,
+                          ResultItr result, std::vector<RngEngine> &rng);
+
+  // batched expval Pauli
+  void apply_batched_expval(const int_t istate, const Operations::Op &op,
+                            ResultItr result);
+
+  // sample measure for runtime parameter binding
+  template <typename InputIterator>
+  void batched_measure_sampler(InputIterator first_meas,
+                               InputIterator last_meas, uint_t shots,
+                               uint_t i_group, ResultItr result,
+                               std::vector<RngEngine> &rng);
 };
 
 template <class state_t>
@@ -96,6 +114,11 @@ void BatchShotsExecutor<state_t>::set_config(const Config &config) {
   // enable batched multi-shots/experiments optimization
   batched_shots_gpu_ = config.batched_shots_gpu;
 
+  // enable batch execution for runtime parameter binding
+  if (Base::num_bind_params_ > 1 && Base::sim_device_ == Device::GPU) {
+    batched_shots_gpu_ = true;
+  }
+
   batched_shots_gpu_max_qubits_ = config.batched_shots_gpu_max_qubits;
   if (Base::method_ == Method::density_matrix ||
       Base::method_ == Method::unitary)
@@ -110,12 +133,10 @@ void BatchShotsExecutor<state_t>::set_parallelization(
   enable_batch_multi_shots_ = false;
   if (batched_shots_gpu_ && Base::sim_device_ != Device::CPU) {
     enable_batch_multi_shots_ = true;
-    if (circ.num_qubits >= batched_shots_gpu_max_qubits_)
+    if (circ.num_qubits > batched_shots_gpu_max_qubits_)
       enable_batch_multi_shots_ = false;
-    else if (circ.shots == 1)
+    else if (circ.shots == 1 && circ.num_bind_params == 1)
       enable_batch_multi_shots_ = false;
-    //    else if (Base::multiple_chunk_required(circ, noise))
-    //      enable_batch_multi_shots_ = false;
   }
 
 #ifdef AER_CUSTATEVEC
@@ -125,14 +146,180 @@ void BatchShotsExecutor<state_t>::set_parallelization(
 #endif
 }
 
+// Batched execution for runtime parameter binding: each local state holds one
+// parameter set; ops before the first measure run in batch, then circ.shots
+// samples per state are saved to result_it[iparam]. Delegates to Executor when
+// a single parameter set is bound or batched multi-shots is disabled.
+template <class state_t>
+void BatchShotsExecutor<state_t>::run_circuit_with_sampling(
+    Circuit &circ, const Config &config, RngEngine &init_rng,
+    ResultItr result_it) {
+  if (circ.num_bind_params == 1 || !enable_batch_multi_shots_) {
+    return Executor<state_t>::run_circuit_with_sampling(circ, config, init_rng,
+                                                        result_it);
+  }
+
+  Noise::NoiseModel dummy_noise;
+  state_t dummy_state;
+  int_t i, i_begin, n_shots;
+
+  Base::num_qubits_ = circ.num_qubits;
+  Base::num_creg_memory_ = circ.num_memory;
+  Base::num_creg_registers_ = circ.num_registers;
+  Base::num_bind_params_ = circ.num_bind_params;
+
+  if (Base::sim_device_ == Device::GPU) {
+#ifdef _OPENMP
+    if (omp_get_num_threads() == 1)
+      Base::shot_omp_parallel_ = true;
+#endif
+  } else if (Base::sim_device_ == Device::ThrustCPU) {
+    Base::shot_omp_parallel_ = false;
+  }
+
+  // distribute parameters
+  Base::set_distribution(circ.num_bind_params);
+  // clamp the estimate to >= 1 MB so the divisions below cannot divide by zero
+  uint_t mem = std::max<uint_t>(
+      1, Base::required_memory_mb(config, circ, dummy_noise));
+  if (Base::sim_device_ == Device::GPU && Base::num_gpus_ > 0)
+    Base::num_max_shots_ = Base::max_gpu_memory_mb_ * 8 / 10 / mem;
+  else
+    Base::num_max_shots_ = Base::max_memory_mb_ / mem;
+  if (Base::num_max_shots_ == 0)
+    Base::num_max_shots_ = 1;
+
+  auto fusion_pass = Base::transpile_fusion(circ.opset(), config);
+  ExperimentResult fusion_result;
+  fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
+                               fusion_result);
+  // convert parameters into matrix in cvector_t format
+  auto timer_start = myclock_t::now();
+  Transpile::BatchConverter batch_converter;
+  batch_converter.set_config(config);
+  batch_converter.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
+                                   fusion_result);
+  auto time_taken =
+      std::chrono::duration<double>(myclock_t::now() - timer_start).count();
+  for (i = 0; i < circ.num_bind_params; i++) {
+    ExperimentResult &result = *(result_it + i);
+    result.metadata.copy(fusion_result.metadata);
+    // Add batched multi-shots optimization metadata
+    result.metadata.add(true, "batched_shots_optimization");
+    result.metadata.add(time_taken, "parameter_bind_batch_converter_time");
+  }
+
+  Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ);
+
+#ifdef AER_MPI
+  // if shots are distributed to MPI processes, allocate cregs to be gathered
+  if (Base::num_process_per_experiment_ > 1)
+    Base::cregs_.resize(circ.num_bind_params * circ.shots);
+#endif
+
+  auto first_meas = circ.first_measure_pos; // Position of first measurement op
+  bool final_ops = (first_meas == circ.ops.size());
+
+  // sampling follows: set max_sampling_shots_ to the number of shots per state
+  if (circ.ops.begin() + first_meas != circ.ops.end())
+    Base::max_sampling_shots_ = circ.shots;
+
+  i_begin = 0;
+  while (i_begin < Base::num_local_states_) {
+    // loop for states can be stored in available memory
+    n_shots = Base::num_local_states_ - i_begin;
+    n_shots = std::min(n_shots, (int_t)Base::num_max_shots_);
+    // allocate shots
+    this->allocate_states(n_shots, config);
+
+    // Set state config
+    for (i = 0; i < n_shots; i++) {
+      Base::states_[i].set_parallelization(Base::parallel_state_update_);
+    }
+
+    // initialization (equivalent to initialize_qreg + initialize_creg)
+    auto init_group = [this](int_t ig) {
+      for (uint_t j = Base::top_state_of_group_[ig];
+           j < Base::top_state_of_group_[ig + 1]; j++) {
+        // enabling batch shots optimization
+        Base::states_[j].qreg().enable_batch(true);
+        // initialize qreg here
+        Base::states_[j].qreg().set_num_qubits(Base::num_qubits_);
+        Base::states_[j].qreg().initialize();
+        // initialize creg here
+        Base::states_[j].qreg().initialize_creg(Base::num_creg_memory_,
+                                                Base::num_creg_registers_);
+      }
+    };
+    Utils::apply_omp_parallel_for(
+        (Base::num_groups_ > 1 && Base::shot_omp_parallel_), 0,
+        Base::num_groups_, init_group, Base::num_groups_);
+
+    // apply ops to multiple-shots; capture by reference to avoid copying circ
+    auto apply_ops_lambda = [this, &circ, &init_rng, first_meas, final_ops,
+                             &dummy_noise, &result_it](int_t i) {
+      std::vector<RngEngine> rng(Base::num_states_in_group_[i]);
+      for (int_t j = 0; j < Base::num_states_in_group_[i]; j++) {
+        uint_t iparam =
+            Base::global_state_index_ + Base::top_state_of_group_[i] + j;
+        if (iparam == 0)
+          rng[j] = init_rng;
+        else
+          rng[j].set_seed(circ.seed_for_params[iparam]);
+      }
+      apply_ops_batched_shots_for_group(i, circ.ops.cbegin(),
+                                        circ.ops.cbegin() + first_meas,
+                                        dummy_noise, result_it, rng, final_ops);
+      batched_measure_sampler(circ.ops.cbegin() + first_meas, circ.ops.cend(),
+                              circ.shots, i, result_it, rng);
+    };
+    Utils::apply_omp_parallel_for(
+        (Base::num_groups_ > 1 && Base::shot_omp_parallel_), 0,
+        Base::num_groups_, apply_ops_lambda, Base::num_groups_);
+    Base::global_state_index_ += n_shots;
+    i_begin += n_shots;
+  }
+
+  // gather cregs on MPI processes and save to result
+#ifdef AER_MPI
+  if (Base::num_process_per_experiment_ > 1) {
+    Base::gather_creg_memory(Base::cregs_, Base::state_index_begin_);
+    for (i = 0; i < circ.num_bind_params; i++) {
+      for (int_t j = 0; j < circ.shots; j++) {
+        (result_it + i)
+            ->save_count_data(Base::cregs_[i * circ.shots + j],
+                              Base::save_creg_memory_);
+      }
+    }
+    Base::cregs_.clear();
+  }
+#endif
+
+  // NOTE(review): run_circuit_shots uses AER_THRUST_CUDA here; confirm macro
+#ifdef AER_THRUST_GPU
+  if (Base::sim_device_ == Device::GPU) {
+    int nDev;
+    if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
+      cudaGetLastError();
+      nDev = 0;
+    }
+    if (nDev > Base::num_groups_)
+      nDev = Base::num_groups_;
+    for (i = 0; i < circ.num_bind_params; i++)
+      (result_it + i)
+          ->metadata.add(nDev, "batched_shots_optimization_parallel_gpus");
+  }
+#endif
+}
+
 template <class state_t>
 void BatchShotsExecutor<state_t>::run_circuit_shots(
     Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
-    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
+    RngEngine &init_rng, ResultItr result_it, bool sample_noise) {
   state_t dummy_state;
   // if batched-shot is not applicable, use base multi-shots executor
   if (!enable_batch_multi_shots_) {
-    return Base::run_circuit_shots(circ, noise, config, init_rng, result,
+    return Base::run_circuit_shots(circ, noise, config, init_rng, result_it,
                                    sample_noise);
   }
 
@@ -141,6 +328,8 @@ void BatchShotsExecutor<state_t>::run_circuit_shots(
   Base::num_qubits_ = circ.num_qubits;
   Base::num_creg_memory_ = circ.num_memory;
   Base::num_creg_registers_ = circ.num_registers;
+  Base::num_bind_params_ = circ.num_bind_params;
+  Base::num_shots_per_bind_param_ = circ.shots;
 
   if (Base::sim_device_ == Device::GPU) {
 #ifdef _OPENMP
@@ -151,46 +340,50 @@ void BatchShotsExecutor<state_t>::run_circuit_shots(
     Base::shot_omp_parallel_ = false;
   }
 
-  Base::set_distribution(circ.shots);
+  Base::set_distribution(circ.shots * Base::num_bind_params_);
   Base::num_max_shots_ = Base::get_max_parallel_shots(config, circ, noise);
   if (Base::num_max_shots_ == 0)
     Base::num_max_shots_ = 1;
 
-  RngEngine rng = init_rng;
-
   Circuit circ_opt;
   if (sample_noise)
-    circ_opt =
-        noise.sample_noise(circ, rng, Noise::NoiseModel::Method::circuit, true);
+    circ_opt = noise.sample_noise(circ, init_rng,
+                                  Noise::NoiseModel::Method::circuit, true);
   else
     circ_opt = circ;
   auto fusion_pass = Base::transpile_fusion(circ_opt.opset(), config);
-
+  ExperimentResult fusion_result;
   fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(),
-                               result);
-  Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt);
+                               fusion_result);
+  // convert parameters into matrix in cvector_t format
+  Transpile::BatchConverter batch_converter;
+  batch_converter.set_config(config);
+  batch_converter.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(),
+                                   fusion_result);
 
-  // Add batched multi-shots optimizaiton metadata
-  result.metadata.add(true, "batched_shots_optimization");
+  Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt);
 
   int_t i;
   int_t i_begin, n_shots;
 
+  for (i = 0; i < Base::num_bind_params_; i++) {
+    ExperimentResult &result = *(result_it + i);
+    result.metadata.copy(fusion_result.metadata);
+    // Add batched multi-shots optimizaiton metadata
+    result.metadata.add(true, "batched_shots_optimization");
+  }
+
 #ifdef AER_MPI
   // if shots are distributed to MPI processes, allocate cregs to be gathered
   if (Base::num_process_per_experiment_ > 1)
-    Base::cregs_.resize(circ_opt.shots);
+    Base::cregs_.resize(circ_opt.shots * Base::num_bind_params_);
 #endif
 
   i_begin = 0;
   while (i_begin < Base::num_local_states_) {
-    local_state_index_ = Base::global_state_index_ + i_begin;
-
     // loop for states can be stored in available memory
-    n_shots = std::min(Base::num_local_states_, Base::num_max_shots_);
-    if (i_begin + n_shots > Base::num_local_states_) {
-      n_shots = Base::num_local_states_ - i_begin;
-    }
+    n_shots = Base::num_local_states_ - i_begin;
+    n_shots = std::min(n_shots, (int_t)Base::num_max_shots_);
 
     // allocate shots
     this->allocate_states(n_shots, config);
@@ -198,9 +391,7 @@ void BatchShotsExecutor<state_t>::run_circuit_shots(
     // Set state config
     for (i = 0; i < n_shots; i++) {
       Base::states_[i].set_parallelization(Base::parallel_state_update_);
-      Base::states_[i].set_global_phase(circ.global_phase_angle);
     }
-    this->set_global_phase(circ_opt.global_phase_angle);
 
     // initialization (equivalent to initialize_qreg + initialize_creg)
     auto init_group = [this](int_t ig) {
@@ -220,40 +411,57 @@ void BatchShotsExecutor<state_t>::run_circuit_shots(
     };
     Utils::apply_omp_parallel_for(
         (Base::num_groups_ > 1 && Base::shot_omp_parallel_), 0,
-        Base::num_groups_, init_group);
-
-    this->apply_global_phase(); // this is parallelized in sub-classes
+        Base::num_groups_, init_group, Base::num_groups_);
 
     // apply ops to multiple-shots
-    if (Base::num_groups_ > 1 && Base::shot_omp_parallel_) {
-      std::vector<ExperimentResult> par_results(Base::num_groups_);
-#pragma omp parallel for num_threads(Base::num_groups_)
-      for (i = 0; i < Base::num_groups_; i++)
-        apply_ops_batched_shots_for_group(
-            i, circ_opt.ops.cbegin(), circ_opt.ops.cend(), noise,
-            par_results[i], rng, circ_opt.seed, true);
-
-      for (auto &res : par_results)
-        result.combine(std::move(res));
-    } else {
-      for (i = 0; i < Base::num_groups_; i++)
-        apply_ops_batched_shots_for_group(i, circ_opt.ops.cbegin(),
-                                          circ_opt.ops.cend(), noise, result,
-                                          rng, circ_opt.seed, true);
+    std::vector<std::vector<ExperimentResult>> par_results(Base::num_groups_);
+    auto apply_ops_lambda = [this, circ, circ_opt, &par_results, init_rng,
+                             noise](int_t i) {
+      par_results[i].resize(circ.num_bind_params);
+      std::vector<RngEngine> rng(Base::num_states_in_group_[i]);
+      for (int_t j = 0; j < Base::num_states_in_group_[i]; j++) {
+        uint_t ishot =
+            Base::global_state_index_ + Base::top_state_of_group_[i] + j;
+        uint_t iparam = ishot / Base::num_shots_per_bind_param_;
+        if (ishot == 0)
+          rng[j] = init_rng;
+        else {
+          if (Base::num_bind_params_ > 1)
+            rng[j].set_seed(circ.seed_for_params[iparam] +
+                            (ishot % Base::num_shots_per_bind_param_));
+          else
+            rng[j].set_seed(circ_opt.seed + ishot);
+        }
+      }
+      apply_ops_batched_shots_for_group(i, circ_opt.ops.cbegin(),
+                                        circ_opt.ops.cend(), noise,
+                                        par_results[i].begin(), rng, true);
+    };
+    Utils::apply_omp_parallel_for(
+        (Base::num_groups_ > 1 && Base::shot_omp_parallel_), 0,
+        Base::num_groups_, apply_ops_lambda, Base::num_groups_);
+
+    for (auto &res : par_results) {
+      for (i = 0; i < Base::num_bind_params_; i++) {
+        (result_it + i)->combine(std::move(res[i]));
+      }
     }
 
     // collect measured bits and copy memory
     for (i = 0; i < n_shots; i++) {
       if (Base::num_process_per_experiment_ > 1) {
         Base::states_[i].qreg().read_measured_data(
-            Base::cregs_[local_state_index_ + i]);
+            Base::cregs_[Base::global_state_index_ + i]); // global shot index
       } else {
+        uint_t ishot = Base::global_state_index_ + i;
+        uint_t iparam = ishot / Base::num_shots_per_bind_param_;
         Base::states_[i].qreg().read_measured_data(Base::states_[i].creg());
-        result.save_count_data(Base::states_[i].creg(),
-                               Base::save_creg_memory_);
+        (result_it + iparam)
+            ->save_count_data(Base::states_[i].creg(), Base::save_creg_memory_);
       }
     }
 
+    Base::global_state_index_ += n_shots;
     i_begin += n_shots;
   }
 
@@ -262,13 +470,16 @@ void BatchShotsExecutor<state_t>::run_circuit_shots(
   if (Base::num_process_per_experiment_ > 1) {
     Base::gather_creg_memory(Base::cregs_, Base::state_index_begin_);
 
-    for (i = 0; i < circ_opt.shots; i++)
-      result.save_count_data(Base::cregs_[i], Base::save_creg_memory_);
+    for (i = 0; i < circ_opt.shots * Base::num_bind_params_; i++) {
+      uint_t iparam = i / Base::num_shots_per_bind_param_; // param of shot i
+      (result_it + iparam)
+          ->save_count_data(Base::cregs_[i], Base::save_creg_memory_);
+    }
     Base::cregs_.clear();
   }
 #endif
 
-#ifdef AER_THRUST_GPU
+#ifdef AER_THRUST_CUDA
   if (Base::sim_device_ == Device::GPU) {
     int nDev;
     if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
@@ -277,7 +488,9 @@ void BatchShotsExecutor<state_t>::run_circuit_shots(
     }
     if (nDev > Base::num_groups_)
       nDev = Base::num_groups_;
-    result.metadata.add(nDev, "batched_shots_optimization_parallel_gpus");
+    for (i = 0; i < Base::num_bind_params_; i++)
+      (result_it + i)
+          ->metadata.add(nDev, "batched_shots_optimization_parallel_gpus");
   }
 #endif
 }
@@ -286,25 +499,15 @@ template <class state_t>
 template <typename InputIterator>
 void BatchShotsExecutor<state_t>::apply_ops_batched_shots_for_group(
     int_t i_group, InputIterator first, InputIterator last,
-    const Noise::NoiseModel &noise, ExperimentResult &result,
-    RngEngine &init_rng, uint_t rng_seed, bool final_ops) {
+    const Noise::NoiseModel &noise, ResultItr result_it,
+    std::vector<RngEngine> &rng, bool final_ops) {
   uint_t istate = Base::top_state_of_group_[i_group];
-  std::vector<RngEngine> rng(Base::num_states_in_group_[i_group]);
 #ifdef _OPENMP
   int num_inner_threads = omp_get_max_threads() / omp_get_num_threads();
 #else
   int num_inner_threads = 1;
 #endif
 
-  for (uint_t j = Base::top_state_of_group_[i_group];
-       j < Base::top_state_of_group_[i_group + 1]; j++)
-    if (local_state_index_ + j == 0)
-      rng[j - Base::top_state_of_group_[i_group]] = init_rng;
-    else {
-      rng[j - Base::top_state_of_group_[i_group]].set_seed(
-          rng_seed + local_state_index_ + j);
-    }
-
   for (auto op = first; op != last; ++op) {
     if (op->type == Operations::OpType::sample_noise) {
       // sample error here
@@ -353,24 +556,25 @@ void BatchShotsExecutor<state_t>::apply_ops_batched_shots_for_group(
       if (count_ops == 0) {
         continue; // do nothing
       }
-      if (non_pauli_gate_count == 0) { // ptimization for Pauli error
+      if (non_pauli_gate_count == 0) { // optimization for Pauli error
         Base::states_[istate].qreg().apply_batched_pauli_ops(noise_ops);
       } else {
         // otherwise execute each circuit
-        apply_batched_noise_ops(i_group, noise_ops, result, rng);
+        apply_batched_noise_ops(i_group, noise_ops, result_it, rng);
       }
     } else {
-      if (!apply_batched_op(istate, *op, result, rng,
+      if (!apply_batched_op(istate, *op, result_it, rng,
                             final_ops && (op + 1 == last))) {
         // call apply_op for each state
-        for (uint_t j = Base::top_state_of_group_[i_group];
-             j < Base::top_state_of_group_[i_group + 1]; j++) {
-          Base::states_[j].qreg().enable_batch(false);
-          Base::states_[j].qreg().read_measured_data(Base::states_[j].creg());
-          Base::states_[j].apply_op(*op, result,
-                                    rng[j - Base::top_state_of_group_[i_group]],
-                                    final_ops && (op + 1 == last));
-          Base::states_[j].qreg().enable_batch(true);
+        for (int_t j = 0; j < Base::num_states_in_group_[i_group]; j++) {
+          uint_t is = Base::top_state_of_group_[i_group] + j;
+          uint_t ip = (Base::global_state_index_ + is) /
+                      Base::num_shots_per_bind_param_;
+          Base::states_[is].qreg().enable_batch(false);
+          Base::states_[is].qreg().read_measured_data(Base::states_[is].creg());
+          Base::states_[is].apply_op(*op, *(result_it + ip), rng[j],
+                                     final_ops && (op + 1 == last));
+          Base::states_[is].qreg().enable_batch(true);
         }
       }
     }
@@ -380,7 +584,7 @@ void BatchShotsExecutor<state_t>::apply_ops_batched_shots_for_group(
 template <class state_t>
 void BatchShotsExecutor<state_t>::apply_batched_noise_ops(
     const int_t i_group, const std::vector<std::vector<Operations::Op>> &ops,
-    ExperimentResult &result, std::vector<RngEngine> &rng) {
+    ResultItr result_it, std::vector<RngEngine> &rng) {
   int_t i, j, k, count, nop, pos = 0;
   uint_t istate = Base::top_state_of_group_[i_group];
   count = ops.size();
@@ -443,24 +647,16 @@ void BatchShotsExecutor<state_t>::apply_batched_noise_ops(
       cop.conditional = true;
       cop.conditional_reg = sys_reg;
 
-      if (!apply_batched_op(istate, cop, result, rng, false)) {
+      if (!apply_batched_op(istate, cop, result_it, rng, false)) {
         // call apply_op for each state
-        /*if(cop.conditional){
-          //copy creg to local state
-          reg_t reg_pos(1);
-          reg_t mem_pos;
-          int bit =
-        Base::states_[j].qreg().measured_cregister(cop.conditional_reg);
-          const reg_t reg = Utils::int2reg(bit, 2, 1);
-          reg_pos[0] = cop.conditional_reg;
-          Base::states_[j].creg().store_measure(reg, mem_pos, reg_pos);
-        }*/
-        for (uint_t j = Base::top_state_of_group_[i_group];
-             j < Base::top_state_of_group_[i_group + 1]; j++) {
-          Base::states_[j].qreg().enable_batch(false);
-          Base::states_[j].apply_op(
-              cop, result, rng[j - Base::top_state_of_group_[i_group]], false);
-          Base::states_[j].qreg().enable_batch(true);
+        for (int_t j = 0; j < Base::num_states_in_group_[i_group]; j++) {
+          uint_t is = Base::top_state_of_group_[i_group] + j;
+          uint_t ip = (Base::global_state_index_ + is) /
+                      Base::num_shots_per_bind_param_;
+          Base::states_[is].qreg().enable_batch(false);
+          Base::states_[is].qreg().read_measured_data(Base::states_[is].creg());
+          Base::states_[is].apply_op(cop, *(result_it + ip), rng[j], false);
+          Base::states_[is].qreg().enable_batch(true);
         }
       }
     }
@@ -469,6 +665,203 @@ void BatchShotsExecutor<state_t>::apply_batched_noise_ops(
   }
 }
 
+// Batched Pauli expval: evaluate all terms of `op` on the batch that starts at
+// `istate` and save one value (or mean/variance pair) per returned state.
+template <class state_t>
+void BatchShotsExecutor<state_t>::apply_batched_expval(const int_t istate,
+                                                       const Operations::Op &op,
+                                                       ResultItr result) {
+  bool with_variance = (op.type == Operations::OpType::save_expval_var);
+  const auto num_terms = op.expval_params.size();
+  std::vector<double> values;
+  for (int_t term = 0; term < num_terms; term++) {
+    const auto &param = op.expval_params[term];
+    std::complex<double> coeff; // re = get<1>; im = get<2> only for variance
+    if (with_variance)
+      coeff = std::complex<double>(std::get<1>(param), std::get<2>(param));
+    else
+      coeff = std::get<1>(param);
+    bool is_last = (term == num_terms - 1);
+    Base::states_[istate].qreg().batched_expval_pauli(
+        values, op.qubits, std::get<0>(param), with_variance, coeff, is_last);
+  }
+  if (values.empty())
+    return;
+  // result slot (bound-parameter index) of the k-th state of this batch
+  auto param_index = [this, istate](int_t k) -> uint_t {
+    return (Base::global_state_index_ + istate + k) /
+           Base::num_shots_per_bind_param_;
+  };
+  if (with_variance) {
+    // per state: values[2k] is the mean; values[2k + 1] - mean^2 the variance
+    const auto num_states = values.size() / 2;
+    for (int_t k = 0; k < num_states; k++) {
+      double mean = values[k * 2];
+      std::vector<double> expval_var(2);
+      expval_var[0] = mean;
+      expval_var[1] = values[k * 2 + 1] - mean * mean;
+      (result + param_index(k))
+          ->save_data_average(Base::states_[istate + k].creg(),
+                              op.string_params[0], expval_var, op.type,
+                              op.save_type);
+    }
+    return;
+  }
+  for (int_t k = 0; k < values.size(); k++)
+    (result + param_index(k))
+        ->save_data_average(Base::states_[istate + k].creg(),
+                            op.string_params[0], values[k], op.type,
+                            op.save_type);
+}
+
+// Sample `shots` measurement outcomes for every state of group `i_group` with
+// a single sample_measure call, build one creg per shot (measure + roerror
+// ops) and save it to result[global state index], or store it in
+// Base::cregs_ when the experiment is distributed over several processes.
+template <class state_t>
+template <typename InputIterator>
+void BatchShotsExecutor<state_t>::batched_measure_sampler(
+    InputIterator first_meas, InputIterator last_meas, uint_t shots,
+    uint_t i_group, ResultItr result, std::vector<RngEngine> &rng) {
+  uint_t par_states = 1;
+  if (Base::max_parallel_threads_ >= Base::num_groups_ * 2) {
+    par_states =
+        std::min((uint_t)(Base::max_parallel_threads_ / Base::num_groups_),
+                 Base::num_states_in_group_[i_group]);
+  }
+
+  // Nothing to sample when the range holds no measure/roerror ops
+  if (first_meas == last_meas) {
+    return;
+  }
+
+  std::vector<Operations::Op> meas_ops;
+  std::vector<Operations::Op> roerror_ops;
+  for (auto op = first_meas; op != last_meas; op++) {
+    if (op->type == Operations::OpType::roerror) {
+      roerror_ops.push_back(*op);
+    } else { /*(op.type == Operations::OpType::measure) */
+      meas_ops.push_back(*op);
+    }
+  }
+
+  // Get measured qubits from circuit sort and delete duplicates
+  std::vector<uint_t> meas_qubits; // measured qubits
+  for (const auto &op : meas_ops) {
+    for (size_t j = 0; j < op.qubits.size(); ++j)
+      meas_qubits.push_back(op.qubits[j]);
+  }
+  std::sort(meas_qubits.begin(), meas_qubits.end());
+  meas_qubits.erase(std::unique(meas_qubits.begin(), meas_qubits.end()),
+                    meas_qubits.end());
+
+  // Make qubit map of position in vector of measured qubits
+  std::unordered_map<uint_t, uint_t> qubit_map;
+  for (uint_t j = 0; j < meas_qubits.size(); ++j) {
+    qubit_map[meas_qubits[j]] = j;
+  }
+
+  // Maps of memory and register to qubit position
+  std::map<uint_t, uint_t> memory_map;
+  std::map<uint_t, uint_t> register_map;
+  for (const auto &op : meas_ops) {
+    for (size_t j = 0; j < op.qubits.size(); ++j) {
+      auto pos = qubit_map[op.qubits[j]];
+      if (!op.memory.empty())
+        memory_map[op.memory[j]] = pos;
+      if (!op.registers.empty())
+        register_map[op.registers[j]] = pos;
+    }
+  }
+
+  // Generate the samples
+  auto timer_start = myclock_t::now();
+  std::vector<double> rnd_shots(Base::num_states_in_group_[i_group] * shots);
+
+  auto make_random_proc = [this, shots, &rnd_shots, par_states, i_group,
+                           &rng](int_t i) {
+    uint_t i_state, state_end;
+    i_state = Base::num_states_in_group_[i_group] * i / par_states;
+    state_end = Base::num_states_in_group_[i_group] * (i + 1) / par_states;
+    // NOTE(review): the + i_state offset presumably selects the batched state
+    for (; i_state < state_end; i_state++) {
+      for (int_t j = 0; j < shots; j++)
+        rnd_shots[i_state * shots + j] =
+            rng[i_state].rand(0, 1) + (double)i_state;
+    }
+  };
+  Utils::apply_omp_parallel_for((par_states > 1), 0, par_states,
+                                make_random_proc, par_states);
+
+  reg_t allbit_samples =
+      Base::states_[Base::top_state_of_group_[i_group]].qreg().sample_measure(
+          rnd_shots);
+
+  uint_t mask = (1ull << Base::num_qubits_) - 1;
+
+  // Process samples
+  uint_t num_memory =
+      (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first;
+  uint_t num_registers =
+      (register_map.empty()) ? 0ULL : 1 + register_map.rbegin()->first;
+  // maps and op lists are captured by reference to avoid copying them
+  auto save_counts_proc = [this, shots, par_states, i_group, num_memory,
+                           num_registers, mask, &result, &allbit_samples,
+                           &memory_map, &register_map, &rng, &meas_qubits,
+                           &roerror_ops](int_t j) {
+    uint_t i_state, state_end;
+    i_state = Base::num_states_in_group_[i_group] * j / par_states;
+    state_end = Base::num_states_in_group_[i_group] * (j + 1) / par_states;
+    for (; i_state < state_end; i_state++) {
+      uint_t is = Base::top_state_of_group_[i_group] + i_state;
+      uint_t ip = (Base::global_state_index_ + is);
+      for (int_t i = 0; i < shots; i++) {
+        ClassicalRegister creg;
+        creg.initialize(num_memory, num_registers);
+        reg_t all_samples(meas_qubits.size());
+        uint_t val = allbit_samples[i_state * shots + i] & mask;
+        reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_);
+        for (int_t mq = 0; mq < meas_qubits.size(); mq++) {
+          all_samples[mq] = allbit_sample[meas_qubits[mq]];
+        }
+        // process memory bit measurements
+        for (const auto &pair : memory_map) {
+          creg.store_measure(reg_t({all_samples[pair.second]}),
+                             reg_t({pair.first}), reg_t());
+        }
+        // process register bit measurements
+        for (const auto &pair : register_map) {
+          creg.store_measure(reg_t({all_samples[pair.second]}), reg_t(),
+                             reg_t({pair.first}));
+        }
+
+        // process read out errors for memory and registers
+        for (const Operations::Op &roerror : roerror_ops)
+          creg.apply_roerror(roerror, rng[i_state]);
+
+        // Save count data
+        if (Base::num_process_per_experiment_ > 1)
+          Base::cregs_[ip * shots + i] = creg;
+        else
+          (result + ip)->save_count_data(creg, Base::save_creg_memory_);
+      }
+    }
+  };
+  Utils::apply_omp_parallel_for((par_states > 1), 0, par_states,
+                                save_counts_proc, par_states);
+
+  auto time_taken =
+      std::chrono::duration<double>(myclock_t::now() - timer_start).count();
+
+  for (int_t i_state = 0; i_state < Base::num_states_in_group_[i_group];
+       i_state++) {
+    uint_t ip = Base::global_state_index_ + Base::top_state_of_group_[i_group] +
+                i_state;
+    (result + ip)->metadata.add(time_taken, "sample_measure_time");
+    (result + ip)->metadata.add(true, "measure_sampling");
+  }
+}
+
 //-------------------------------------------------------------------------
 } // end namespace CircuitExecutor
 //-------------------------------------------------------------------------
diff --git a/src/simulators/circuit_executor.hpp b/src/simulators/circuit_executor.hpp
index 73a2d502e8..b5ea6193a0 100644
--- a/src/simulators/circuit_executor.hpp
+++ b/src/simulators/circuit_executor.hpp
@@ -35,6 +35,7 @@ namespace AER {
 namespace CircuitExecutor {
 
 using OpItr = std::vector<Operations::Op>::const_iterator;
+using ResultItr = std::vector<ExperimentResult>::iterator;
 
 // Timer type
 using myclock_t = std::chrono::high_resolution_clock;
@@ -50,7 +51,7 @@ class Base {
 
   virtual void run_circuit(Circuit &circ, const Noise::NoiseModel &noise,
                            const Config &config, const Method method,
-                           const Device device, ExperimentResult &result) = 0;
+                           const Device device, ResultItr result) = 0;
 
   // Return an estimate of the required memory for a circuit.
   virtual size_t required_memory_mb(const Config &config,
@@ -88,8 +89,9 @@ class Executor : public Base {
   int max_parallel_shots_;
   size_t max_memory_mb_;
   size_t max_gpu_memory_mb_;
-  int num_gpus_;      // max number of GPU per process
-  reg_t target_gpus_; // GPUs to be used
+  size_t min_gpu_memory_mb_; // minimum size per GPU
+  int num_gpus_;             // max number of GPU per process
+  reg_t target_gpus_;        // GPUs to be used
 
   // use explicit parallelization
   bool explicit_parallelization_;
@@ -124,13 +126,17 @@ class Executor : public Base {
   // if circuit has statevector operations or not
   bool has_statevector_ops_;
 
+  // runtime parameter binding
+  uint_t num_bind_params_ = 1;
+  uint_t num_shots_per_bind_param_ = 1;
+
 public:
   Executor();
   virtual ~Executor() {}
 
   void run_circuit(Circuit &circ, const Noise::NoiseModel &noise,
                    const Config &config, const Method method,
-                   const Device device, ExperimentResult &result) override;
+                   const Device device, ResultItr result) override;
 
   // Return an estimate of the required memory for a circuit.
   size_t required_memory_mb(const Config &config, const Circuit &circuit,
@@ -186,17 +192,21 @@ class Executor : public Base {
                                    const Noise::NoiseModel &noise);
 
   virtual void run_circuit_with_sampling(Circuit &circ, const Config &config,
-                                         RngEngine &init_rng,
-                                         ExperimentResult &result);
+                                         RngEngine &init_rng, ResultItr result);
 
   virtual void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
                                  const Config &config, RngEngine &init_rng,
-                                 ExperimentResult &result, bool sample_noise);
+                                 ResultItr result, bool sample_noise);
+
+  void run_circuit_with_parameter_binding(state_t &state, OpItr first,
+                                          OpItr last, ExperimentResult &result,
+                                          RngEngine &rng, const uint_t iparam,
+                                          bool final_op);
 
   template <typename InputIterator>
   void measure_sampler(InputIterator first_meas, InputIterator last_meas,
                        uint_t shots, state_t &state, ExperimentResult &result,
-                       RngEngine &rng) const;
+                       RngEngine &rng, bool save_creg_to_state = false) const;
 
 #ifdef AER_MPI
   void gather_creg_memory(std::vector<ClassicalRegister> &cregs,
@@ -296,7 +306,6 @@ void Executor<state_t>::set_config(const Config &config) {
   } else if (precision == "single") {
     sim_precision_ = Precision::Single;
   }
-
   // set target GPUs
 #ifdef AER_THRUST_GPU
   int nDev = 0;
@@ -341,8 +350,14 @@ size_t Executor<state_t>::get_gpu_memory_mb() {
     size_t freeMem, totalMem;
     cudaSetDevice(target_gpus_[iDev]);
     cudaMemGetInfo(&freeMem, &totalMem);
+    if (iDev == 0)
+      min_gpu_memory_mb_ = totalMem;
+    else if (totalMem < min_gpu_memory_mb_)
+      min_gpu_memory_mb_ = totalMem;
     total_physical_memory += totalMem;
   }
+
+  min_gpu_memory_mb_ >>= 20;
 #endif
 
 #ifdef AER_MPI
@@ -352,8 +367,6 @@ size_t Executor<state_t>::get_gpu_memory_mb() {
   MPI_Allreduce(&locMem, &minMem, 1, MPI_UINT64_T, MPI_MIN, distributed_comm_);
   total_physical_memory = minMem;
 
-  int t = num_gpus_;
-  MPI_Allreduce(&t, &num_gpus_, 1, MPI_INT, MPI_MAX, distributed_comm_);
 #endif
 
   return total_physical_memory >> 20;
@@ -384,12 +397,13 @@ uint_t Executor<state_t>::get_max_parallel_shots(
     const Noise::NoiseModel &noise) const {
   uint_t mem = required_memory_mb(config, circ, noise);
   if (mem == 0)
-    return circ.shots;
+    return circ.shots * circ.num_bind_params;
 
   if (sim_device_ == Device::GPU && num_gpus_ > 0) {
-    return std::min(circ.shots, (max_gpu_memory_mb_ * 8 / 10 / mem));
+    return std::min(circ.shots * circ.num_bind_params,
+                    (max_gpu_memory_mb_ * 8 / 10 / mem));
   } else {
-    return std::min(circ.shots, (max_memory_mb_ / mem));
+    return std::min(circ.shots * circ.num_bind_params, (max_memory_mb_ / mem));
   }
 }
 
@@ -452,7 +466,8 @@ void Executor<state_t>::set_parallelization(const Config &config,
   case Method::unitary:
   case Method::matrix_product_state: {
     if (circ.shots == 1 || num_process_per_experiment_ > 1 ||
-        (!noise.has_quantum_errors() && check_measure_sampling_opt(circ))) {
+        (!noise.has_quantum_errors() && check_measure_sampling_opt(circ) &&
+         circ.num_bind_params == 1)) {
       parallel_shots_ = 1;
       parallel_state_update_ =
           std::max<int>({1, max_parallel_threads_ / parallel_experiments_});
@@ -518,8 +533,7 @@ template <class state_t>
 void Executor<state_t>::run_circuit(Circuit &circ,
                                     const Noise::NoiseModel &noise,
                                     const Config &config, const Method method,
-                                    const Device device,
-                                    ExperimentResult &result) {
+                                    const Device device, ResultItr result_it) {
   // Start individual circuit timer
   auto timer_start = myclock_t::now(); // state circuit timer
 
@@ -538,29 +552,32 @@ void Executor<state_t>::run_circuit(Circuit &circ,
     rng.set_seed(circ.seed);
 
     // Output data container
-    result.set_config(config);
-    result.metadata.add(method_names_.at(method), "method");
-    if (sim_device_ == Device::GPU)
-      result.metadata.add("GPU", "device");
-    else if (sim_device_ == Device::ThrustCPU)
-      result.metadata.add("Thrust", "device");
-    else
-      result.metadata.add("CPU", "device");
-
-    // Circuit qubit metadata
-    result.metadata.add(circ.num_qubits, "num_qubits");
-    result.metadata.add(circ.num_memory, "num_clbits");
-    result.metadata.add(circ.qubits(), "active_input_qubits");
-    result.metadata.add(circ.qubit_map(), "input_qubit_map");
-    result.metadata.add(circ.remapped_qubits, "remapped_qubits");
-    result.metadata.add(max_memory_mb_, "max_memory_mb");
-    if (sim_device_ == Device::GPU)
-      result.metadata.add(max_gpu_memory_mb_, "max_gpu_memory_mb");
-
-    // Add measure sampling to metadata
-    // Note: this will set to `true` if sampling is enabled for the circuit
-    result.metadata.add(false, "measure_sampling");
-    result.metadata.add(false, "batched_shots_optimization");
+    for (int_t i = 0; i < circ.num_bind_params; i++) {
+      ExperimentResult &result = *(result_it + i);
+      result.set_config(config);
+      result.metadata.add(method_names_.at(method), "method");
+      if (sim_device_ == Device::GPU)
+        result.metadata.add("GPU", "device");
+      else if (sim_device_ == Device::ThrustCPU)
+        result.metadata.add("Thrust", "device");
+      else
+        result.metadata.add("CPU", "device");
+
+      // Circuit qubit metadata
+      result.metadata.add(circ.num_qubits, "num_qubits");
+      result.metadata.add(circ.num_memory, "num_clbits");
+      result.metadata.add(circ.qubits(), "active_input_qubits");
+      result.metadata.add(circ.qubit_map(), "input_qubit_map");
+      result.metadata.add(circ.remapped_qubits, "remapped_qubits");
+      result.metadata.add(max_memory_mb_, "max_memory_mb");
+      if (sim_device_ == Device::GPU)
+        result.metadata.add(max_gpu_memory_mb_, "max_gpu_memory_mb");
+
+      // Add measure sampling to metadata
+      // Note: this will set to `true` if sampling is enabled for the circuit
+      result.metadata.add(false, "measure_sampling");
+      result.metadata.add(false, "batched_shots_optimization");
+    }
 
     // Validate gateset and memory requirements, raise exception if they're
     // exceeded
@@ -576,12 +593,18 @@ void Executor<state_t>::run_circuit(Circuit &circ,
       // Ideal circuit
       if (noise.is_ideal()) {
         opt_circ = circ;
-        result.metadata.add("ideal", "noise");
+        for (int_t i = 0; i < circ.num_bind_params; i++) {
+          ExperimentResult &result = *(result_it + i);
+          result.metadata.add("ideal", "noise");
+        }
       }
       // Readout error only
       else if (noise.has_quantum_errors() == false) {
         opt_circ = noise.sample_noise(circ, rng);
-        result.metadata.add("readout", "noise");
+        for (int_t i = 0; i < circ.num_bind_params; i++) {
+          ExperimentResult &result = *(result_it + i);
+          result.metadata.add("readout", "noise");
+        }
       }
       // Superop noise sampling
       else if (method == Method::density_matrix || method == Method::superop ||
@@ -589,60 +612,91 @@ void Executor<state_t>::run_circuit(Circuit &circ,
         // Sample noise using SuperOp method
         opt_circ =
             noise.sample_noise(circ, rng, Noise::NoiseModel::Method::superop);
-        result.metadata.add("superop", "noise");
+        for (int_t i = 0; i < circ.num_bind_params; i++) {
+          ExperimentResult &result = *(result_it + i);
+          result.metadata.add("superop", "noise");
+        }
       }
       // Kraus noise sampling
       else if (noise.opset().contains(Operations::OpType::kraus) ||
                noise.opset().contains(Operations::OpType::superop)) {
         opt_circ =
             noise.sample_noise(circ, rng, Noise::NoiseModel::Method::kraus);
-        result.metadata.add("kraus", "noise");
+        for (int_t i = 0; i < circ.num_bind_params; i++) {
+          ExperimentResult &result = *(result_it + i);
+          result.metadata.add("kraus", "noise");
+        }
       }
       // General circuit noise sampling
       else {
         noise_sampling = true;
-        result.metadata.add("circuit", "noise");
+        for (int_t i = 0; i < circ.num_bind_params; i++) {
+          ExperimentResult &result = *(result_it + i);
+          result.metadata.add("circuit", "noise");
+        }
       }
 
       if (noise_sampling) {
-        run_circuit_shots(circ, noise, config, rng, result, true);
+        run_circuit_shots(circ, noise, config, rng, result_it, true);
       } else {
         // Run multishot simulation without noise sampling
         bool can_sample = opt_circ.can_sample;
         can_sample &= check_measure_sampling_opt(opt_circ);
 
         if (can_sample)
-          run_circuit_with_sampling(opt_circ, config, rng, result);
+          run_circuit_with_sampling(opt_circ, config, rng, result_it);
         else
-          run_circuit_shots(opt_circ, noise, config, rng, result, false);
+          run_circuit_shots(opt_circ, noise, config, rng, result_it, false);
       }
     }
-    // Report success
-    result.status = ExperimentResult::Status::completed;
-
-    // Pass through circuit header and add metadata
-    result.header = circ.header;
-    result.shots = circ.shots;
-    result.seed = circ.seed;
-    result.metadata.add(parallel_shots_, "parallel_shots");
-    result.metadata.add(parallel_state_update_, "parallel_state_update");
+    for (int_t i = 0; i < circ.num_bind_params; i++) {
+      ExperimentResult &result = *(result_it + i);
+      // Report success
+      result.status = ExperimentResult::Status::completed;
+
+      // Pass through circuit header and add metadata
+      result.header = circ.header;
+      result.shots = circ.shots;
+      if (circ.num_bind_params > 1)
+        result.seed = circ.seed_for_params[i];
+      else
+        result.seed = circ.seed;
+      result.metadata.add(parallel_shots_, "parallel_shots");
+      result.metadata.add(parallel_state_update_, "parallel_state_update");
+      if (circ.num_bind_params > 1) {
+        result.metadata.add(true, "runtime_parameter_bind");
+        result.metadata.add(circ.num_bind_params, "num_bind_params");
+        result.metadata.add(i, "bind_param_index");
+      } else {
+        result.metadata.add(false, "runtime_parameter_bind");
+        result.metadata.add(1, "num_bind_params");
+      }
+      if (sim_device_ == Device::GPU) {
 #ifdef AER_CUSTATEVEC
-    if (sim_device_ == Device::GPU)
-      result.metadata.add(cuStateVec_enable_, "cuStateVec_enable");
+        result.metadata.add(cuStateVec_enable_, "cuStateVec_enable");
 #endif
-    if (sim_device_ == Device::GPU)
-      result.metadata.add(target_gpus_, "target_gpus");
+        result.metadata.add(target_gpus_, "target_gpus");
+      }
+    }
 
     // Add timer data
     auto timer_stop = myclock_t::now(); // stop timer
     double time_taken =
         std::chrono::duration<double>(timer_stop - timer_start).count();
-    result.time_taken = time_taken;
+    for (int_t i = 0; i < circ.num_bind_params; i++) {
+      ExperimentResult &result = *(result_it + i);
+      result.time_taken = time_taken;
+      // also store the time in metadata so primitive results can read it
+      result.metadata.add(time_taken, "time_taken");
+    }
   }
   // If an exception occurs during execution, catch it and pass it to the output
   catch (std::exception &e) {
-    result.status = ExperimentResult::Status::error;
-    result.message = e.what();
+    for (int_t i = 0; i < circ.num_bind_params; i++) {
+      ExperimentResult &result = *(result_it + i);
+      result.status = ExperimentResult::Status::error;
+      result.message = e.what();
+    }
   }
 }
 
@@ -650,150 +704,230 @@ template <class state_t>
 void Executor<state_t>::run_circuit_with_sampling(Circuit &circ,
                                                   const Config &config,
                                                   RngEngine &init_rng,
-                                                  ExperimentResult &result) {
-  state_t state;
-
+                                                  ResultItr result_it) {
   // Optimize circuit
   Noise::NoiseModel dummy_noise;
+  state_t dummy_state;
 
   auto fusion_pass = transpile_fusion(circ.opset(), config);
-  fusion_pass.optimize_circuit(circ, dummy_noise, state.opset(), result);
-
+  ExperimentResult fusion_result;
+  fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
+                               fusion_result);
   auto max_bits = get_max_matrix_qubits(circ);
 
-  // Set state config
-  state.set_config(config);
-  state.set_parallelization(parallel_state_update_);
-  state.set_global_phase(circ.global_phase_angle);
+  auto first_meas = circ.first_measure_pos; // Position of first measurement op
+  bool final_ops = (first_meas == circ.ops.size());
 
-  state.set_distribution(1);
-  state.set_max_matrix_qubits(max_bits);
+  auto circ_shots = circ.shots;
+  circ.shots = 1;
+  int_t par_shots = (int_t)get_max_parallel_shots(config, circ, dummy_noise);
+  par_shots = std::min((int_t)parallel_shots_, par_shots);
+  circ.shots = circ_shots;
 
-  RngEngine rng = init_rng;
+  num_bind_params_ = circ.num_bind_params;
 
-  auto first_meas = circ.first_measure_pos; // Position of first measurement op
-  bool final_ops = (first_meas == circ.ops.size());
+  auto run_circuit_lambda = [this, circ, &result_it, &fusion_result, config,
+                             init_rng, max_bits, first_meas, final_ops,
+                             par_shots](int_t i) {
+    uint_t iparam, param_end;
+    iparam = circ.num_bind_params * i / par_shots;
+    param_end = circ.num_bind_params * (i + 1) / par_shots;
+
+    for (; iparam < param_end; iparam++) {
+      ExperimentResult &result = *(result_it + iparam);
+      result.metadata.copy(fusion_result.metadata);
+      RngEngine rng;
+      if (iparam == 0)
+        rng = init_rng;
+      else
+        rng.set_seed(circ.seed_for_params[iparam]);
+
+      // Set state config
+      state_t state;
+      state.set_config(config);
+      state.set_parallelization(parallel_state_update_);
+
+      state.set_distribution(1);
+      state.set_max_matrix_qubits(max_bits);
+      if (circ.ops.begin() + first_meas != circ.ops.end()) {
+        state.set_max_sampling_shots(circ.shots);
+      }
+
+      if (circ.global_phase_for_params.size() == circ.num_bind_params)
+        state.set_global_phase(circ.global_phase_for_params[iparam]);
+      else
+        state.set_global_phase(circ.global_phase_angle);
 
-  // allocate qubit register
+        // allocate qubit register
 #ifdef AER_CUSTATEVEC
-  state.enable_cuStateVec(cuStateVec_enable_);
+      state.enable_cuStateVec(cuStateVec_enable_);
 #endif
-  state.allocate(circ.num_qubits, circ.num_qubits);
-  state.set_num_global_qubits(circ.num_qubits);
-  state.enable_density_matrix(!has_statevector_ops_);
+      state.allocate(circ.num_qubits, circ.num_qubits);
+      state.set_num_global_qubits(circ.num_qubits);
+      state.enable_density_matrix(!has_statevector_ops_);
 
-  // Run circuit instructions before first measure
-  state.initialize_qreg(circ.num_qubits);
-  state.initialize_creg(circ.num_memory, circ.num_registers);
-
-  state.apply_ops(circ.ops.cbegin(), circ.ops.cbegin() + first_meas, result,
-                  rng, final_ops);
+      // Run circuit instructions before first measure
+      state.initialize_qreg(circ.num_qubits);
+      state.initialize_creg(circ.num_memory, circ.num_registers);
 
-  // Get measurement operations and set of measured qubits
-  measure_sampler(circ.ops.begin() + first_meas, circ.ops.end(), circ.shots,
-                  state, result, rng);
+      if (circ.num_bind_params > 1) {
+        run_circuit_with_parameter_binding(state, circ.ops.cbegin(),
+                                           circ.ops.cbegin() + first_meas,
+                                           result, rng, iparam, final_ops);
+      } else {
+        state.apply_ops(circ.ops.cbegin(), circ.ops.cbegin() + first_meas,
+                        result, rng, final_ops);
+      }
 
-  // Add measure sampling metadata
-  result.metadata.add(true, "measure_sampling");
+      // Get measurement operations and set of measured qubits
+      measure_sampler(circ.ops.begin() + first_meas, circ.ops.end(), circ.shots,
+                      state, result, rng);
+      // Add measure sampling metadata
+      result.metadata.add(true, "measure_sampling");
 
-  state.add_metadata(result);
+      state.add_metadata(result);
+    }
+  };
+  Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots,
+                                run_circuit_lambda, par_shots);
 }
 
 template <class state_t>
 void Executor<state_t>::run_circuit_shots(
     Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
-    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
+    RngEngine &init_rng, ResultItr result_it, bool sample_noise) {
 
   // insert runtime noise sample ops here
   int_t par_shots = (int_t)get_max_parallel_shots(config, circ, noise);
   par_shots = std::min((int_t)parallel_shots_, par_shots);
-  std::vector<ExperimentResult> par_results(par_shots);
 
-  uint_t num_shots = circ.shots;
-  uint_t seed_begin = circ.seed;
+  uint_t num_shots = circ.shots * circ.num_bind_params;
 
   // MPI distribution settings
   std::vector<ClassicalRegister> cregs;
   reg_t shot_begin(distributed_procs_);
   reg_t shot_end(distributed_procs_);
   for (int_t i = 0; i < distributed_procs_; i++) {
-    shot_begin[i] = circ.shots * i / distributed_procs_;
-    shot_end[i] = circ.shots * (i + 1) / distributed_procs_;
+    shot_begin[i] = num_shots * i / distributed_procs_;
+    shot_end[i] = num_shots * (i + 1) / distributed_procs_;
   }
-  num_shots = shot_end[distributed_rank_] - shot_begin[distributed_rank_];
-  seed_begin += shot_begin[distributed_rank_];
-  cregs.resize(circ.shots);
+  uint_t num_local_shots =
+      shot_end[distributed_rank_] - shot_begin[distributed_rank_];
 
   int max_matrix_qubits;
   auto fusion_pass = transpile_fusion(circ.opset(), config);
   if (!sample_noise) {
     Noise::NoiseModel dummy_noise;
     state_t dummy_state;
+    auto fusion_pass = transpile_fusion(circ.opset(), config);
+    ExperimentResult fusion_result;
     fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
-                                 result);
+                                 fusion_result);
+    for (int_t i = 0; i < circ.num_bind_params; i++) {
+      ExperimentResult &result = *(result_it + i);
+      result.metadata.copy(fusion_result.metadata);
+    }
     max_matrix_qubits = get_max_matrix_qubits(circ);
   } else {
     max_matrix_qubits = get_max_matrix_qubits(circ);
     max_matrix_qubits = std::max(max_matrix_qubits, (int)fusion_pass.max_qubit);
   }
+  num_bind_params_ = circ.num_bind_params;
+
+  std::vector<std::vector<ExperimentResult>> par_results(par_shots);
+  for (int_t i = 0; i < par_shots; i++) {
+    par_results[i].resize(num_bind_params_);
+  }
+
+  if (distributed_procs_ > 1)
+    cregs.resize(num_shots);
 
   // run each shot
   auto run_circuit_lambda = [this, &par_results, circ, noise, config, par_shots,
-                             sample_noise, num_shots, seed_begin, shot_begin,
-                             &cregs, init_rng, max_matrix_qubits,
-                             fusion_pass](int_t i) {
+                             sample_noise, num_shots, shot_begin, &cregs,
+                             init_rng, max_matrix_qubits,
+                             num_local_shots](int_t i) {
     state_t state;
     uint_t i_shot, shot_end;
-    i_shot = num_shots * i / par_shots;
-    shot_end = num_shots * (i + 1) / par_shots;
+    i_shot = num_local_shots * i / par_shots;
+    shot_end = num_local_shots * (i + 1) / par_shots;
+
+    auto fusion_pass = transpile_fusion(circ.opset(), config);
 
     // Set state config
     state.set_config(config);
     state.set_parallelization(this->parallel_state_update_);
-    state.set_global_phase(circ.global_phase_angle);
     state.enable_density_matrix(!has_statevector_ops_);
 
     state.set_distribution(this->num_process_per_experiment_);
     state.set_num_global_qubits(circ.num_qubits);
-    state.set_max_matrix_qubits(max_matrix_qubits);
-#ifdef AER_CUSTATEVEC
-    state.enable_cuStateVec(cuStateVec_enable_);
-#endif
-    state.allocate(circ.num_qubits, circ.num_qubits);
 
     for (; i_shot < shot_end; i_shot++) {
       RngEngine rng;
-      if (i_shot == 0)
+      uint_t shot_index = shot_begin[distributed_rank_] + i_shot;
+      uint_t iparam = shot_index / circ.shots;
+      if (shot_index == 0 && iparam == 0)
         rng = init_rng;
-      else
-        rng.set_seed(seed_begin + i_shot);
-
-      state.initialize_qreg(circ.num_qubits);
-      state.initialize_creg(circ.num_memory, circ.num_registers);
+      else {
+        if (circ.num_bind_params > 1) {
+          uint_t lid = shot_index % circ.shots;
+          rng.set_seed(circ.seed_for_params[iparam] + lid);
+        } else
+          rng.set_seed(circ.seed + shot_index);
+      }
+      ExperimentResult &result = par_results[i][iparam];
 
+      Circuit circ_opt;
       if (sample_noise) {
-        Circuit circ_opt;
         Noise::NoiseModel dummy_noise;
         circ_opt = noise.sample_noise(circ, rng);
         fusion_pass.optimize_circuit(circ_opt, dummy_noise, state.opset(),
-                                     par_results[i]);
-        state.apply_ops(circ_opt.ops.cbegin(), circ_opt.ops.cend(),
-                        par_results[i], rng, true);
+                                     result);
+        state.set_max_matrix_qubits(get_max_matrix_qubits(circ_opt));
+      } else
+        state.set_max_matrix_qubits(max_matrix_qubits);
+
+      if (circ.global_phase_for_params.size() == circ.num_bind_params)
+        state.set_global_phase(circ.global_phase_for_params[iparam]);
+      else
+        state.set_global_phase(circ.global_phase_angle);
+#ifdef AER_CUSTATEVEC
+      state.enable_cuStateVec(cuStateVec_enable_);
+#endif
+      state.allocate(circ.num_qubits, circ.num_qubits);
+      state.initialize_qreg(circ.num_qubits);
+      state.initialize_creg(circ.num_memory, circ.num_registers);
+
+      if (sample_noise) {
+        if (circ.num_bind_params > 1) {
+          run_circuit_with_parameter_binding(state, circ_opt.ops.cbegin(),
+                                             circ_opt.ops.cend(), result, rng,
+                                             iparam, true);
+        } else {
+          state.apply_ops(circ_opt.ops.cbegin(), circ_opt.ops.cend(), result,
+                          rng, true);
+        }
       } else {
-        state.apply_ops(circ.ops.cbegin(), circ.ops.cend(), par_results[i], rng,
-                        true);
+        if (circ.num_bind_params > 1) {
+          run_circuit_with_parameter_binding(state, circ.ops.cbegin(),
+                                             circ.ops.cend(), result, rng,
+                                             iparam, true);
+        } else {
+          state.apply_ops(circ.ops.cbegin(), circ.ops.cend(), result, rng,
+                          true);
+        }
       }
       if (distributed_procs_ > 1) {
         // save creg to be gathered
-        cregs[shot_begin[distributed_rank_] + i_shot] = state.creg();
+        cregs[shot_index] = state.creg();
       } else {
-        par_results[i].save_count_data(state.creg(), save_creg_memory_);
+        result.save_count_data(state.creg(), save_creg_memory_);
       }
+      state.add_metadata(result);
     }
-    state.add_metadata(par_results[i]);
   };
   Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots,
-                                run_circuit_lambda);
+                                run_circuit_lambda, par_shots);
 
   // gather cregs on MPI processes and save to result
 #ifdef AER_MPI
@@ -801,15 +935,16 @@ void Executor<state_t>::run_circuit_shots(
     gather_creg_memory(cregs, shot_begin);
 
     // save cregs to result
-    num_shots = circ.shots;
-    auto save_cregs = [this, &par_results, par_shots, num_shots,
+    num_shots = circ.shots * circ.num_bind_params;
+    auto save_cregs = [this, &par_results, par_shots, num_shots, circ,
                        cregs](int_t i) {
       uint_t i_shot, shot_end;
       i_shot = num_shots * i / par_shots;
       shot_end = num_shots * (i + 1) / par_shots;
 
       for (; i_shot < shot_end; i_shot++) {
-        par_results[i].save_count_data(cregs[i_shot], save_creg_memory_);
+        uint_t ip = i_shot / circ.shots;
+        par_results[i][ip].save_count_data(cregs[i_shot], save_creg_memory_);
       }
     };
     Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots, save_cregs,
@@ -818,17 +953,49 @@ void Executor<state_t>::run_circuit_shots(
 #endif
 
   for (auto &res : par_results) {
-    result.combine(std::move(res));
+    for (int_t i = 0; i < circ.num_bind_params; i++) {
+      (result_it + i)->combine(std::move(res[i]));
+    }
   }
-#ifdef AER_CUSTATEVEC
   if (sim_device_ == Device::GPU) {
-    result.metadata.add(cuStateVec_enable_, "cuStateVec_enable");
-    if (par_shots >= num_gpus_)
-      result.metadata.add(num_gpus_, "gpu_parallel_shots_");
-    else
-      result.metadata.add(par_shots, "gpu_parallel_shots_");
-  }
+    for (int_t i = 0; i < circ.num_bind_params; i++) {
+#ifdef AER_CUSTATEVEC
+      (result_it + i)->metadata.add(cuStateVec_enable_, "cuStateVec_enable");
 #endif
+      if (par_shots >= num_gpus_)
+        (result_it + i)->metadata.add(num_gpus_, "gpu_parallel_shots_");
+      else
+        (result_it + i)->metadata.add(par_shots, "gpu_parallel_shots_");
+    }
+  }
+}
+
+// Applies the ops in [first, last) to `state` for parameter set `iparam`.
+// Ops flagged has_bind_params are bound one at a time before being applied;
+// the runs of ops between them are forwarded to apply_ops unchanged.
+template <class state_t>
+void Executor<state_t>::run_circuit_with_parameter_binding(
+    state_t &state, OpItr first, OpItr last, ExperimentResult &result,
+    RngEngine &rng, const uint_t iparam, bool final_op) {
+  OpItr pending = first; // first op not yet handed to apply_ops
+  for (OpItr cur = first; cur != last; ++cur) {
+    if (!cur->has_bind_params)
+      continue;
+    // flush the unbound ops queued ahead of this one; more ops follow
+    if (pending != cur)
+      state.apply_ops(pending, cur, result, rng, false);
+
+    // bind this op for iparam and apply it from a one-element vector
+    std::vector<Operations::Op> bound(1);
+    bound[0] = Operations::bind_parameter(*cur, iparam, num_bind_params_);
+    state.apply_ops(bound.cbegin(), bound.cend(), result, rng,
+                    final_op && (cur == last - 1));
+    pending = cur + 1;
+  }
+  // flush whatever trails the last bound op
+  if (pending != last) {
+    state.apply_ops(pending, last, result, rng, final_op);
+  }
 }
 
 template <class state_t>
@@ -837,7 +1004,8 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,
                                         InputIterator last_meas, uint_t shots,
                                         state_t &state,
                                         ExperimentResult &result,
-                                        RngEngine &rng) const {
+                                        RngEngine &rng,
+                                        bool save_creg_to_state) const {
   // Check if meas_circ is empty, and if so return initial creg
   if (first_meas == last_meas) {
     while (shots-- > 0) {
@@ -918,7 +1086,10 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,
       creg.apply_roerror(roerror, rng);
 
     // Save count data
-    result.save_count_data(creg, save_creg_memory_);
+    if (save_creg_to_state)
+      state.creg() = creg;
+    else
+      result.save_count_data(creg, save_creg_memory_);
   }
 }
 
diff --git a/src/simulators/density_matrix/densitymatrix_executor.hpp b/src/simulators/density_matrix/densitymatrix_executor.hpp
index 6c7d28e923..08708bf8ff 100644
--- a/src/simulators/density_matrix/densitymatrix_executor.hpp
+++ b/src/simulators/density_matrix/densitymatrix_executor.hpp
@@ -30,6 +30,7 @@ namespace AER {
 
 namespace DensityMatrix {
 
+using ResultItr = std::vector<ExperimentResult>::iterator;
 //-------------------------------------------------------------------------
 // batched-shots executor for density matrix
 //-------------------------------------------------------------------------
@@ -57,11 +58,11 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
   // apply op to multiple shots , return flase if op is not supported to execute
   // in a batch
   bool apply_batched_op(const int_t istate, const Operations::Op &op,
-                        ExperimentResult &result, std::vector<RngEngine> &rng,
+                        ResultItr result, std::vector<RngEngine> &rng,
                         bool final_op = false) override;
 
   bool apply_branching_op(CircuitExecutor::Branch &root,
-                          const Operations::Op &op, ExperimentResult &result,
+                          const Operations::Op &op, ResultItr result,
                           bool final_op) override;
 
   // Initializes an n-qubit state to the all |0> state
@@ -73,9 +74,12 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
   template <typename list_t>
   void initialize_from_vector(const list_t &vec);
 
+  void run_circuit_with_sampling(Circuit &circ, const Config &config,
+                                 RngEngine &init_rng,
+                                 ResultItr result) override;
   void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
                          const Config &config, RngEngine &init_rng,
-                         ExperimentResult &result, bool sample_noise) override;
+                         ResultItr result_it, bool sample_noise) override;
 
   bool allocate_states(uint_t num_states, const Config &config) override {
     return BasePar::allocate_states(num_states, config);
@@ -126,6 +130,16 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
   cmatrix_t reduced_density_matrix_helper(const reg_t &qubits,
                                           const reg_t &qubits_sorted);
 
+  // Helper functions for shot-branching
+  void apply_save_density_matrix(CircuitExecutor::Branch &root,
+                                 const Operations::Op &op, ResultItr result,
+                                 bool final_op);
+  void apply_save_state(CircuitExecutor::Branch &root, const Operations::Op &op,
+                        ResultItr result, bool final_op);
+  void apply_save_probs(CircuitExecutor::Branch &root, const Operations::Op &op,
+                        ResultItr result);
+  void apply_save_amplitudes(CircuitExecutor::Branch &root,
+                             const Operations::Op &op, ResultItr result);
   //-----------------------------------------------------------------------
   // Measurement Helpers
   //-----------------------------------------------------------------------
@@ -301,17 +315,32 @@ void Executor<densmat_t>::set_config(const Config &config) {
   BaseBatch::set_config(config);
 }
 
+// Picks BasePar or BaseBatch for sampling via multiple_chunk_required().
+template <class state_t>
+void Executor<state_t>::run_circuit_with_sampling(Circuit &circ,
+                                                  const Config &config,
+                                                  RngEngine &init_rng,
+                                                  ResultItr result_it) {
+  // No noise model is passed in here, so an empty one is used for the query
+  Noise::NoiseModel empty_noise;
+  if (!BasePar::multiple_chunk_required(config, circ, empty_noise)) {
+    BaseBatch::run_circuit_with_sampling(circ, config, init_rng, result_it);
+    return;
+  }
+  BasePar::run_circuit_with_sampling(circ, config, init_rng, result_it);
+}
+
 template <class state_t>
 void Executor<state_t>::run_circuit_shots(
     Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
-    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
+    RngEngine &init_rng, ResultItr result_it, bool sample_noise) {
   state_t dummy_state;
   if (BasePar::multiple_chunk_required(config, circ, noise)) {
-    return BasePar::run_circuit_shots(circ, noise, config, init_rng, result,
+    return BasePar::run_circuit_shots(circ, noise, config, init_rng, result_it,
                                       sample_noise);
   } else {
-    return BaseBatch::run_circuit_shots(circ, noise, config, init_rng, result,
-                                        sample_noise);
+    return BaseBatch::run_circuit_shots(circ, noise, config, init_rng,
+                                        result_it, sample_noise);
   }
 }
 
@@ -373,7 +402,7 @@ bool Executor<densmat_t>::apply_parallel_op(const Operations::Op &op,
 template <class state_t>
 bool Executor<state_t>::apply_batched_op(const int_t istate,
                                          const Operations::Op &op,
-                                         ExperimentResult &result,
+                                         ResultItr result,
                                          std::vector<RngEngine> &rng,
                                          bool final_op) {
   if (op.conditional) {
@@ -424,8 +453,7 @@ bool Executor<state_t>::apply_batched_op(const int_t istate,
 template <class state_t>
 bool Executor<state_t>::apply_branching_op(CircuitExecutor::Branch &root,
                                            const Operations::Op &op,
-                                           ExperimentResult &result,
-                                           bool final_op) {
+                                           ResultItr result, bool final_op) {
   RngEngine dummy;
   if (Base::states_[root.state_index()].creg().check_conditional(op)) {
     switch (op.type) {
@@ -439,13 +467,20 @@ bool Executor<state_t>::apply_branching_op(CircuitExecutor::Branch &root,
     // save ops
     case Operations::OpType::save_expval:
     case Operations::OpType::save_expval_var:
+      Base::apply_save_expval(root, op, result);
+      break;
     case Operations::OpType::save_state:
+      apply_save_state(root, op, result, final_op);
+      break;
     case Operations::OpType::save_densmat:
+      apply_save_density_matrix(root, op, result, final_op);
+      break;
     case Operations::OpType::save_probs:
     case Operations::OpType::save_probs_ket:
+      apply_save_probs(root, op, result);
+      break;
     case Operations::OpType::save_amps_sq:
-      // call save functions in state class
-      Base::states_[root.state_index()].apply_op(op, result, dummy, final_op);
+      apply_save_amplitudes(root, op, result);
       break;
     default:
       return false;
@@ -759,6 +794,142 @@ Executor<densmat_t>::reduced_density_matrix_helper(const reg_t &qubits,
   return reduced_state;
 }
 
+// Shot-branching save of the reduced density matrix on op.qubits; it is saved
+// once per distinct parameter index among the shots of the branch.
+template <class densmat_t>
+void Executor<densmat_t>::apply_save_density_matrix(
+    CircuitExecutor::Branch &root, const Operations::Op &op, ResultItr result,
+    bool final_op) {
+  auto &state = Base::states_[root.state_index()];
+  cmatrix_t mat = state.reduced_density_matrix(op.qubits, final_op);
+
+  std::vector<bool> saved(Base::num_bind_params_, false);
+  for (int_t ishot = 0; ishot < root.num_shots(); ishot++) {
+    const uint_t ip = root.param_index(ishot);
+    if (saved[ip])
+      continue;
+    saved[ip] = true;
+    (result + ip)->save_data_average(state.creg(), op.string_params[0], mat,
+                                     op.type, op.save_type);
+  }
+}
+
+// Shot-branching save of the full density matrix. The op must act on all
+// qubits. The matrix is taken with move_to_matrix() when final_op is set and
+// with copy_to_matrix() otherwise, and is saved once per distinct parameter
+// index among the shots of the branch.
+template <class densmat_t>
+void Executor<densmat_t>::apply_save_state(CircuitExecutor::Branch &root,
+                                           const Operations::Op &op,
+                                           ResultItr result, bool final_op) {
+  auto &branch_state = Base::states_[root.state_index()];
+  if (op.qubits.size() != branch_state.qreg().num_qubits()) {
+    throw std::invalid_argument(op.name + " was not applied to all qubits."
+                                          " Only the full state can be saved.");
+  }
+
+  // Remap single data types to their average counterparts
+  Operations::DataSubType save_type = op.save_type;
+  if (op.save_type == Operations::DataSubType::single) {
+    save_type = Operations::DataSubType::average;
+  } else if (op.save_type == Operations::DataSubType::c_single) {
+    save_type = Operations::DataSubType::c_average;
+  }
+
+  // The "_method_" key is stored under "density_matrix"
+  std::string key = op.string_params[0];
+  if (key == "_method_")
+    key = "density_matrix";
+
+  std::vector<bool> saved(Base::num_bind_params_, false);
+  if (!final_op) {
+    // not the final op: work on the result of copy_to_matrix()
+    auto state = branch_state.copy_to_matrix();
+    for (int_t ishot = 0; ishot < root.num_shots(); ishot++) {
+      const uint_t ip = root.param_index(ishot);
+      if (saved[ip])
+        continue;
+      saved[ip] = true;
+      (result + ip)->save_data_average(branch_state.creg(), key, state,
+                                       OpType::save_densmat, save_type);
+    }
+  } else {
+    // final op: take the matrix via move_to_matrix()
+    auto state = branch_state.move_to_matrix();
+    for (int_t ishot = 0; ishot < root.num_shots(); ishot++) {
+      const uint_t ip = root.param_index(ishot);
+      if (saved[ip])
+        continue;
+      saved[ip] = true;
+      (result + ip)->save_data_average(branch_state.creg(), key, state,
+                                       OpType::save_densmat, save_type);
+    }
+  }
+}
+
+// Shot-branching save of the probabilities on op.qubits. They are stored once
+// per distinct parameter index among the shots of the branch; for
+// save_probs_ket the vector is converted to a ket dictionary before saving.
+template <class densmat_t>
+void Executor<densmat_t>::apply_save_probs(CircuitExecutor::Branch &root,
+                                           const Operations::Op &op,
+                                           ResultItr result) {
+  auto &state = Base::states_[root.state_index()];
+  auto probs = state.qreg().probabilities(op.qubits);
+
+  // saved[ip] is set once the result for parameter index ip has the data
+  std::vector<bool> saved(Base::num_bind_params_, false);
+  if (op.type != Operations::OpType::save_probs_ket) {
+    for (int_t ishot = 0; ishot < root.num_shots(); ishot++) {
+      const uint_t ip = root.param_index(ishot);
+      if (saved[ip])
+        continue;
+      saved[ip] = true;
+      (result + ip)->save_data_average(state.creg(), op.string_params[0],
+                                       probs, op.type, op.save_type);
+    }
+  } else {
+    // Convert to ket dict (json_chop_threshold_ is forwarded to vec2ket)
+    for (int_t ishot = 0; ishot < root.num_shots(); ishot++) {
+      const uint_t ip = root.param_index(ishot);
+      if (saved[ip])
+        continue;
+      saved[ip] = true;
+      (result + ip)->save_data_average(
+          state.creg(), op.string_params[0],
+          Utils::vec2ket(probs, Base::json_chop_threshold_, 16), op.type,
+          op.save_type);
+    }
+  }
+}
+
+// Shot-branching save of probabilities for the indices in op.int_params.
+template <class densmat_t>
+void Executor<densmat_t>::apply_save_amplitudes(CircuitExecutor::Branch &root,
+                                                const Operations::Op &op,
+                                                ResultItr result) {
+  if (op.int_params.empty()) {
+    throw std::invalid_argument(
+        "Invalid save_amplitudes instructions (empty params).");
+  }
+  auto &state = Base::states_[root.state_index()];
+  // One probability per requested index, in the order given
+  const int_t num_amps = op.int_params.size();
+  rvector_t amps_sq(num_amps, 0);
+  for (int_t k = 0; k < num_amps; ++k)
+    amps_sq[k] = state.qreg().probability(op.int_params[k]);
+
+  // Save once per distinct parameter index among the shots of the branch
+  std::vector<bool> saved(Base::num_bind_params_, false);
+  for (int_t ishot = 0; ishot < root.num_shots(); ishot++) {
+    const uint_t ip = root.param_index(ishot);
+    if (saved[ip])
+      continue;
+    saved[ip] = true;
+    (result + ip)->save_data_average(state.creg(), op.string_params[0],
+                                     amps_sq, op.type, op.save_type);
+  }
+}
+
 //=========================================================================
 // Implementation: Reset and Measurement Sampling
 //=========================================================================
diff --git a/src/simulators/density_matrix/densitymatrix_state.hpp b/src/simulators/density_matrix/densitymatrix_state.hpp
index a5bfa46585..9041adc1dd 100644
--- a/src/simulators/density_matrix/densitymatrix_state.hpp
+++ b/src/simulators/density_matrix/densitymatrix_state.hpp
@@ -133,6 +133,9 @@ class State : public QuantumState::State<densmat_t> {
   std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
                                     RngEngine &rng) override;
 
+  // Helper function for computing expectation value
+  double expval_pauli(const reg_t &qubits, const std::string &pauli) override;
+
   //-----------------------------------------------------------------------
   // Additional methods
   //-----------------------------------------------------------------------
@@ -146,6 +149,9 @@ class State : public QuantumState::State<densmat_t> {
   auto move_to_matrix();
   auto copy_to_matrix();
 
+  // Return the reduced density matrix for the simulator
+  cmatrix_t reduced_density_matrix(const reg_t &qubits, bool last_op = false);
+
   template <typename list_t>
   void initialize_from_vector(const list_t &vec);
 
@@ -210,12 +216,7 @@ class State : public QuantumState::State<densmat_t> {
   void apply_save_amplitudes_sq(const Operations::Op &op,
                                 ExperimentResult &result);
 
-  // Helper function for computing expectation value
-  virtual double expval_pauli(const reg_t &qubits,
-                              const std::string &pauli) override;
-
   // Return the reduced density matrix for the simulator
-  cmatrix_t reduced_density_matrix(const reg_t &qubits, bool last_op = false);
   cmatrix_t reduced_density_matrix_helper(const reg_t &qubits,
                                           const reg_t &qubits_sorted);
 
@@ -339,6 +340,8 @@ bool State<densmat_t>::allocate(uint_t num_qubits, uint_t block_bits,
                                 uint_t num_parallel_shots) {
   if (BaseState::max_matrix_qubits_ > 0)
     BaseState::qreg_.set_max_matrix_bits(BaseState::max_matrix_qubits_);
+  if (BaseState::max_sampling_shots_ > 0)
+    BaseState::qreg_.set_max_sampling_shots(BaseState::max_sampling_shots_);
 
   BaseState::qreg_.set_target_gpus(BaseState::target_gpus_);
   BaseState::qreg_.chunk_setup(block_bits * 2, block_bits * 2, 0, 1);
diff --git a/src/simulators/multi_state_executor.hpp b/src/simulators/multi_state_executor.hpp
index 570464ec03..27e533c702 100644
--- a/src/simulators/multi_state_executor.hpp
+++ b/src/simulators/multi_state_executor.hpp
@@ -41,6 +41,7 @@ class MultiStateExecutor : public Executor<state_t> {
 protected:
   std::vector<state_t> states_;
   std::vector<ClassicalRegister> cregs_; // classical registers for all shots
+  reg_t circuit_seeds_;
 
   // number of qubits for the circuit
   uint_t num_qubits_;
@@ -62,7 +63,8 @@ class MultiStateExecutor : public Executor<state_t> {
   uint_t num_max_shots_ =
       1; // max number of shots can be stored on available memory
 
-  int max_matrix_qubits_; // max qubits for matrix
+  int max_matrix_qubits_ = 0;  // max qubits for matrix
+  int max_sampling_shots_ = 0; // max shots for sampling
 
   // shot branching
   bool shot_branching_enable_ = true;
@@ -84,10 +86,6 @@ class MultiStateExecutor : public Executor<state_t> {
   // Threshold for chopping small values to zero in JSON
   double json_chop_threshold_ = 1e-10;
 
-  // Set a global phase exp(1j * theta) for the state
-  bool has_global_phase_ = false;
-  complex_t global_phase_ = 1;
-
   // number of threads for inner loop of shot-branching
   int_t shot_branch_parallel_ = 1;
 
@@ -109,23 +107,26 @@ class MultiStateExecutor : public Executor<state_t> {
 
   void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
                          const Config &config, RngEngine &init_rng,
-                         ExperimentResult &result, bool sample_noise) override;
+                         ResultItr result_it, bool sample_noise) override;
 
   void run_circuit_with_shot_branching(
       uint_t top_state, uint_t num_states, Circuit &circ,
       const Noise::NoiseModel &noise, const Config &config, RngEngine &init_rng,
-      uint_t ishot, uint_t nshots, ExperimentResult &result, bool sample_noise);
+      uint_t ishot, uint_t nshots, ResultItr result_it, bool sample_noise);
 
   // apply op for shot-branching, return false if op is not applied in sub-class
   virtual bool apply_branching_op(Branch &root, const Operations::Op &op,
-                                  ExperimentResult &result, bool final_op) {
+                                  ResultItr result_it, bool final_op) {
     std::cout << "  base is called, implement for each method" << std::endl;
     return false;
   }
 
+  // apply op with runtime parameterization
+  virtual void apply_runtime_parameterization(Branch &root,
+                                              const Operations::Op &op);
+
   // Apply the global phase
   virtual void apply_global_phase() {}
-  void set_global_phase(double theta);
 
   void set_parallelization(const Config &config, const Circuit &circ,
                            const Noise::NoiseModel &noise) override;
@@ -136,8 +137,7 @@ class MultiStateExecutor : public Executor<state_t> {
 
   template <typename InputIterator>
   void measure_sampler(InputIterator first_meas, InputIterator last_meas,
-                       uint_t shots, Branch &branch, ExperimentResult &result,
-                       std::vector<RngEngine> &rng);
+                       Branch &branch, ResultItr result_it);
 
   // sampling measure
   virtual std::vector<reg_t> sample_measure(state_t &state, const reg_t &qubits,
@@ -146,6 +146,9 @@ class MultiStateExecutor : public Executor<state_t> {
     // this is for single rng, impement in sub-class for multi-shots case
     return state.sample_measure(qubits, shots, rng[0]);
   }
+
+  void apply_save_expval(Branch &root, const Operations::Op &op,
+                         ResultItr result);
 };
 
 template <class state_t>
@@ -182,17 +185,6 @@ void MultiStateExecutor<state_t>::set_config(const Config &config) {
     num_threads_per_group_ = config.num_threads_per_device.value();
 }
 
-template <class state_t>
-void MultiStateExecutor<state_t>::set_global_phase(double theta) {
-  if (Linalg::almost_equal(theta, 0.0)) {
-    has_global_phase_ = false;
-    global_phase_ = 1;
-  } else {
-    has_global_phase_ = true;
-    global_phase_ = std::exp(complex_t(0.0, theta));
-  }
-}
-
 template <class state_t>
 void MultiStateExecutor<state_t>::set_distribution(uint_t num_states) {
 
@@ -245,10 +237,19 @@ bool MultiStateExecutor<state_t>::allocate_states(uint_t num_shots,
 template <class state_t>
 void MultiStateExecutor<state_t>::run_circuit_shots(
     Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
-    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
+    RngEngine &init_rng, ResultItr result_it, bool sample_noise) {
   num_qubits_ = circ.num_qubits;
   num_creg_memory_ = circ.num_memory;
   num_creg_registers_ = circ.num_registers;
+  Base::num_bind_params_ = circ.num_bind_params;
+  Base::num_shots_per_bind_param_ = circ.shots;
+
+  if (circ.num_bind_params > 1)
+    circuit_seeds_ = circ.seed_for_params;
+  else {
+    circuit_seeds_.resize(1);
+    circuit_seeds_[0] = circ.seed;
+  }
 
   if (this->sim_device_ == Device::GPU) {
 #ifdef _OPENMP
@@ -258,8 +259,7 @@ void MultiStateExecutor<state_t>::run_circuit_shots(
   } else if (this->sim_device_ == Device::ThrustCPU) {
     shot_omp_parallel_ = false;
   }
-
-  set_distribution(circ.shots);
+  set_distribution(circ.shots * Base::num_bind_params_);
   num_max_shots_ = Base::get_max_parallel_shots(config, circ, noise);
 
   bool shot_branching = false;
@@ -270,7 +270,7 @@ void MultiStateExecutor<state_t>::run_circuit_shots(
     shot_branching = false;
 
   if (!shot_branching) {
-    return Base::run_circuit_shots(circ, noise, config, init_rng, result,
+    return Base::run_circuit_shots(circ, noise, config, init_rng, result_it,
                                    sample_noise);
   }
   // disable cuStateVec if shot-branching is enabled
@@ -281,6 +281,8 @@ void MultiStateExecutor<state_t>::run_circuit_shots(
 
   Noise::NoiseModel dummy_noise;
   state_t dummy_state;
+  RngEngine dummy_rng;
+  dummy_rng.set_seed(circ.seed); // this is not used actually
 
   Circuit circ_opt;
   if (sample_noise) {
@@ -289,19 +291,19 @@ void MultiStateExecutor<state_t>::run_circuit_shots(
                                   Noise::NoiseModel::Method::circuit, true);
     auto fusion_pass = Base::transpile_fusion(circ_opt.opset(), config);
     fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(),
-                                 result);
+                                 *result_it);
     max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt);
   } else {
     auto fusion_pass = Base::transpile_fusion(circ.opset(), config);
     fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
-                                 result);
+                                 *result_it);
     max_matrix_qubits_ = Base::get_max_matrix_qubits(circ);
   }
 
 #ifdef AER_MPI
   // if shots are distributed to MPI processes, allocate cregs to be gathered
   if (Base::num_process_per_experiment_ > 1)
-    cregs_.resize(circ.shots);
+    cregs_.resize(circ.shots * Base::num_bind_params_);
 #endif
 
   // reserve states
@@ -315,7 +317,8 @@ void MultiStateExecutor<state_t>::run_circuit_shots(
         std::min((int_t)Base::parallel_shots_, (int_t)num_local_states_);
   }
   shot_branch_parallel_ = Base::parallel_shots_ / par_shots;
-  std::vector<ExperimentResult> par_results(par_shots);
+
+  std::vector<std::vector<ExperimentResult>> par_results(par_shots);
 
   auto parallel_shot_branching = [this, &par_results, par_shots, &circ,
                                   &circ_opt, noise, config, &init_rng,
@@ -324,6 +327,7 @@ void MultiStateExecutor<state_t>::run_circuit_shots(
     uint_t ishot = i * num_local_states_ / par_shots;
     uint_t nshots = (i + 1) * num_local_states_ / par_shots;
     nshots -= ishot;
+    par_results[i].resize(Base::num_bind_params_);
 
     // state distribution
     uint_t istate, nstates;
@@ -340,11 +344,11 @@ void MultiStateExecutor<state_t>::run_circuit_shots(
       if (sample_noise) {
         run_circuit_with_shot_branching(istate, nstates, circ_opt, noise,
                                         config, init_rng, ishot, nshots,
-                                        par_results[i], sample_noise);
+                                        par_results[i].begin(), sample_noise);
       } else {
         run_circuit_with_shot_branching(istate, nstates, circ, noise, config,
-                                        init_rng, ishot, nshots, par_results[i],
-                                        sample_noise);
+                                        init_rng, ishot, nshots,
+                                        par_results[i].begin(), sample_noise);
       }
     }
   };
@@ -363,12 +367,13 @@ void MultiStateExecutor<state_t>::run_circuit_shots(
       shot_end = num_global_states_ * (i + 1) / par_shots;
 
       for (; i_shot < shot_end; i_shot++) {
+        uint_t ip = i_shot / Base::num_shots_per_bind_param_;
         if (cregs_[i_shot].memory_size() > 0) {
           std::string memory_hex = cregs_[i_shot].memory_hex();
-          par_results[i].data.add_accum(static_cast<uint_t>(1ULL), "counts",
-                                        memory_hex);
+          par_results[i][ip].data.add_accum(static_cast<uint_t>(1ULL), "counts",
+                                            memory_hex);
           if (Base::save_creg_memory_) {
-            par_results[i].data.add_list(std::move(memory_hex), "memory");
+            par_results[i][ip].data.add_list(std::move(memory_hex), "memory");
           }
         }
       }
@@ -380,17 +385,23 @@ void MultiStateExecutor<state_t>::run_circuit_shots(
 #endif
 
   for (auto &res : par_results) {
-    result.combine(std::move(res));
+    for (int_t i = 0; i < Base::num_bind_params_; i++) {
+      (result_it + i)->combine(std::move(res[i]));
+    }
   }
 
-  result.metadata.add(true, "shot_branching_enabled");
+  for (int_t i = 0; i < Base::num_bind_params_; i++) {
+    (result_it + i)->metadata.add(true, "shot_branching_enabled");
+    (result_it + i)
+        ->metadata.add(sample_noise, "runtime_noise_sampling_enabled");
+  }
 }
 
 template <class state_t>
 void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
     uint_t top_state, uint_t num_states, Circuit &circ,
     const Noise::NoiseModel &noise, const Config &config, RngEngine &init_rng,
-    uint_t ishot, uint_t nshots, ExperimentResult &result, bool sample_noise) {
+    uint_t ishot, uint_t nshots, ResultItr result_it, bool sample_noise) {
   std::vector<std::shared_ptr<Branch>> branches;
   OpItr first;
   OpItr last;
@@ -425,49 +436,94 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
   if (par_shots == 0)
     par_shots = 1;
 
-  // initialize local shots
+  // initialize local shots and parameter indices
   std::vector<RngEngine> shots_storage(nshots);
-  if (global_state_index_ + ishot == 0)
-    shots_storage[0] = init_rng;
-  else
-    shots_storage[0].set_seed(circ.seed + global_state_index_ + ishot);
-  if (par_shots > 1) {
+  std::vector<std::shared_ptr<Branch>> waiting_branches;
+
+  // TO DO : parameter index is only needed at the first parameter bind
+  // store parameter indices
+  if (Base::num_bind_params_ > 1) {
+    if (par_shots > 1) {
 #pragma omp parallel for num_threads(par_shots)
-    for (int_t i = 1; i < nshots; i++)
-      shots_storage[i].set_seed(circ.seed + global_state_index_ + ishot + i);
+      for (int_t i = 0; i < nshots; i++) {
+        uint_t gid = global_state_index_ + ishot + i;
+        uint_t ip = gid / Base::num_shots_per_bind_param_;
+        shots_storage[i].set_seed(circ.seed_for_params[ip] +
+                                  (gid % Base::num_shots_per_bind_param_));
+      }
+    } else {
+      for (int_t i = 0; i < nshots; i++) {
+        uint_t gid = global_state_index_ + ishot + i;
+        uint_t ip = gid / Base::num_shots_per_bind_param_;
+        shots_storage[i].set_seed(circ.seed_for_params[ip] +
+                                  (gid % Base::num_shots_per_bind_param_));
+      }
+    }
   } else {
-    for (int_t i = 1; i < nshots; i++)
-      shots_storage[i].set_seed(circ.seed + global_state_index_ + ishot + i);
+    if (global_state_index_ + ishot == 0)
+      shots_storage[0] = init_rng;
+    else
+      shots_storage[0].set_seed(circ.seed + global_state_index_ + ishot);
+    if (par_shots > 1) {
+#pragma omp parallel for num_threads(par_shots)
+      for (int_t i = 1; i < nshots; i++)
+        shots_storage[i].set_seed(circ.seed + global_state_index_ + ishot + i);
+    } else {
+      for (int_t i = 1; i < nshots; i++)
+        shots_storage[i].set_seed(circ.seed + global_state_index_ + ishot + i);
+    }
+  }
+
+  // initial state
+  waiting_branches.push_back(std::make_shared<Branch>());
+  waiting_branches[0]->set_shots(shots_storage);
+  waiting_branches[0]->op_iterator() = first;
+  if (Base::num_bind_params_ > 1) {
+    waiting_branches[0]->set_param_index(global_state_index_ + ishot,
+                                         Base::num_shots_per_bind_param_);
+  } else {
+    waiting_branches[0]->set_param_index(0, 0);
   }
+  shots_storage.clear();
 
-  std::vector<ExperimentResult> par_results(par_shots);
+  std::vector<std::vector<ExperimentResult>> par_results(par_shots);
+  for (int_t i = 0; i < par_shots; i++) {
+    par_results[i].resize(Base::num_bind_params_);
+  }
 
-  uint_t num_shots_saved = 0;
+  reg_t num_shots_saved(Base::num_bind_params_, 0);
 
   // loop until all local shots are simulated
-  while (shots_storage.size() > 0) {
+  while (waiting_branches.size() > 0) {
     uint_t num_active_states = 1;
 
-    // initial state
-    branches.push_back(std::make_shared<Branch>());
-    branches[0]->state_index() = top_state;
-    branches[0]->set_shots(shots_storage);
-    branches[0]->op_iterator() = first;
-    branches[0]->shot_index() =
-        global_state_index_ + nshots - shots_storage.size();
-    shots_storage.clear();
-
-    // initialize initial state
-    states_[top_state].set_parallelization(this->parallel_state_update_);
-    states_[top_state].set_global_phase(circ.global_phase_angle);
-    states_[top_state].enable_density_matrix(!Base::has_statevector_ops_);
-    states_[top_state].initialize_qreg(num_qubits_);
-    states_[top_state].initialize_creg(num_creg_memory_, num_creg_registers_);
+    // set branches
+    for (int_t i = 0; i < waiting_branches.size(); i++) {
+      if (i > num_states)
+        break;
+      uint_t sid = top_state + i;
+      waiting_branches[i]->state_index() = sid;
+      waiting_branches[i]->op_iterator() = first;
+      branches.push_back(waiting_branches[i]);
+
+      // initialize state
+      states_[sid].set_parallelization(this->parallel_state_update_);
+      states_[sid].set_global_phase(circ.global_phase_angle);
+      states_[sid].enable_density_matrix(!Base::has_statevector_ops_);
+      states_[sid].initialize_qreg(num_qubits_);
+      states_[sid].initialize_creg(num_creg_memory_, num_creg_registers_);
+    }
+    if (waiting_branches.size() < num_states)
+      waiting_branches.clear();
+    else {
+      waiting_branches.erase(waiting_branches.begin(),
+                             waiting_branches.begin() + num_states);
+    }
 
     while (num_active_states > 0) { // loop until all branches execute all ops
       // functor for ops execution
       auto apply_ops_func = [this, &branches, &noise, &par_results, measure_seq,
-                             par_shots, num_active_states](int_t i) {
+                             last, par_shots, num_active_states](int_t i) {
         uint_t istate, state_end;
         istate = branches.size() * i / par_shots;
         state_end = branches.size() * (i + 1) / par_shots;
@@ -475,6 +531,8 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
         RngEngine dummy_rng;
 
         for (; istate < state_end; istate++) {
+          state_t &state = states_[branches[istate]->state_index()];
+
           while (branches[istate]->op_iterator() != measure_seq ||
                  branches[istate]->additional_ops().size() > 0) {
             // execute additional ops first if avaiable
@@ -484,7 +542,7 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
               while (iadd < num_add) {
                 if (apply_branching_op(*branches[istate],
                                        branches[istate]->additional_ops()[iadd],
-                                       par_results[i], false)) {
+                                       par_results[i].begin(), false)) {
                   // check if there are new branches
                   if (branches[istate]->num_branches() > 0) {
                     // if there are additional ops remaining, queue them on new
@@ -497,8 +555,8 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
                             branches[istate]->additional_ops()[k]);
                     }
                     branches[istate]->remove_empty_branches();
-                    states_[branches[istate]->state_index()].creg() =
-                        branches[istate]->creg();
+                    state.creg() = branches[istate]->creg();
+
                     // if there are some branches still remaining
                     if (branches[istate]->num_branches() > 0) {
                       nbranch += branches[istate]->num_branches();
@@ -508,9 +566,8 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
                     num_add = branches[istate]->additional_ops().size();
                   }
                 } else {
-                  states_[branches[istate]->state_index()].apply_op(
-                      branches[istate]->additional_ops()[iadd], par_results[i],
-                      dummy_rng, false);
+                  state.apply_op(branches[istate]->additional_ops()[iadd],
+                                 par_results[i][0], dummy_rng, false);
                 }
                 iadd++;
               }
@@ -521,34 +578,44 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
                 break;
               }
             }
+            OpItr op = branches[istate]->op_iterator();
+            if (op == measure_seq)
+              break;
+
             // then execute ops
-            if (branches[istate]->op_iterator() != measure_seq) {
-              if (!branches[istate]->apply_control_flow(
-                      states_[branches[istate]->state_index()].creg(),
-                      measure_seq)) {
-                if (!branches[istate]->apply_runtime_noise_sampling(
-                        states_[branches[istate]->state_index()].creg(),
-                        *branches[istate]->op_iterator(), noise)) {
-                  if (!apply_branching_op(*branches[istate],
-                                          *branches[istate]->op_iterator(),
-                                          par_results[i], true)) {
-                    states_[branches[istate]->state_index()].apply_op(
-                        *branches[istate]->op_iterator(), par_results[i],
-                        dummy_rng, true);
-                  }
-                }
-                branches[istate]->advance_iterator();
-                if (branches[istate]->num_branches() > 0) {
-                  branches[istate]->remove_empty_branches();
-                  states_[branches[istate]->state_index()].creg() =
-                      branches[istate]->creg();
+            if (!state.creg().check_conditional(*op)) {
+              branches[istate]->advance_iterator();
+              continue;
+            }
+            if (branches[istate]->apply_control_flow(state.creg(), measure_seq))
+              continue;
 
-                  // if there are some branches still remaining
-                  if (branches[istate]->num_branches() > 0) {
-                    nbranch += branches[istate]->num_branches();
-                    break;
-                  }
-                }
+            // runtime noise sampling
+            if (op->type == Operations::OpType::sample_noise) {
+              branches[istate]->apply_runtime_noise_sampling(state.creg(), *op,
+                                                             noise);
+            }
+            // runtime parameterization
+            else if (op->has_bind_params) {
+              apply_runtime_parameterization(*branches[istate], *op);
+            } else {
+              if (!apply_branching_op(*branches[istate], *op,
+                                      par_results[i].begin(),
+                                      (op + 1 == last))) {
+                state.apply_op(*op, par_results[i][0], dummy_rng,
+                               (op + 1 == last));
+              }
+            }
+
+            branches[istate]->advance_iterator();
+            if (branches[istate]->num_branches() > 0) {
+              branches[istate]->remove_empty_branches();
+              state.creg() = branches[istate]->creg();
+
+              // if there are some branches still remaining
+              if (branches[istate]->num_branches() > 0) {
+                nbranch += branches[istate]->num_branches();
+                break;
               }
             }
           }
@@ -571,22 +638,10 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
               if (branches[i]->branches()[j]->num_shots() > 0) {
                 // add new branched state
                 uint_t pos = branches.size();
-                if (pos >= num_states) { // if there is not enough memory to
-                                         // allocate copied state, shots are
-                                         // reserved to the next iteration
-                  // reset seed to reproduce same results
-                  for (int_t k = 0; k < branches[i]->branches()[j]->num_shots();
-                       k++) {
-                    branches[i]->branches()[j]->rng_shots()[k].set_seed(
-                        branches[i]
-                            ->branches()[j]
-                            ->rng_shots()[k]
-                            .initial_seed());
-                  }
-                  shots_storage.insert(
-                      shots_storage.end(),
-                      branches[i]->branches()[j]->rng_shots().begin(),
-                      branches[i]->branches()[j]->rng_shots().end());
+                if (pos >= num_states) {
+                  // if there is not enough memory, add to waiting list
+                  branches[i]->branches()[j]->reset_branch();
+                  waiting_branches.push_back(branches[i]->branches()[j]);
                 } else {
                   branches.push_back(branches[i]->branches()[j]);
                   branches[pos]->state_index() = top_state + pos;
@@ -641,9 +696,8 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
         state_end = branches.size() * (i + 1) / par_shots;
 
         for (; istate < state_end; istate++) {
-          measure_sampler(measure_seq, last, branches[istate]->num_shots(),
-                          *branches[istate], par_results[i],
-                          branches[istate]->rng_shots());
+          measure_sampler(measure_seq, last, *branches[istate],
+                          par_results[i].begin());
         }
       };
       bool can_parallel = par_shots > 1 && branches.size() > 1;
@@ -653,7 +707,8 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
       Utils::apply_omp_parallel_for(can_parallel, 0, par_shots,
                                     sampling_measure_func, par_shots);
 
-      result.metadata.add(true, "shot_branching_sampling_enabled");
+      for (int_t i = 0; i < Base::num_bind_params_; i++)
+        (result_it + i)->metadata.add(true, "shot_branching_sampling_enabled");
     } else {
       // save cregs to result
       auto save_cregs = [this, &branches, &par_results, par_shots](int_t i) {
@@ -664,18 +719,25 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
         for (; istate < state_end; istate++) {
           if (Base::num_process_per_experiment_ > 1) {
             for (int_t j = 0; j < branches[istate]->num_shots(); j++) {
-              cregs_[branches[istate]->shot_index() + j] =
-                  states_[branches[istate]->state_index()].creg();
+              uint_t idx = branches[istate]->rng_shots()[j].initial_seed();
+              uint_t ip = branches[istate]->param_index(j);
+              idx += ip * Base::num_shots_per_bind_param_;
+              idx -= circuit_seeds_[ip];
+              cregs_[idx] = states_[branches[istate]->state_index()].creg();
             }
           } else {
             std::string memory_hex =
                 states_[branches[istate]->state_index()].creg().memory_hex();
-            for (int_t j = 0; j < branches[istate]->num_shots(); j++)
-              par_results[i].data.add_accum(static_cast<uint_t>(1ULL), "counts",
-                                            memory_hex);
+            for (int_t j = 0; j < branches[istate]->num_shots(); j++) {
+              uint_t ip = branches[istate]->param_index(j);
+              par_results[i][ip].data.add_accum(static_cast<uint_t>(1ULL),
+                                                "counts", memory_hex);
+            }
             if (Base::save_creg_memory_) {
-              for (int_t j = 0; j < branches[istate]->num_shots(); j++)
-                par_results[i].data.add_list(memory_hex, "memory");
+              for (int_t j = 0; j < branches[istate]->num_shots(); j++) {
+                uint_t ip = branches[istate]->param_index(j);
+                par_results[i][ip].data.add_list(memory_hex, "memory");
+              }
             }
           }
         }
@@ -693,7 +755,34 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
   }
 
   for (auto &res : par_results) {
-    result.combine(std::move(res));
+    for (int_t i = 0; i < Base::num_bind_params_; i++) {
+      (result_it + i)->combine(std::move(res[i]));
+    }
+  }
+}
+
+template <class state_t> // binds op's runtime parameters for the shots in root
+void MultiStateExecutor<state_t>::apply_runtime_parameterization(
+    Branch &root, const Operations::Op &op) {
+  uint_t nparams = root.num_params(); // number of parameter indices in root
+  // copy the creg of the state this branch refers to into the branch itself
+  root.creg() = states_[root.state_index()].creg();
+  if (nparams == 1) {
+    uint_t ip = root.param_index(0); // the only parameter index of this branch
+    Operations::Op bind_op =
+        Operations::bind_parameter(op, ip, Base::num_bind_params_);
+    root.add_op_after_branch(bind_op); // no branching needed in this case
+  } else {
+    // branch shots (the loop below expects at least nparams child branches)
+    root.branch_shots_by_params();
+
+    // add the bound op to each child, using that child's first parameter index
+    for (int_t i = 0; i < nparams; i++) {
+      uint_t ip = root.branches()[i]->param_index(0);
+      Operations::Op bind_op =
+          Operations::bind_parameter(op, ip, Base::num_bind_params_);
+      root.branches()[i]->add_op_after_branch(bind_op);
+    }
   }
 }
 
@@ -701,17 +790,26 @@ template <class state_t>
 template <typename InputIterator>
 void MultiStateExecutor<state_t>::measure_sampler(InputIterator first_meas,
                                                   InputIterator last_meas,
-                                                  uint_t shots, Branch &branch,
-                                                  ExperimentResult &result,
-                                                  std::vector<RngEngine> &rng) {
+                                                  Branch &branch,
+                                                  ResultItr result) {
   state_t &state = states_[branch.state_index()];
+  std::vector<RngEngine> &rng = branch.rng_shots();
+  uint_t shots = branch.num_shots();
+
   // Check if meas_circ is empty, and if so return initial creg
   if (first_meas == last_meas) {
-    for (int_t i = 0; i < shots; i++) {
-      if (Base::num_process_per_experiment_ > 1) {
-        cregs_[branch.shot_index() + i] = state.creg();
-      } else {
-        result.save_count_data(state.creg(), Base::save_creg_memory_);
+    if (Base::num_process_per_experiment_ > 1) {
+      for (int_t i = 0; i < shots; i++) {
+        uint_t idx = branch.rng_shots()[i].initial_seed();
+        uint_t ip = branch.param_index(i);
+        idx += ip * Base::num_shots_per_bind_param_;
+        idx -= circuit_seeds_[ip];
+        cregs_[idx] = state.creg();
+      }
+    } else {
+      for (int_t i = 0; i < shots; i++) {
+        uint_t ip = branch.param_index(i);
+        (result + ip)->save_count_data(state.creg(), Base::save_creg_memory_);
       }
     }
     return;
@@ -738,12 +836,8 @@ void MultiStateExecutor<state_t>::measure_sampler(InputIterator first_meas,
                     meas_qubits.end());
 
   // Generate the samples
-  auto timer_start = myclock_t::now();
   std::vector<reg_t> all_samples;
   all_samples = sample_measure(state, meas_qubits, shots, rng);
-  auto time_taken =
-      std::chrono::duration<double>(myclock_t::now() - timer_start).count();
-  result.metadata.add(time_taken, "sample_measure_time");
 
   // Make qubit map of position in vector of measured qubits
   std::unordered_map<uint_t, uint_t> qubit_map;
@@ -769,9 +863,8 @@ void MultiStateExecutor<state_t>::measure_sampler(InputIterator first_meas,
       (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first;
   uint_t num_registers =
       (register_map.empty()) ? 0ULL : 1 + register_map.rbegin()->first;
-  ClassicalRegister creg;
   for (int_t i = 0; i < all_samples.size(); i++) {
-    creg = state.creg();
+    ClassicalRegister creg = state.creg();
 
     // process memory bit measurements
     for (const auto &pair : memory_map) {
@@ -790,13 +883,72 @@ void MultiStateExecutor<state_t>::measure_sampler(InputIterator first_meas,
 
     // save creg to gather
     if (Base::num_process_per_experiment_ > 1) {
-      for (int_t j = 0; j < shots; j++)
-        cregs_[branch.shot_index() + j] = creg;
+      uint_t idx = branch.rng_shots()[i].initial_seed();
+      uint_t ip = branch.param_index(i);
+      idx += ip * Base::num_shots_per_bind_param_;
+      idx -= circuit_seeds_[ip];
+      cregs_[idx] = creg;
     } else {
+      uint_t ip = branch.param_index(i);
       std::string memory_hex = creg.memory_hex();
-      result.data.add_accum(static_cast<uint_t>(1ULL), "counts", memory_hex);
+      (result + ip)
+          ->data.add_accum(static_cast<uint_t>(1ULL), "counts", memory_hex);
       if (Base::save_creg_memory_)
-        result.data.add_list(memory_hex, "memory");
+        (result + ip)->data.add_list(memory_hex, "memory");
+    }
+  }
+}
+
+template <class state_t> // saves expval (or mean/variance) of root's state
+void MultiStateExecutor<state_t>::apply_save_expval(Branch &root,
+                                                    const Operations::Op &op,
+                                                    ResultItr result) {
+  // Reject an instruction without Pauli components (std::invalid_argument)
+  if (op.expval_params.empty()) {
+    throw std::invalid_argument(
+        "Invalid save expval instruction (Pauli components are empty).");
+  }
+  bool variance = (op.type == Operations::OpType::save_expval_var);
+
+  // Accumulate expval components
+  double expval(0.);
+  double sq_expval(0.); // only accumulated when variance is requested
+
+  for (const auto &param : op.expval_params) {
+    // param is tuple (pauli, coeff, sq_coeff)
+    auto val =
+        states_[root.state_index()].expval_pauli(op.qubits, std::get<0>(param));
+    expval += std::get<1>(param) * val;
+    if (variance) {
+      sq_expval += std::get<2>(param) * val;
+    }
+  }
+  // copied[ip] marks parameter indices whose result already got the value
+  std::vector<bool> copied(Base::num_bind_params_, false);
+  if (variance) {
+    std::vector<double> expval_var(2);
+    expval_var[0] = expval;                      // mean
+    expval_var[1] = sq_expval - expval * expval; // variance
+    for (int_t i = 0; i < root.num_shots(); i++) {
+      uint_t ip = root.param_index(i); // parameter index of shot i
+      if (!copied[ip]) { // save once per parameter index, not once per shot
+        (result + ip)
+            ->save_data_average(states_[root.state_index()].creg(),
+                                op.string_params[0], expval_var, op.type,
+                                op.save_type);
+        copied[ip] = true;
+      }
+    }
+  } else {
+    for (int_t i = 0; i < root.num_shots(); i++) {
+      uint_t ip = root.param_index(i); // parameter index of shot i
+      if (!copied[ip]) { // save once per parameter index, not once per shot
+        (result + ip)
+            ->save_data_average(states_[root.state_index()].creg(),
+                                op.string_params[0], expval, op.type,
+                                op.save_type);
+        copied[ip] = true;
+      }
+    }
     }
   }
 }
diff --git a/src/simulators/parallel_state_executor.hpp b/src/simulators/parallel_state_executor.hpp
index 1dbb0983fd..85121689a9 100644
--- a/src/simulators/parallel_state_executor.hpp
+++ b/src/simulators/parallel_state_executor.hpp
@@ -79,11 +79,11 @@ class ParallelStateExecutor : public virtual MultiStateExecutor<state_t> {
 
   void run_circuit_with_sampling(Circuit &circ, const Config &config,
                                  RngEngine &init_rng,
-                                 ExperimentResult &result) override;
+                                 ResultItr result_it) override;
 
   void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
                          const Config &config, RngEngine &init_rng,
-                         ExperimentResult &result, bool sample_noise) override;
+                         ResultItr result_it, bool sample_noise) override;
 
   template <typename InputIterator>
   void measure_sampler(InputIterator first_meas, InputIterator last_meas,
@@ -93,14 +93,14 @@ class ParallelStateExecutor : public virtual MultiStateExecutor<state_t> {
   // apply operations for multi-chunk simulator
   template <typename InputIterator>
   void apply_ops_chunks(InputIterator first, InputIterator last,
-                        ExperimentResult &result, RngEngine &rng,
+                        ExperimentResult &result, RngEngine &rng, uint_t iparam,
                         bool final_ops);
 
   // apply ops on cache memory
   template <typename InputIterator>
   void apply_cache_blocking_ops(const int_t iGroup, InputIterator first,
                                 InputIterator last, ExperimentResult &result,
-                                RngEngine &rng);
+                                RngEngine &rng, uint_t iparam);
 
   // apply parallel operations (implement for each simulation method)
   virtual bool apply_parallel_op(const Operations::Op &op,
@@ -184,9 +184,6 @@ class ParallelStateExecutor : public virtual MultiStateExecutor<state_t> {
   // collect matrix over multiple chunks
   auto apply_to_matrix(bool copy = false);
 
-  // Apply the global phase
-  virtual void apply_global_phase();
-
   uint_t mapped_index(const uint_t idx);
 };
 
@@ -317,7 +314,7 @@ bool ParallelStateExecutor<state_t>::allocate_states(uint_t num_states,
                                                      const Config &config) {
   int_t i;
   bool init_states = true;
-  bool ret = true;
+  uint_t num_states_allocated;
   // deallocate qregs before reallocation
   if (Base::states_.size() > 0) {
     if (Base::states_.size() == num_states)
@@ -345,6 +342,7 @@ bool ParallelStateExecutor<state_t>::allocate_states(uint_t num_states,
     // allocate qregs
     Base::states_[0].set_config(config);
     Base::states_[0].qreg().set_max_matrix_bits(Base::max_matrix_qubits_);
+    Base::states_[0].qreg().set_max_sampling_shots(Base::max_sampling_shots_);
     Base::states_[0].qreg().set_num_threads_per_group(
         Base::num_threads_per_group_);
     Base::states_[0].set_num_global_qubits(Base::num_qubits_);
@@ -352,36 +350,35 @@ bool ParallelStateExecutor<state_t>::allocate_states(uint_t num_states,
     Base::states_[0].qreg().cuStateVec_enable(Base::cuStateVec_enable_);
 #endif
     Base::states_[0].qreg().set_target_gpus(Base::target_gpus_);
-
-    ret &= Base::states_[0].qreg().chunk_setup(
+    num_states_allocated = Base::states_[0].qreg().chunk_setup(
         squbits, gqubits, Base::global_state_index_, num_states);
-    for (i = 1; i < num_states; i++) {
+    for (i = 1; i < num_states_allocated; i++) {
       Base::states_[i].set_config(config);
-      ret &= Base::states_[i].qreg().chunk_setup(Base::states_[0].qreg(),
-                                                 Base::global_state_index_ + i);
+      Base::states_[i].qreg().chunk_setup(Base::states_[0].qreg(),
+                                          Base::global_state_index_ + i);
       Base::states_[i].qreg().set_num_threads_per_group(
           Base::num_threads_per_group_);
       Base::states_[i].set_num_global_qubits(Base::num_qubits_);
     }
   }
-  Base::num_active_states_ = num_states;
+  Base::num_active_states_ = num_states_allocated;
 
   // initialize groups
   Base::top_state_of_group_.clear();
   Base::num_groups_ = 0;
-  for (i = 0; i < num_states; i++) {
+  for (i = 0; i < num_states_allocated; i++) {
     if (Base::states_[i].qreg().top_of_group()) {
       Base::top_state_of_group_.push_back(i);
       Base::num_groups_++;
     }
   }
-  Base::top_state_of_group_.push_back(num_states);
+  Base::top_state_of_group_.push_back(num_states_allocated);
   Base::num_states_in_group_.resize(Base::num_groups_);
   for (i = 0; i < Base::num_groups_; i++) {
     Base::num_states_in_group_[i] =
         Base::top_state_of_group_[i + 1] - Base::top_state_of_group_[i];
   }
-  return ret;
+  return (num_states_allocated == num_states);
 }
 
 template <class state_t>
@@ -412,128 +409,174 @@ uint_t ParallelStateExecutor<state_t>::mapped_index(const uint_t idx) {
 template <class state_t>
 void ParallelStateExecutor<state_t>::run_circuit_with_sampling(
     Circuit &circ, const Config &config, RngEngine &init_rng,
-    ExperimentResult &result) {
+    ResultItr result_it) {
 
   // Optimize circuit
   Noise::NoiseModel dummy_noise;
   state_t dummy_state;
+  ExperimentResult fusion_result;
 
+  // optimize circuit
   bool cache_block = false;
   if (multiple_chunk_required(config, circ, dummy_noise)) {
     auto fusion_pass = Base::transpile_fusion(circ.opset(), config);
     fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
-                                 result);
+                                 fusion_result);
 
     // Cache blocking pass
     auto cache_block_pass = transpile_cache_blocking(circ, dummy_noise, config);
     cache_block_pass.set_sample_measure(true);
     cache_block_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
-                                      result);
+                                      fusion_result);
     cache_block = cache_block_pass.enabled();
   }
   if (!cache_block) {
     return Executor<state_t>::run_circuit_with_sampling(circ, config, init_rng,
-                                                        result);
+                                                        result_it);
   }
   Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ);
+  Base::num_bind_params_ = circ.num_bind_params;
 
   uint_t nchunks =
       1ull << ((circ.num_qubits - cache_block_qubit_) * qubit_scale());
+
   Base::set_distribution(nchunks);
   allocate(circ.num_qubits, config);
-  // Set state config
-  for (uint_t i = 0; i < Base::states_.size(); i++) {
-    Base::states_[i].set_parallelization(Base::parallel_state_update_);
-    Base::states_[i].set_global_phase(circ.global_phase_angle);
-  }
-  Base::set_global_phase(circ.global_phase_angle);
 
-  // run with multi-chunks
-  RngEngine rng = init_rng;
+  for (uint_t iparam = 0; iparam < Base::num_bind_params_; iparam++) {
+    ExperimentResult &result = *(result_it + iparam);
+    result.metadata.copy(fusion_result.metadata);
 
-  auto &ops = circ.ops;
-  auto first_meas = circ.first_measure_pos; // Position of first measurement op
-  bool final_ops = (first_meas == ops.size());
+    // Set state config
+    for (uint_t i = 0; i < Base::states_.size(); i++) {
+      Base::states_[i].set_parallelization(Base::parallel_state_update_);
+      if (circ.global_phase_for_params.size() == circ.num_bind_params)
+        Base::states_[i].set_global_phase(circ.global_phase_for_params[iparam]);
+      else
+        Base::states_[i].set_global_phase(circ.global_phase_angle);
+    }
 
-  initialize_qreg(circ.num_qubits);
-  for (uint_t i = 0; i < Base::states_.size(); i++) {
-    Base::states_[i].initialize_creg(circ.num_memory, circ.num_registers);
-  }
+    // run with multi-chunks
+    RngEngine rng;
+    if (iparam == 0)
+      rng = init_rng;
+    else if (Base::num_bind_params_ > 1)
+      rng.set_seed(circ.seed_for_params[iparam]);
+    else
+      rng.set_seed(circ.seed);
+
+    auto &ops = circ.ops;
+    auto first_meas =
+        circ.first_measure_pos; // Position of first measurement op
+    bool final_ops = (first_meas == ops.size());
+
+    initialize_qreg(circ.num_qubits);
+    for (uint_t i = 0; i < Base::states_.size(); i++) {
+      Base::states_[i].initialize_creg(circ.num_memory, circ.num_registers);
+    }
 
-  // Run circuit instructions before first measure
-  apply_ops_chunks(ops.cbegin(), ops.cbegin() + first_meas, result, rng,
-                   final_ops);
+    // Run circuit instructions before first measure
+    apply_ops_chunks(ops.cbegin(), ops.cbegin() + first_meas, result, rng,
+                     iparam, final_ops);
 
-  // Get measurement operations and set of measured qubits
-  measure_sampler(circ.ops.begin() + first_meas, circ.ops.end(), circ.shots,
-                  result, rng);
+    // Get measurement operations and set of measured qubits
+    measure_sampler(circ.ops.begin() + first_meas, circ.ops.end(), circ.shots,
+                    result, rng);
 
-  // Add measure sampling metadata
-  result.metadata.add(true, "measure_sampling");
-  Base::states_[0].add_metadata(result);
+    // Add measure sampling metadata
+    result.metadata.add(true, "measure_sampling");
+    Base::states_[0].add_metadata(result);
+  }
 }
 
 template <class state_t>
 void ParallelStateExecutor<state_t>::run_circuit_shots(
     Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
-    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
+    RngEngine &init_rng, ResultItr result_it, bool sample_noise) {
 
   if (!multiple_chunk_required(config, circ, noise)) {
-    return Base::run_circuit_shots(circ, noise, config, init_rng, result,
+    return Base::run_circuit_shots(circ, noise, config, init_rng, result_it,
                                    sample_noise);
   }
 
   uint_t nchunks =
       1ull << ((circ.num_qubits - cache_block_qubit_) * qubit_scale());
-  Base::set_distribution(nchunks);
+  Base::num_bind_params_ = circ.num_bind_params;
 
+  // Optimize circuit
+  Noise::NoiseModel dummy_noise;
+  state_t dummy_state;
   auto fusion_pass = Base::transpile_fusion(circ.opset(), config);
   auto cache_block_pass = transpile_cache_blocking(circ, noise, config);
+  ExperimentResult fusion_result;
+  if (!sample_noise) {
+    fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
+                                 fusion_result);
+    // Cache blocking pass
+    cache_block_pass.set_sample_measure(false);
+    cache_block_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
+                                      fusion_result);
+    Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ);
+  } else {
+    Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ);
+    Base::max_matrix_qubits_ =
+        std::max(Base::max_matrix_qubits_, (int)fusion_pass.max_qubit);
+  }
 
-  for (int_t ishot = 0; ishot < circ.shots; ishot++) {
-    RngEngine rng;
-    if (ishot == 0)
-      rng = init_rng;
-    else
-      rng.set_seed(circ.seed + ishot);
-
-    // Optimize circuit
-    Noise::NoiseModel dummy_noise;
-    state_t dummy_state;
+  Base::set_distribution(nchunks);
+  allocate(circ.num_qubits, config);
 
-    Circuit circ_opt;
-    if (sample_noise) {
-      circ_opt = noise.sample_noise(circ, rng);
-    } else {
-      circ_opt = circ;
+  for (uint_t iparam = 0; iparam < Base::num_bind_params_; iparam++) {
+    if (!sample_noise) {
+      ExperimentResult &result = *(result_it + iparam);
+      result.metadata.copy(fusion_result.metadata);
     }
-    fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(),
-                                 result);
-    Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt);
 
-    // Cache blocking pass
-    cache_block_pass.set_sample_measure(false);
-    cache_block_pass.optimize_circuit(circ_opt, dummy_noise,
-                                      dummy_state.opset(), result);
-    allocate(circ.num_qubits, config);
+    for (int_t ishot = 0; ishot < circ.shots; ishot++) {
+      RngEngine rng;
+      if (iparam == 0 && ishot == 0)
+        rng = init_rng;
+      else if (Base::num_bind_params_ > 1)
+        rng.set_seed(circ.seed_for_params[iparam] + ishot);
+      else
+        rng.set_seed(circ.seed + ishot);
+
+      // Set state config and global phase
+      for (uint_t i = 0; i < Base::states_.size(); i++) {
+        Base::states_[i].set_parallelization(Base::parallel_state_update_);
+        if (circ.global_phase_for_params.size() == circ.num_bind_params)
+          Base::states_[i].set_global_phase(
+              circ.global_phase_for_params[iparam]);
+        else
+          Base::states_[i].set_global_phase(circ.global_phase_angle);
+      }
 
-    // Set state config
-    for (uint_t i = 0; i < Base::states_.size(); i++) {
-      Base::states_[i].set_parallelization(Base::parallel_state_update_);
-      Base::states_[i].set_global_phase(circ.global_phase_angle);
-    }
-    Base::set_global_phase(circ.global_phase_angle);
+      // initialize
+      initialize_qreg(circ.num_qubits);
+      for (uint_t i = 0; i < Base::states_.size(); i++) {
+        Base::states_[i].initialize_creg(circ.num_memory, circ.num_registers);
+      }
 
-    initialize_qreg(circ.num_qubits);
-    for (uint_t i = 0; i < Base::states_.size(); i++) {
-      Base::states_[i].initialize_creg(circ.num_memory, circ.num_registers);
+      if (sample_noise) {
+        Circuit circ_opt = noise.sample_noise(circ, rng);
+        fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(),
+                                     *(result_it + iparam));
+        // Cache blocking pass
+        cache_block_pass.set_sample_measure(false);
+        cache_block_pass.optimize_circuit(
+            circ_opt, dummy_noise, dummy_state.opset(), *(result_it + iparam));
+
+        apply_ops_chunks(circ_opt.ops.cbegin(), circ_opt.ops.cend(),
+                         *(result_it + iparam), rng, iparam, true);
+      } else {
+        apply_ops_chunks(circ.ops.cbegin(), circ.ops.cend(),
+                         *(result_it + iparam), rng, iparam, true);
+      }
+      (result_it + iparam)
+          ->save_count_data(Base::states_[0].creg(), Base::save_creg_memory_);
     }
-
-    apply_ops_chunks(circ_opt.ops.cbegin(), circ_opt.ops.cend(), result, rng,
-                     true);
-    result.save_count_data(Base::states_[0].creg(), Base::save_creg_memory_);
+    Base::states_[0].add_metadata(*(result_it + iparam));
   }
-  Base::states_[0].add_metadata(result);
 }
 
 template <class state_t>
@@ -688,11 +731,9 @@ void ParallelStateExecutor<state_t>::apply_roerror(const Operations::Op &op,
 
 template <class state_t>
 template <typename InputIterator>
-void ParallelStateExecutor<state_t>::apply_ops_chunks(InputIterator first,
-                                                      InputIterator last,
-                                                      ExperimentResult &result,
-                                                      RngEngine &rng,
-                                                      bool final_ops) {
+void ParallelStateExecutor<state_t>::apply_ops_chunks(
+    InputIterator first, InputIterator last, ExperimentResult &result,
+    RngEngine &rng, uint_t iparam, bool final_ops) {
   uint_t iOp, nOp;
   reg_t multi_swap;
 
@@ -700,7 +741,7 @@ void ParallelStateExecutor<state_t>::apply_ops_chunks(InputIterator first,
   iOp = 0;
 
   while (iOp < nOp) {
-    const Operations::Op op_iOp = *(first + iOp);
+    const Operations::Op &op_iOp = *(first + iOp);
     if (op_iOp.type == Operations::OpType::gate &&
         op_iOp.name == "swap_chunk") {
       // apply swap between chunks
@@ -753,25 +794,44 @@ void ParallelStateExecutor<state_t>::apply_ops_chunks(InputIterator first,
 #pragma omp parallel for num_threads(Base::num_groups_)
         for (int_t ig = 0; ig < Base::num_groups_; ig++)
           apply_cache_blocking_ops(ig, first + iOpBegin, first + iOpEnd, result,
-                                   rng);
+                                   rng, iparam);
       } else {
         for (int_t ig = 0; ig < Base::num_groups_; ig++)
           apply_cache_blocking_ops(ig, first + iOpBegin, first + iOpEnd, result,
-                                   rng);
+                                   rng, iparam);
       }
       iOp = iOpEnd;
     } else {
-      if (!apply_parallel_op(op_iOp, result, rng,
-                             final_ops && nOp == iOp + 1)) {
-        if (Base::num_groups_ > 1 && chunk_omp_parallel_) {
+      if (op_iOp.has_bind_params) {
+        std::vector<Operations::Op> bind_op(1);
+        bind_op[0] =
+            Operations::bind_parameter(op_iOp, iparam, Base::num_bind_params_);
+        if (!apply_parallel_op(bind_op[0], result, rng,
+                               final_ops && nOp == iOp + 1)) {
+          if (Base::num_groups_ > 1 && chunk_omp_parallel_) {
 #pragma omp parallel for num_threads(Base::num_groups_)
-          for (int_t ig = 0; ig < Base::num_groups_; ig++)
-            apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result,
-                                     rng);
-        } else {
-          for (int_t ig = 0; ig < Base::num_groups_; ig++)
-            apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result,
-                                     rng);
+            for (int_t ig = 0; ig < Base::num_groups_; ig++)
+              apply_cache_blocking_ops(ig, bind_op.cbegin(), bind_op.cend(),
+                                       result, rng, iparam);
+          } else {
+            for (int_t ig = 0; ig < Base::num_groups_; ig++)
+              apply_cache_blocking_ops(ig, bind_op.cbegin(), bind_op.cend(),
+                                       result, rng, iparam);
+          }
+        }
+      } else {
+        if (!apply_parallel_op(op_iOp, result, rng,
+                               final_ops && nOp == iOp + 1)) {
+          if (Base::num_groups_ > 1 && chunk_omp_parallel_) {
+#pragma omp parallel for num_threads(Base::num_groups_)
+            for (int_t ig = 0; ig < Base::num_groups_; ig++)
+              apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result,
+                                       rng, iparam);
+          } else {
+            for (int_t ig = 0; ig < Base::num_groups_; ig++)
+              apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result,
+                                       rng, iparam);
+          }
         }
       }
     }
@@ -819,13 +879,18 @@ template <class state_t>
 template <typename InputIterator>
 void ParallelStateExecutor<state_t>::apply_cache_blocking_ops(
     const int_t iGroup, InputIterator first, InputIterator last,
-    ExperimentResult &result, RngEngine &rng) {
+    ExperimentResult &result, RngEngine &rng, uint_t iparam) {
   // for each chunk in group
   for (int_t iChunk = Base::top_state_of_group_[iGroup];
        iChunk < Base::top_state_of_group_[iGroup + 1]; iChunk++) {
     // fecth chunk in cache
     if (Base::states_[iChunk].qreg().fetch_chunk()) {
-      Base::states_[iChunk].apply_ops(first, last, result, rng, false);
+      if (Base::num_bind_params_ > 1) {
+        Base::run_circuit_with_parameter_binding(
+            Base::states_[iChunk], first, last, result, rng, iparam, false);
+      } else {
+        Base::states_[iChunk].apply_ops(first, last, result, rng, false);
+      }
 
       // release chunk from cache
       Base::states_[iChunk].qreg().release_chunk();
@@ -1026,25 +1091,6 @@ void ParallelStateExecutor<state_t>::apply_save_expval(
   }
 }
 
-template <class state_t>
-void ParallelStateExecutor<state_t>::apply_global_phase() {
-  if (Base::has_global_phase_) {
-    if (chunk_omp_parallel_ && Base::num_groups_ > 1) {
-#pragma omp parallel for
-      for (int_t ig = 0; ig < Base::num_groups_; ig++) {
-        for (int_t iChunk = Base::top_state_of_group_[ig];
-             iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
-          Base::states_[iChunk].qreg().apply_diagonal_matrix(
-              {0}, {Base::global_phase_, Base::global_phase_});
-      }
-    } else {
-      for (int_t i = 0; i < Base::states_.size(); i++)
-        Base::states_[i].qreg().apply_diagonal_matrix(
-            {0}, {Base::global_phase_, Base::global_phase_});
-    }
-  }
-}
-
 template <class state_t>
 void ParallelStateExecutor<state_t>::apply_chunk_swap(const reg_t &qubits) {
   uint_t nLarge = 1;
diff --git a/src/simulators/shot_branching.hpp b/src/simulators/shot_branching.hpp
index 358b07c08d..e9d1eb5811 100644
--- a/src/simulators/shot_branching.hpp
+++ b/src/simulators/shot_branching.hpp
@@ -29,12 +29,14 @@ class Branch {
   uint_t state_index_; // state index
   uint_t root_state_index_;
 
-  uint_t shot_index_; // starting shot index
-
   // creg to be stored to the state
   ClassicalRegister creg_;
   // random generators for shots
   std::vector<RngEngine> shots_;
+  // index of parameter for runtime parameter binding
+  reg_t param_index_;
+  reg_t param_shots_;
+
   // additional operations applied after shot branching
   std::vector<Operations::Op> additional_ops_;
 
@@ -63,7 +65,6 @@ class Branch {
 
   uint_t &state_index(void) { return state_index_; }
   uint_t &root_state_index(void) { return root_state_index_; }
-  uint_t &shot_index(void) { return shot_index_; }
   ClassicalRegister &creg(void) { return creg_; }
   std::vector<RngEngine> &rng_shots(void) { return shots_; }
   OpItr &op_iterator(void) { return iter_; }
@@ -140,6 +141,25 @@ class Branch {
                                     const Noise::NoiseModel &noise);
 
   void remove_empty_branches(void);
+
+  // reset shots to initial state
+  void reset_branch(void);
+
+  // for runtime parameterization
+  void set_param_index(uint_t ishot, uint_t nshots_per_param);
+  uint_t param_index(uint_t ishot) {
+    if (param_index_.size() == 1) {
+      return param_index_[0];
+    }
+    for (int_t i = 0; i < param_index_.size(); i++) {
+      if (param_shots_[i] > ishot) {
+        return param_index_[i];
+      }
+    }
+    return 0;
+  }
+  void branch_shots_by_params(void);
+  uint_t num_params(void) { return param_index_.size(); }
 };
 
 void Branch::branch_shots(reg_t &shots, int_t nbranch) {
@@ -150,15 +170,69 @@ void Branch::branch_shots(reg_t &shots, int_t nbranch) {
     branches_[i]->creg_ = creg_;
     branches_[i]->iter_ = iter_;
     branches_[i]->flow_marks_ = flow_marks_;
+    // copy the parameter list; per-parameter shot counts start at zero
+    if (param_index_.size() > 1) {
+      branches_[i]->param_index_ = param_index_;
+      branches_[i]->param_shots_.resize(param_index_.size());
+      for (int_t j = 0; j < param_index_.size(); j++)
+        branches_[i]->param_shots_[j] = 0;
+    }
   }
+
+  uint_t pos = 0;
   for (int_t i = 0; i < shots.size(); i++) {
     branches_[shots[i]]->shots_.push_back(shots_[i]);
+
+    if (param_index_.size() > 1) {
+      if (i >= param_shots_[pos])
+        pos++;
+      branches_[shots[i]]->param_shots_[pos]++;
+    }
   }
-  // update shot indices
-  uint_t index = shot_index_;
-  for (int_t i = 0; i < nbranch; i++) {
-    branches_[i]->shot_index_ = index;
-    index += branches_[i]->shots_.size();
+
+  // set parameter indices (drop unused ones, make shot counts cumulative)
+  if (param_index_.size() > 1) {
+    for (int_t i = 0; i < nbranch; i++) {
+      uint_t pos = 0;
+      while (pos < branches_[i]->param_index_.size()) {
+        if (branches_[i]->param_shots_[pos] == 0) {
+          branches_[i]->param_index_.erase(branches_[i]->param_index_.begin() +
+                                           pos);
+          branches_[i]->param_shots_.erase(branches_[i]->param_shots_.begin() +
+                                           pos);
+        } else {
+          if (pos > 0) {
+            branches_[i]->param_shots_[pos] +=
+                branches_[i]->param_shots_[pos - 1];
+          }
+          pos++;
+        }
+      }
+    }
+  } else {
+    for (int_t i = 0; i < nbranch; i++)
+      branches_[i]->set_param_index(param_index_[0], 0);
+  }
+}
+
+void Branch::branch_shots_by_params(void) {
+  branches_.resize(param_index_.size());
+
+  for (int_t i = 0; i < param_index_.size(); i++) {
+    branches_[i] = std::make_shared<Branch>();
+    branches_[i]->creg_ = creg_;
+    branches_[i]->iter_ = iter_;
+    branches_[i]->flow_marks_ = flow_marks_;
+  }
+  uint_t pos = 0;
+  for (int_t i = 0; i < shots_.size(); i++) {
+    if (i >= param_shots_[pos])
+      pos++;
+    branches_[pos]->shots_.push_back(shots_[i]);
+  }
+
+  for (int_t i = 0; i < param_index_.size(); i++) {
+    branches_[i]->set_param_index(param_index_[i], 0);
   }
 }
 
@@ -272,8 +346,9 @@ void Branch::remove_empty_branches(void) {
     if (branches_[j]->num_shots() > 0) {
       // copy shots to the root
       shots_ = branches_[j]->rng_shots();
+      param_index_ = branches_[j]->param_index_;
+      param_shots_ = branches_[j]->param_shots_;
       additional_ops_ = branches_[j]->additional_ops();
-      shot_index_ = branches_[j]->shot_index();
       creg_ = branches_[j]->creg();
       branches_[j].reset();
       istart = j + 1;
@@ -293,6 +368,39 @@ void Branch::remove_empty_branches(void) {
   branches_ = new_branches;
 }
 
+void Branch::reset_branch(void) {
+  // reset random seeds
+  for (int_t i = 0; i < shots_.size(); i++) {
+    shots_[i].set_seed(shots_[i].initial_seed());
+  }
+  additional_ops_.clear();
+  branches_.clear();
+  flow_marks_.clear();
+}
+
+// With nshots_per_param == 0, `ishot` is itself the parameter index and is
+// appended as one entry spanning all shots. Otherwise shot i maps to parameter
+// (ishot + i) / nshots_per_param and param_shots_ stores each range's end.
+void Branch::set_param_index(uint_t ishot, uint_t nshots_per_param) {
+  if (nshots_per_param == 0) {
+    param_index_.push_back(ishot);
+    param_shots_.push_back(shots_.size());
+    return;
+  }
+
+  param_index_.clear();
+  param_shots_.clear();
+  param_index_.push_back(ishot / nshots_per_param);
+  for (int_t i = 1; i < shots_.size(); i++) {
+    const uint_t ip = (ishot + i) / nshots_per_param;
+    if (ip != param_index_.back()) {
+      param_shots_.push_back(i);
+      param_index_.push_back(ip);
+    }
+  }
+  param_shots_.push_back(shots_.size());
+}
+
 //-------------------------------------------------------------------------
 } // namespace CircuitExecutor
 //-------------------------------------------------------------------------
diff --git a/src/simulators/state.hpp b/src/simulators/state.hpp
index 6209e1075d..d0cd4baac0 100644
--- a/src/simulators/state.hpp
+++ b/src/simulators/state.hpp
@@ -217,6 +217,9 @@ class Base {
   // set maximum number of qubits for matrix multiplication
   virtual void set_max_matrix_qubits(int_t bits) { max_matrix_qubits_ = bits; }
 
+  // set max sampling shots
+  void set_max_sampling_shots(int_t shots) { max_sampling_shots_ = shots; }
+
   // set max number of shots to execute in a batch (used in StateChunk class)
   virtual void set_max_bached_shots(uint_t shots) {}
 
@@ -259,6 +262,7 @@ class Base {
   complex_t global_phase_ = 1;
 
   int_t max_matrix_qubits_ = 0;
+  int_t max_sampling_shots_ = 0;
 
   std::string sim_device_name_ = "CPU";
 
diff --git a/src/simulators/statevector/chunk/chunk.hpp b/src/simulators/statevector/chunk/chunk.hpp
index 7d5c66415b..df6d068f91 100644
--- a/src/simulators/statevector/chunk/chunk.hpp
+++ b/src/simulators/statevector/chunk/chunk.hpp
@@ -165,10 +165,10 @@ class Chunk {
     }
   }
 
-  void ResizeMatrixBuffers(int bits) {
+  void ResizeMatrixBuffers(int bits, int max_shots) {
     // synchronize all kernel execution before changing matrix buffer size
     chunk_container_.lock()->synchronize(chunk_pos_);
-    chunk_container_.lock()->ResizeMatrixBuffers(bits);
+    chunk_container_.lock()->ResizeMatrixBuffers(bits, max_shots);
   }
 
   void CopyIn(Chunk<data_t> &src) {
@@ -262,6 +262,13 @@ class Chunk {
     }
     return chunk_container_.lock()->probability_buffer(chunk_pos_);
   }
+  void copy_reduce_buffer(std::vector<double> &ret, uint_t num_val) const {
+    if (cache_) {
+      return cache_->copy_reduce_buffer(ret, num_val);
+    }
+    return chunk_container_.lock()->copy_reduce_buffer(ret, chunk_pos_,
+                                                       num_val);
+  }
 
   void synchronize(void) const {
     if (cache_) {
@@ -316,6 +323,20 @@ class Chunk {
       chunk_container_.lock()->apply_matrix(chunk_pos_, qubits, control_bits,
                                             mat, chunk_index_, count);
   }
+  void apply_batched_matrix(const reg_t &qubits, const int_t control_bits,
+                            const cvector_t<double> &mat,
+                            const uint_t num_shots_per_matrix,
+                            const uint_t count) {
+    if (cache_)
+      cache_->chunk_container_.lock()->apply_batched_matrix(
+          cache_->chunk_pos_, qubits, control_bits, mat, num_shots_per_matrix,
+          chunk_index_, count);
+    else
+      chunk_container_.lock()->apply_batched_matrix(
+          chunk_pos_, qubits, control_bits, mat, num_shots_per_matrix,
+          chunk_index_, count);
+  }
+
   // apply diagonal matrix
   void apply_diagonal_matrix(const reg_t &qubits, const int_t control_bits,
                              const cvector_t<double> &diag,
@@ -327,6 +348,21 @@ class Chunk {
       chunk_container_.lock()->apply_diagonal_matrix(
           chunk_pos_, qubits, control_bits, diag, chunk_index_, count);
   }
+  void apply_batched_diagonal_matrix(const reg_t &qubits,
+                                     const int_t control_bits,
+                                     const cvector_t<double> &diag,
+                                     const uint_t num_shots_per_matrix,
+                                     const uint_t count) {
+    if (cache_)
+      cache_->chunk_container_.lock()->apply_batched_diagonal_matrix(
+          cache_->chunk_pos_, qubits, control_bits, diag, num_shots_per_matrix,
+          chunk_index_, count);
+    else
+      chunk_container_.lock()->apply_batched_diagonal_matrix(
+          chunk_pos_, qubits, control_bits, diag, num_shots_per_matrix,
+          chunk_index_, count);
+  }
+
   // apply (controlled) X
   void apply_X(const reg_t &qubits, const uint_t count) {
     if (cache_)
@@ -411,6 +447,14 @@ class Chunk {
     return chunk_container_.lock()->expval_pauli(chunk_pos_, qubits, pauli,
                                                  initial_phase);
   }
+  void batched_expval_pauli(const uint_t count, const reg_t &qubits,
+                            const std::string &pauli, bool variance,
+                            std::complex<double> param, bool first,
+                            const complex_t initial_phase) const {
+    chunk_container_.lock()->batched_expval_pauli(chunk_pos_, count, qubits,
+                                                  pauli, variance, param, first,
+                                                  initial_phase);
+  }
 };
 
 //------------------------------------------------------------------------------
diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp
index 029f9a039c..b249b12c95 100644
--- a/src/simulators/statevector/chunk/chunk_container.hpp
+++ b/src/simulators/statevector/chunk/chunk_container.hpp
@@ -135,6 +135,7 @@ class ChunkContainer
   virtual ~ChunkContainer() {}
 
   int_t chunk_bits(void) { return chunk_bits_; }
+  int_t num_qubits(void) { return num_qubits_; }
   int_t place(void) { return place_id_; }
   void set_place(int_t id, int_t n) {
     place_id_ = id;
@@ -171,7 +172,7 @@ class ChunkContainer
                           uint_t chunks, uint_t buffers = AER_MAX_BUFFERS,
                           bool multi_shots = false,
                           int matrix_bit = AER_DEFAULT_MATRIX_BITS,
-                          bool density_matrix = false) = 0;
+                          int max_shots = 0, bool density_matrix = false) = 0;
   virtual void Deallocate(void) = 0;
 
   virtual void Set(uint_t i, const thrust::complex<data_t> &t) = 0;
@@ -183,7 +184,7 @@ class ChunkContainer
                            uint_t size) const = 0;
   virtual void StoreUintParams(const std::vector<uint_t> &prm,
                                uint_t iChunk) const = 0;
-  virtual void ResizeMatrixBuffers(int bits) = 0;
+  virtual void ResizeMatrixBuffers(int bits, int max_shots) = 0;
 
   virtual void CopyIn(Chunk<data_t> &src, uint_t iChunk) = 0;
   virtual void CopyOut(Chunk<data_t> &dest, uint_t iChunk) = 0;
@@ -206,8 +207,8 @@ class ChunkContainer
                   uint_t count) const;
 
   template <typename Function>
-  void ExecuteSum2(double *pSum, Function func, uint_t iChunk,
-                   uint_t count) const;
+  void ExecuteSum2(double *pSum, Function func, uint_t iChunk, uint_t count,
+                   bool init = true) const;
 
   virtual reg_t sample_measure(uint_t iChunk, const std::vector<double> &rnds,
                                uint_t stride = 1, bool dot = true,
@@ -232,7 +233,7 @@ class ChunkContainer
   }
   virtual uint_t *param_pointer(uint_t iChunk) const { return NULL; }
 
-  virtual void synchronize(uint_t iChunk) { ; }
+  virtual void synchronize(uint_t iChunk) const { ; }
 
   // set qubits to be blocked
   virtual void set_blocked_qubits(uint_t iChunk, const reg_t &qubits) { ; }
@@ -254,6 +255,8 @@ class ChunkContainer
 
   virtual void copy_to_probability_buffer(std::vector<double> &buf, int pos) {}
 
+  virtual void copy_reduce_buffer(std::vector<double> &ret, uint_t iChunk,
+                                  uint_t num_val) const {}
   // classical register to store measured bits/used for bfunc operations
   virtual void allocate_creg(uint_t num_mem, uint_t num_reg) {}
   void set_num_creg_bits(uint_t bits) {
@@ -270,12 +273,23 @@ class ChunkContainer
                             const cvector_t<double> &mat, const uint_t gid,
                             const uint_t count);
 
+  virtual void apply_batched_matrix(const uint_t iChunk, const reg_t &qubits,
+                                    const int_t control_bits,
+                                    const cvector_t<double> &mat,
+                                    const uint_t num_shots_per_matrix,
+                                    const uint_t gid, const uint_t count);
+
   // apply diagonal matrix
   virtual void apply_diagonal_matrix(const uint_t iChunk, const reg_t &qubits,
                                      const int_t control_bits,
                                      const cvector_t<double> &diag,
                                      const uint_t gid, const uint_t count);
 
+  virtual void apply_batched_diagonal_matrix(
+      const uint_t iChunk, const reg_t &qubits, const int_t control_bits,
+      const cvector_t<double> &diag, const uint_t num_shots_per_matrix,
+      const uint_t gid, const uint_t count);
+
   // apply (controlled) X
   virtual void apply_X(const uint_t iChunk, const reg_t &qubits,
                        const uint_t gid, const uint_t count);
@@ -324,6 +338,12 @@ class ChunkContainer
                               const std::string &pauli,
                               const complex_t initial_phase) const;
 
+  virtual void batched_expval_pauli(const uint_t iChunk, const uint_t count,
+                                    const reg_t &qubits,
+                                    const std::string &pauli, bool variance,
+                                    std::complex<double> param, bool first,
+                                    const complex_t initial_phase) const;
+
 protected:
   int convert_blocked_qubit(int qubit) {
     int i;
@@ -639,8 +659,8 @@ struct complex_sum {
 template <typename data_t>
 template <typename Function>
 void ChunkContainer<data_t>::ExecuteSum2(double *pSum, Function func,
-                                         uint_t iChunk, uint_t count) const {
-
+                                         uint_t iChunk, uint_t count,
+                                         bool init) const {
 #ifdef AER_THRUST_GPU
   uint_t size = count * func.size(chunk_bits_);
 
@@ -673,7 +693,7 @@ void ChunkContainer<data_t>::ExecuteSum2(double *pSum, Function func,
           nt = QV_CUDA_NUM_THREADS;
         }
         dev_apply_function_sum_complex<data_t, Function>
-            <<<nb, nt, 0, strm>>>(buf, func, buf_size, ntotal);
+            <<<nb, nt, 0, strm>>>(buf, func, buf_size, ntotal, init);
       }
       cudaError_t err = cudaGetLastError();
       if (err != cudaSuccess) {
@@ -714,7 +734,7 @@ void ChunkContainer<data_t>::ExecuteSum2(double *pSum, Function func,
         }
         dim3 grid(nb, count, 1);
         dev_apply_function_sum_complex<data_t, Function>
-            <<<grid, nt, 0, strm>>>(buf, func, buf_size, ntotal);
+            <<<grid, nt, 0, strm>>>(buf, func, buf_size, ntotal, init);
       }
       cudaError_t err = cudaGetLastError();
       if (err != cudaSuccess) {
@@ -775,7 +795,10 @@ void ChunkContainer<data_t>::ExecuteSum2(double *pSum, Function func,
     if (count == 1 && pSum) {
       *((thrust::complex<double> *)pSum) = ret;
     } else {
-      *((thrust::complex<double> *)reduce_buffer(iChunk + i)) = ret;
+      if (init)
+        *((thrust::complex<double> *)reduce_buffer(iChunk + i)) = ret;
+      else
+        *((thrust::complex<double> *)reduce_buffer(iChunk + i)) += ret;
     }
   }
 #endif
@@ -876,6 +899,60 @@ void ChunkContainer<data_t>::apply_diagonal_matrix(
   }
 }
 
+template <typename data_t>
+void ChunkContainer<data_t>::apply_batched_matrix(
+    const uint_t iChunk, const reg_t &qubits, const int_t control_bits,
+    const cvector_t<double> &mat, const uint_t num_shots_per_matrix,
+    const uint_t gid, const uint_t count) {
+  const size_t N = qubits.size() - control_bits;
+  uint_t imat_begin = gid / num_shots_per_matrix;
+  uint_t imat_end = (gid + count - 1) / num_shots_per_matrix;
+  uint_t matrix_size = 1ull << (2 * N);
+
+  StoreMatrix(&mat[0] + imat_begin * matrix_size, iChunk,
+              (imat_end - imat_begin + 1) * matrix_size);
+  if (N == 1) {
+    Execute(
+        BatchedMatrixMult2x2<data_t>(qubits, imat_begin, num_shots_per_matrix),
+        iChunk, gid, count);
+  } else {
+    auto qubits_sorted = qubits;
+    std::sort(qubits_sorted.begin(), qubits_sorted.end());
+    for (int i = 0; i < N; i++) {
+      qubits_sorted.push_back(qubits[i]);
+    }
+    StoreUintParams(qubits_sorted, iChunk);
+
+    Execute(BatchedMatrixMultNxN<data_t>(N, imat_begin, num_shots_per_matrix),
+            iChunk, gid, count);
+  }
+}
+
+template <typename data_t>
+void ChunkContainer<data_t>::apply_batched_diagonal_matrix(
+    const uint_t iChunk, const reg_t &qubits, const int_t control_bits,
+    const cvector_t<double> &diag, const uint_t num_shots_per_matrix,
+    const uint_t gid, const uint_t count) {
+  const size_t N = qubits.size() - control_bits;
+  uint_t imat_begin = gid / num_shots_per_matrix;
+  uint_t imat_end = (gid + count - 1) / num_shots_per_matrix;
+  uint_t matrix_size = 1ull << N;
+
+  StoreMatrix(&diag[0] + imat_begin * matrix_size, iChunk,
+              (imat_end - imat_begin + 1) * matrix_size);
+  if (N == 1) {
+    Execute(BatchedDiagonalMatrixMult2x2<data_t>(qubits, imat_begin,
+                                                 num_shots_per_matrix),
+            iChunk, gid, count);
+  } else {
+    StoreUintParams(qubits, iChunk);
+
+    Execute(BatchedDiagonalMatrixMultNxN<data_t>(N, imat_begin,
+                                                 num_shots_per_matrix),
+            iChunk, gid, count);
+  }
+}
+
 template <typename data_t>
 void ChunkContainer<data_t>::apply_X(const uint_t iChunk, const reg_t &qubits,
                                      const uint_t gid, const uint_t count) {
@@ -1061,6 +1138,7 @@ ChunkContainer<data_t>::expval_pauli(const uint_t iChunk, const reg_t &qubits,
   // specialize x_max == 0
   if (x_mask == 0) {
     ExecuteSum(&ret, expval_pauli_Z_func<data_t>(z_mask), iChunk, 1);
+    synchronize(iChunk);
     return ret;
   }
 
@@ -1070,9 +1148,43 @@ ChunkContainer<data_t>::expval_pauli(const uint_t iChunk, const reg_t &qubits,
   add_y_phase(num_y, phase);
   ExecuteSum(&ret, expval_pauli_XYZ_func<data_t>(x_mask, z_mask, x_max, phase),
              iChunk, 1);
+  synchronize(iChunk);
   return ret;
 }
 
+// Per-chunk Pauli expectation values; pSum is nullptr, so results stay in
+// the reduce buffers. `first` is passed as ExecuteSum2's `init` argument.
+template <typename data_t>
+void ChunkContainer<data_t>::batched_expval_pauli(
+    const uint_t iChunk, const uint_t count, const reg_t &qubits,
+    const std::string &pauli, bool variance, std::complex<double> param,
+    bool first, const complex_t initial_phase) const {
+  uint_t x_mask, z_mask, num_y, x_max;
+  std::tie(x_mask, z_mask, num_y, x_max) = pauli_masks_and_phase(qubits, pauli);
+  // Special case for only I Paulis
+  if (x_mask + z_mask == 0) {
+    ExecuteSum2(nullptr, batched_expval_I_func<data_t>(variance, param), iChunk,
+                count, first);
+    return;
+  }
+  // specialize x_mask == 0: handled by the Z-only functor
+  if (x_mask == 0) {
+    ExecuteSum2(nullptr,
+                batched_expval_pauli_Z_func<data_t>(variance, param, z_mask),
+                iChunk, count, first);
+    return;
+  }
+
+  // Compute the overall phase of the operator.
+  // This is (-1j) ** number of Y terms modulo 4
+  auto phase = std::complex<data_t>(initial_phase);
+  add_y_phase(num_y, phase);
+  ExecuteSum2(nullptr,
+              batched_expval_pauli_XYZ_func<data_t>(variance, param, x_mask,
+                                                    z_mask, x_max, phase),
+              iChunk, count, first);
+}
+
 //------------------------------------------------------------------------------
 } // end namespace Chunk
 } // end namespace QV
diff --git a/src/simulators/statevector/chunk/chunk_manager.hpp b/src/simulators/statevector/chunk/chunk_manager.hpp
index 6bfca5397e..cb3c7ebbb9 100644
--- a/src/simulators/statevector/chunk/chunk_manager.hpp
+++ b/src/simulators/statevector/chunk/chunk_manager.hpp
@@ -57,6 +57,7 @@ class ChunkManager {
   int num_threads_per_group_;
   uint_t num_creg_bits_ = 0;
 
+  bool chunk_distribution_enable_ = true; // enable distribution over GPUs
   reg_t target_gpus_;
 
 public:
@@ -72,8 +73,8 @@ class ChunkManager {
   uint_t num_containers(void) { return chunks_.size(); }
 
   uint_t Allocate(int chunk_bits, int nqubits, uint_t nchunks,
-                  uint_t chunk_index, int matrix_bit, bool density_mat,
-                  reg_t &gpus, bool enable_cuStatevec);
+                  uint_t chunk_index, int matrix_bit, int max_shots,
+                  bool density_mat, reg_t &gpus, bool enable_cuStatevec);
   void Free(void);
 
   int num_devices(void) { return num_devices_; }
@@ -98,6 +99,8 @@ class ChunkManager {
   void execute_on_device(Function func,
                          const std::vector<std::complex<double>> &mat,
                          const std::vector<uint_t> &prm);
+
+  void enable_chunk_distribution(bool flg) { chunk_distribution_enable_ = flg; }
 };
 
 template <typename data_t>
@@ -161,8 +164,9 @@ ChunkManager<data_t>::~ChunkManager() {
 template <typename data_t>
 uint_t ChunkManager<data_t>::Allocate(int chunk_bits, int nqubits,
                                       uint_t nchunks, uint_t chunk_index,
-                                      int matrix_bit, bool density_mat,
-                                      reg_t &gpus, bool enable_cuStatevec) {
+                                      int matrix_bit, int max_shots,
+                                      bool density_mat, reg_t &gpus,
+                                      bool enable_cuStatevec) {
   uint_t num_buffers;
   int iDev;
   uint_t is, ie, nc;
@@ -223,13 +227,35 @@ uint_t ChunkManager<data_t>::Allocate(int chunk_bits, int nqubits,
         multi_gpu = false;
         num_places_ = 1;
 #else
-        multi_gpu = true;
-        num_places_ = num_devices_;
-        if (num_threads_per_group_ > 1)
-          num_places_ *= num_threads_per_group_;
-
-        if (num_places_ > omp_get_max_threads()) {
+        if (chunk_distribution_enable_) {
+          multi_gpu = true;
           num_places_ = num_devices_;
+          if (num_threads_per_group_ > 1)
+            num_places_ *= num_threads_per_group_;
+
+          if (num_places_ > omp_get_max_threads()) {
+            num_places_ = num_devices_;
+          }
+        } else {
+          multi_gpu = false;
+          num_places_ = 1;
+          idev_start = 0;
+
+          // define device to be allocated
+          if (num_devices_ > 1) {
+            size_t freeMem, totalMem, maxMem;
+            cudaSetDevice(0);
+            cudaMemGetInfo(&freeMem, &totalMem);
+            maxMem = freeMem;
+            for (i = 1; i < num_devices_; i++) {
+              cudaSetDevice(i);
+              cudaMemGetInfo(&freeMem, &totalMem);
+              if (freeMem > maxMem) {
+                maxMem = freeMem;
+                idev_start = i;
+              }
+            }
+          }
         }
 #endif
       } else { // single chunk
@@ -310,13 +336,13 @@ uint_t ChunkManager<data_t>::Allocate(int chunk_bits, int nqubits,
       chunks_[iDev]->set_num_creg_bits(num_creg_bits_);
       if (num_devices_ > 0) {
         int id = target_gpus_[(iDev + idev_start) % num_devices_];
-        chunks_allocated +=
-            chunks_[iDev]->Allocate(id, chunk_bits, nqubits, nc, num_buffers,
-                                    multi_shots_, matrix_bit, density_matrix_);
+        chunks_allocated += chunks_[iDev]->Allocate(
+            id, chunk_bits, nqubits, nc, num_buffers, multi_shots_, matrix_bit,
+            max_shots, density_matrix_);
       } else {
-        chunks_allocated +=
-            chunks_[iDev]->Allocate(iDev, chunk_bits, nqubits, nc, num_buffers,
-                                    multi_shots_, matrix_bit, density_matrix_);
+        chunks_allocated += chunks_[iDev]->Allocate(
+            iDev, chunk_bits, nqubits, nc, num_buffers, multi_shots_,
+            matrix_bit, max_shots, density_matrix_);
       }
     }
     if (chunks_allocated < num_chunks_) {
@@ -335,9 +361,9 @@ uint_t ChunkManager<data_t>::Allocate(int chunk_bits, int nqubits,
           chunks_[chunks_.size() - 1]->set_chunk_index(
               chunk_index_ + chunks_allocated +
               is); // set first chunk index for the container
-          chunks_[chunks_.size() - 1]->Allocate(-1, chunk_bits, nqubits, nc,
-                                                num_buffers, multi_shots_,
-                                                matrix_bit, density_matrix_);
+          chunks_[chunks_.size() - 1]->Allocate(
+              -1, chunk_bits, nqubits, nc, num_buffers, multi_shots_,
+              matrix_bit, max_shots, density_matrix_);
         }
       }
       num_places_ += nplaces_add;
@@ -351,7 +377,8 @@ uint_t ChunkManager<data_t>::Allocate(int chunk_bits, int nqubits,
     iplace_host_ = chunks_.size();
     chunks_.push_back(std::make_shared<HostChunkContainer<data_t>>());
     chunks_[iplace_host_]->Allocate(-1, chunk_bits, nqubits, 0, AER_MAX_BUFFERS,
-                                    multi_shots_, matrix_bit, density_matrix_);
+                                    multi_shots_, matrix_bit, max_shots,
+                                    density_matrix_);
 #endif
   } else {
     for (iDev = 0; iDev < chunks_.size(); iDev++) {
diff --git a/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp
index 9fe2fadefd..e72d72003d 100644
--- a/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp
+++ b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp
@@ -46,7 +46,7 @@ class cuStateVecChunkContainer : public DeviceChunkContainer<data_t> {
 
   uint_t Allocate(int idev, int chunk_bits, int num_qubits, uint_t chunks,
                   uint_t buffers, bool multi_shots, int matrix_bit,
-                  bool density_matrix) override;
+                  int max_shots, bool density_matrix) override;
   void Deallocate(void) override;
 
   reg_t sample_measure(uint_t iChunk, const std::vector<double> &rnds,
@@ -126,10 +126,11 @@ cuStateVecChunkContainer<data_t>::~cuStateVecChunkContainer(void) {
 template <typename data_t>
 uint_t cuStateVecChunkContainer<data_t>::Allocate(
     int idev, int chunk_bits, int num_qubits, uint_t chunks, uint_t buffers,
-    bool multi_shots, int matrix_bit, bool density_matrix) {
+    bool multi_shots, int matrix_bit, int max_shots, bool density_matrix) {
   uint_t nc;
   nc = BaseContainer::Allocate(idev, chunk_bits, num_qubits, chunks, buffers,
-                               multi_shots, matrix_bit, density_matrix);
+                               multi_shots, matrix_bit, max_shots,
+                               density_matrix);
 
   // initialize custatevevtor handle
   custatevecStatus_t err;
diff --git a/src/simulators/statevector/chunk/cuda_kernels.hpp b/src/simulators/statevector/chunk/cuda_kernels.hpp
index b74bb514ce..be8f10c524 100644
--- a/src/simulators/statevector/chunk/cuda_kernels.hpp
+++ b/src/simulators/statevector/chunk/cuda_kernels.hpp
@@ -205,7 +205,8 @@ __global__ void dev_reduce_sum(double *pReduceBuffer, uint_t n,
 template <typename data_t, typename kernel_t>
 __global__ void
 dev_apply_function_sum_complex(thrust::complex<double> *pReduceBuffer,
-                               kernel_t func, uint_t buf_size, uint_t count) {
+                               kernel_t func, uint_t buf_size, uint_t count,
+                               bool init) {
   // One cache entry per warp/wavefront
   __shared__ thrust::complex<double> cache[_MAX_THD / _WS];
   thrust::complex<double> sum;
@@ -220,7 +221,11 @@ dev_apply_function_sum_complex(thrust::complex<double> *pReduceBuffer,
   if (!func.check_conditional(i))
     return;
 
-  sum = func(i);
+  sum = 0.0;
+  if (!init && threadIdx.x == 0 && blockIdx.x == 0) {
+    sum = pReduceBuffer[buf_size * iChunk];
+  }
+  sum += func(i);
 
   // reduce in warp
   nw = min(blockDim.x, _WS);
diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp
index 6ae1ac9950..6ae28ae79d 100644
--- a/src/simulators/statevector/chunk/device_chunk_container.hpp
+++ b/src/simulators/statevector/chunk/device_chunk_container.hpp
@@ -34,6 +34,9 @@ namespace Chunk {
 // reserve 512MB of memory for Thrust internal use
 #define RESERVE_FOR_THRUST (1ull << 28)
 
+// max storage reserved for sampling measure
+#define AER_MAX_SAMPLING_SHOTS 1024
+
 //============================================================================
 // device chunk container class
 //============================================================================
@@ -112,7 +115,7 @@ class DeviceChunkContainer : public ChunkContainer<data_t> {
 
   uint_t Allocate(int idev, int chunk_bits, int num_qubits, uint_t chunks,
                   uint_t buffers, bool multi_shots, int matrix_bit,
-                  bool density_matrix) override;
+                  int max_shots, bool density_matrix) override;
   void Deallocate(void) override;
 
   void StoreMatrix(const std::vector<std::complex<double>> &mat,
@@ -121,9 +124,9 @@ class DeviceChunkContainer : public ChunkContainer<data_t> {
                    uint_t size) const override;
   void StoreUintParams(const std::vector<uint_t> &prm,
                        uint_t iChunk) const override;
-  void ResizeMatrixBuffers(int bits) override;
+  void ResizeMatrixBuffers(int bits, int max_shots) override;
 
-  void calculate_matrix_buffer_size(int bits);
+  void calculate_matrix_buffer_size(int bits, int shots);
 
   void set_device(void) const {
 #ifdef AER_THRUST_GPU
@@ -211,6 +214,8 @@ class DeviceChunkContainer : public ChunkContainer<data_t> {
   }
 
   void copy_to_probability_buffer(std::vector<double> &buf, int pos);
+  void copy_reduce_buffer(std::vector<double> &ret, uint_t iChunk,
+                          uint_t num_val) const override;
 
   void allocate_creg(uint_t num_mem, uint_t num_reg);
   int measured_cbit(uint_t iChunk, int qubit) {
@@ -288,7 +293,7 @@ class DeviceChunkContainer : public ChunkContainer<data_t> {
   }
   void request_creg_update(void) { creg_host_update_ = true; }
 
-  void synchronize(uint_t iChunk) {
+  void synchronize(uint_t iChunk) const {
 #ifdef AER_THRUST_GPU
     set_device();
     cudaStreamSynchronize(stream(iChunk));
@@ -315,7 +320,7 @@ template <typename data_t>
 uint_t DeviceChunkContainer<data_t>::Allocate(int idev, int chunk_bits,
                                               int num_qubits, uint_t chunks,
                                               uint_t buffers, bool multi_shots,
-                                              int matrix_bit,
+                                              int matrix_bit, int max_shots,
                                               bool density_matrix) {
   uint_t nc = chunks;
   uint_t i;
@@ -368,7 +373,7 @@ uint_t DeviceChunkContainer<data_t>::Allocate(int idev, int chunk_bits,
   matrix_buffer_size_ = 0;
   params_buffer_size_ = 0;
   max_blocked_gates_ = QV_MAX_BLOCKED_GATES;
-  calculate_matrix_buffer_size(matrix_bit);
+  calculate_matrix_buffer_size(matrix_bit, max_shots);
 
   reduce_buffer_size_ = 2;
 
@@ -403,7 +408,7 @@ uint_t DeviceChunkContainer<data_t>::Allocate(int idev, int chunk_bits,
   }
 
 #endif
-  ResizeMatrixBuffers(matrix_bit);
+  ResizeMatrixBuffers(matrix_bit, max_shots);
 
   this->num_chunks_ = nc;
   data_.resize((nc + buffers) << chunk_bits);
@@ -503,12 +508,23 @@ void DeviceChunkContainer<data_t>::Deallocate(void) {
 }
 
 template <typename data_t>
-void DeviceChunkContainer<data_t>::calculate_matrix_buffer_size(int bits) {
+void DeviceChunkContainer<data_t>::calculate_matrix_buffer_size(int bits,
+                                                                int shots) {
   uint_t size;
 
   // matrix buffer size
   this->matrix_bits_ = bits;
-  size = 1ull << (bits * 2);
+  // adjust matrix_bits_ so that all shots can be stored on GPU
+  if (shots > 1) {
+    if (shots > AER_MAX_SAMPLING_SHOTS)
+      shots = AER_MAX_SAMPLING_SHOTS;
+    uint_t b = this->matrix_bits_;
+    while ((1ull << (b * 2)) < shots) {
+      b++;
+    }
+    this->matrix_bits_ = b;
+  }
+  size = 1ull << (this->matrix_bits_ * 2);
 
   if (max_blocked_gates_ * 4 > size) {
     size = max_blocked_gates_ * 4;
@@ -528,15 +544,20 @@ void DeviceChunkContainer<data_t>::calculate_matrix_buffer_size(int bits) {
     size = QV_MAX_REGISTERS + max_blocked_gates_ * 4;
   }
   params_buffer_size_ = size;
+
+  if (shots > 1 && params_buffer_size_ < shots) {
+    params_buffer_size_ = shots;
+  }
 }
 
 template <typename data_t>
-void DeviceChunkContainer<data_t>::ResizeMatrixBuffers(int bits) {
+void DeviceChunkContainer<data_t>::ResizeMatrixBuffers(int bits,
+                                                       int max_shots) {
   uint_t size;
   uint_t n = num_matrices_ + this->num_buffers_;
 
   if (bits != this->matrix_bits_) {
-    calculate_matrix_buffer_size(bits);
+    calculate_matrix_buffer_size(bits, max_shots);
   }
 
   if (matrix_.size() < n * matrix_buffer_size_)
@@ -833,7 +854,7 @@ reg_t DeviceChunkContainer<data_t>::sample_measure(
 
   uint_t i, nshots, size;
   uint_t iBuf = 0;
-  if (multi_shots_) {
+  if (multi_shots_ && count == 1) {
     iBuf = iChunk;
     size = matrix_buffer_size_ * 2;
     if (size > params_buffer_size_)
@@ -1370,6 +1391,29 @@ void DeviceChunkContainer<data_t>::copy_to_probability_buffer(
 #endif
 }
 
+// Unpack num_val doubles per chunk (from chunk iChunk on) out of the reduce
+// buffer into ret; chunk count is ret.size() / num_val, not ret.size().
+template <typename data_t>
+void DeviceChunkContainer<data_t>::copy_reduce_buffer(std::vector<double> &ret,
+                                                      uint_t iChunk,
+                                                      uint_t num_val) const {
+  const uint_t count = (num_val > 0) ? ret.size() / num_val : 0;
+  std::vector<double> tmp(count * reduce_buffer_size_);
+#ifdef AER_THRUST_CUDA
+  set_device();
+  cudaMemcpyAsync(tmp.data(), reduce_buffer(iChunk),
+                  reduce_buffer_size_ * count * sizeof(double),
+                  cudaMemcpyDeviceToHost, stream(iChunk));
+  cudaStreamSynchronize(stream(iChunk));
+#else
+  thrust::copy_n(reduce_buffer_.begin() + iChunk * reduce_buffer_size_,
+                 count * reduce_buffer_size_, tmp.begin());
+#endif
+  for (uint_t i = 0; i < count; i++)
+    for (uint_t j = 0; j < num_val; j++)
+      ret[i * num_val + j] = tmp[i * reduce_buffer_size_ + j];
+}
+
 //------------------------------------------------------------------------------
 } // end namespace Chunk
 } // end namespace QV
diff --git a/src/simulators/statevector/chunk/host_chunk_container.hpp b/src/simulators/statevector/chunk/host_chunk_container.hpp
index 092c49490b..e901086d45 100644
--- a/src/simulators/statevector/chunk/host_chunk_container.hpp
+++ b/src/simulators/statevector/chunk/host_chunk_container.hpp
@@ -43,7 +43,7 @@ class HostChunkContainer : public ChunkContainer<data_t> {
 
   uint_t Allocate(int idev, int chunk_bits, int num_qubits, uint_t chunks,
                   uint_t buffers, bool multi_shots, int matrix_bit,
-                  bool density_matrix) override;
+                  int max_shots, bool density_matrix) override;
   void Deallocate(void) override;
 
   void StoreMatrix(const std::vector<std::complex<double>> &mat,
@@ -59,7 +59,7 @@ class HostChunkContainer : public ChunkContainer<data_t> {
                        uint_t iChunk) const override {
     params_[iChunk] = (uint_t *)&prm[0];
   }
-  void ResizeMatrixBuffers(int bits) {}
+  void ResizeMatrixBuffers(int bits, int max_shots) {}
 
   void Set(uint_t i, const thrust::complex<data_t> &t) override {
     data_[i] = t;
@@ -118,7 +118,7 @@ template <typename data_t>
 uint_t HostChunkContainer<data_t>::Allocate(int idev, int chunk_bits,
                                             int num_qubits, uint_t chunks,
                                             uint_t buffers, bool multi_shots,
-                                            int matrix_bit,
+                                            int matrix_bit, int max_shots,
                                             bool density_matrix) {
   uint_t nc = chunks;
   uint_t i;
diff --git a/src/simulators/statevector/chunk/thrust_kernels.hpp b/src/simulators/statevector/chunk/thrust_kernels.hpp
index 360181272f..70f9c36134 100644
--- a/src/simulators/statevector/chunk/thrust_kernels.hpp
+++ b/src/simulators/statevector/chunk/thrust_kernels.hpp
@@ -905,7 +905,7 @@ class MatrixMultNxN : public GateFuncWithCache<data_t> {
   __host__ __device__ void
   run_with_cache(uint_t _tid, uint_t _idx,
                  thrust::complex<data_t> *_cache) const {
-    uint_t j, threadID;
+    uint_t j;
     thrust::complex<data_t> q, r;
     thrust::complex<double> m;
     uint_t mat_size, irow;
@@ -1198,6 +1198,119 @@ class MatrixMult2x2Controlled : public GateFuncBase<data_t> {
   const char *name(void) { return "matrix_Cmult2x2"; }
 };
 
+// Batched 2x2 matrix multiplication: every group of nshots_per_mat chunks is
+// multiplied by its own matrix (4 entries each, read from matrix_). The last
+// entry of `qubits` is the target qubit; any entries before it are controls.
+template <typename data_t>
+class BatchedMatrixMult2x2 : public GateFuncBase<data_t> {
+protected:
+  uint_t matrix_begin_;         // subtracted from the per-chunk matrix index
+  uint_t num_shots_per_matrix_; // chunk index is divided by this
+  uint_t mask_;                 // bits below the target qubit
+  uint_t cmask_;                // OR of all control-qubit bits
+  uint_t offset_;               // 1 << target qubit
+  uint_t nqubits_;              // qubits.size()
+
+public:
+  BatchedMatrixMult2x2(const reg_t &qubits, uint_t imat,
+                       uint_t nshots_per_mat) {
+    matrix_begin_ = imat;
+    num_shots_per_matrix_ = nshots_per_mat;
+    nqubits_ = qubits.size();
+
+    const uint_t target_bit = 1ull << qubits[nqubits_ - 1];
+    offset_ = target_bit;
+    mask_ = target_bit - 1;
+
+    cmask_ = 0;
+    for (uint_t c = 0; c < nqubits_ - 1; c++)
+      cmask_ |= (1ull << qubits[c]);
+  }
+
+  // one target qubit; the remaining nqubits_ - 1 entries are controls
+  int qubits_count(void) { return 1; }
+  int num_control_bits(void) { return nqubits_ - 1; }
+
+  __host__ __device__ void operator()(const uint_t &i) const {
+    // open a zero bit at the target position of i
+    const uint_t low = i & mask_;
+    const uint_t idx = ((i - low) << 1) + low;
+
+    // nothing to do unless every control bit of the offset index is set
+    if (((idx + this->base_index_) & cmask_) != cmask_)
+      return;
+
+    thrust::complex<data_t> *vec0 = this->data_;
+    thrust::complex<data_t> *vec1 = vec0 + offset_;
+    const thrust::complex<data_t> q0 = vec0[idx];
+    const thrust::complex<data_t> q1 = vec1[idx];
+
+    const uint_t chunk = (this->base_index_ + i) >> this->chunk_bits_;
+    // matrix offset from the top of buffer
+    const uint_t i_mat = (chunk / num_shots_per_matrix_) - matrix_begin_;
+    const thrust::complex<double> *mat = this->matrix_ + i_mat * 4ull;
+
+    const thrust::complex<double> m0 = mat[0];
+    const thrust::complex<double> m1 = mat[1];
+    const thrust::complex<double> m2 = mat[2];
+    const thrust::complex<double> m3 = mat[3];
+
+    // vec0 / vec1 address the target-bit 0 / 1 halves of the amplitude pair
+    vec0[idx] = m0 * q0 + m2 * q1;
+    vec1[idx] = m1 * q0 + m3 * q1;
+  }
+
+  const char *name(void) { return "BatchedMatrixMult2x2"; }
+};
+
+// Batched dense matrix multiplication on nq qubits. Each call accumulates
+// one matrix row against the amplitudes in `_cache` and stores the result
+// at data_[_idx]; the matrix is the one selected for the caller's chunk.
+template <typename data_t>
+class BatchedMatrixMultNxN : public GateFuncWithCache<data_t> {
+protected:
+  // matrix index = (chunk index / num_shots_per_matrix_) - matrix_begin_
+  uint_t matrix_begin_;
+  uint_t num_shots_per_matrix_;
+
+public:
+  BatchedMatrixMultNxN(uint_t nq, uint_t imat, uint_t nshots_per_mat)
+      : GateFuncWithCache<data_t>(nq) {
+    num_shots_per_matrix_ = nshots_per_mat;
+    matrix_begin_ = imat;
+  }
+
+  __host__ __device__ void
+  run_with_cache(uint_t _tid, uint_t _idx,
+                 thrust::complex<data_t> *_cache) const {
+    const uint_t dim = 1ull << this->nqubits_;
+    const uint_t row = _tid & (dim - 1);
+
+    const uint_t chunk = (this->base_index_ + _tid) >> this->chunk_bits_;
+    // matrix offset from the top of buffer
+    const uint_t i_mat = (chunk / num_shots_per_matrix_) - matrix_begin_;
+    const thrust::complex<double> *mat = this->matrix_ + i_mat * dim * dim;
+
+    // cache slot holding column 0 for this row's group of dim entries
+    const uint_t first = (_tid & 1023) - row;
+
+    // entry (row, col) of the matrix is read from mat[row + dim * col]
+    thrust::complex<data_t> acc;
+    acc = 0.0;
+    for (uint_t col = 0; col < dim; col++) {
+      const thrust::complex<double> m = mat[row + dim * col];
+      const thrust::complex<data_t> q = _cache[first + col];
+      acc += m * q;
+    }
+
+    // write the accumulated value back
+    thrust::complex<data_t> *vec = this->data_;
+    vec[_idx] = acc;
+  }
+
+  const char *name(void) { return "BatchedMatrixMultNxN"; }
+};
+
 //------------------------------------------------------------------------------
 // Diagonal matrix multiplication
 //------------------------------------------------------------------------------
@@ -1350,7 +1463,7 @@ class DiagonalMult2x2Controlled : public GateFuncBase<data_t> {
     }
   }
 
-  int qubits_count(void) { return nqubits; }
+  int qubits_count(void) { return 1; }
   int num_control_bits(void) { return nqubits - 1; }
 
   bool is_diagonal(void) { return true; }
@@ -1378,6 +1491,116 @@ class DiagonalMult2x2Controlled : public GateFuncBase<data_t> {
   const char *name(void) { return "diagonal_Cmult2x2"; }
 };
 
+// Batched 2x2 diagonal multiplication. An amplitude is touched only when all
+// control bits of its offset index are set; it is then scaled by entry 0 or
+// 1 (target bit clear / set) of the diagonal selected for its chunk.
+// Diagonals are read 2 entries apart from matrix_. The last entry of
+// `qubits` is the target qubit; any entries before it are controls.
+template <typename data_t>
+class BatchedDiagonalMatrixMult2x2 : public GateFuncBase<data_t> {
+protected:
+  // matrix index = (chunk index / num_shots_per_matrix_) - matrix_begin_
+  uint_t matrix_begin_;
+  uint_t num_shots_per_matrix_;
+  uint_t mask_;   // target-qubit bit
+  uint_t cmask_;  // OR of all control-qubit bits
+  uint_t offset_; // never set or read by this functor
+  uint_t nqubits_;
+
+public:
+  BatchedDiagonalMatrixMult2x2(const reg_t &qubits, uint_t imat,
+                               uint_t nshots_per_mat) {
+    matrix_begin_ = imat;
+    num_shots_per_matrix_ = nshots_per_mat;
+    nqubits_ = qubits.size();
+
+    // target bit first, then the mask of all control bits
+    mask_ = (1ull << qubits[nqubits_ - 1]);
+    cmask_ = 0;
+    for (uint_t c = 0; c < nqubits_ - 1; c++)
+      cmask_ |= (1ull << qubits[c]);
+  }
+
+  // one target qubit; the remaining nqubits_ - 1 entries are controls
+  int qubits_count(void) { return 1; }
+  int num_control_bits(void) { return nqubits_ - 1; }
+  bool is_diagonal(void) { return true; }
+
+  __host__ __device__ void operator()(const uint_t &i) const {
+    // i offset by base_index_
+    const uint_t gi = i + this->base_index_;
+    if ((gi & cmask_) != cmask_)
+      return;
+
+    const uint_t chunk = gi >> this->chunk_bits_;
+    // matrix offset from the top of buffer
+    const uint_t i_mat = (chunk / num_shots_per_matrix_) - matrix_begin_;
+    const thrust::complex<double> *mat = this->matrix_ + i_mat * 2ull;
+
+    // choose the diagonal entry by the target bit, then scale in place
+    thrust::complex<data_t> *vec = this->data_;
+    const thrust::complex<data_t> q0 = vec[i];
+    const thrust::complex<double> m = (gi & mask_) ? mat[1] : mat[0];
+    vec[i] = m * q0;
+  }
+
+  const char *name(void) { return "BatchedDiagonalMatrixMult2x2"; }
+};
+
+// Batched multi-qubit diagonal multiplication. Each amplitude is scaled by
+// one entry of the diagonal selected for its chunk. Every diagonal holds
+// 2^nq entries, the diagonals are read back-to-back from matrix_, and the
+// qubit positions are read from params_.
+//   nq             : number of qubits each diagonal acts on
+//   imat           : value subtracted from the per-chunk matrix index
+template <typename data_t>
+class BatchedDiagonalMatrixMultNxN : public GateFuncBase<data_t> {
+protected:
+  uint_t matrix_begin_;
+  uint_t num_shots_per_matrix_;
+  uint_t nqubits_;
+
+public:
+  BatchedDiagonalMatrixMultNxN(const uint_t nq, uint_t imat,
+                               uint_t nshots_per_mat) {
+    nqubits_ = nq;
+    matrix_begin_ = imat;
+    num_shots_per_matrix_ = nshots_per_mat;
+  }
+
+  int qubits_count(void) { return nqubits_; }
+  int num_control_bits(void) { return 0; }
+  bool is_diagonal(void) { return true; }
+
+  __host__ __device__ void operator()(const uint_t &i) const {
+    const uint_t gid = this->base_index_;
+    const uint_t *qubits = this->params_;
+    thrust::complex<data_t> *vec = this->data_;
+
+    const uint_t iChunk = (i + gid) >> this->chunk_bits_;
+    // matrix offset from the top of buffer
+    const uint_t i_mat = (iChunk / num_shots_per_matrix_) - matrix_begin_;
+    // consecutive diagonals are 2^nqubits_ entries apart (im below ranges
+    // over that many values); a stride of 2 only fits the 1-qubit case
+    const thrust::complex<double> *mat = this->matrix_ + (i_mat << nqubits_);
+
+    // collect the bits of the offset index at each listed qubit into im
+    uint_t im = 0;
+    for (uint_t j = 0; j < nqubits_; j++) {
+      if ((((i + gid) >> qubits[j]) & 1) != 0) {
+        im += (1ull << j);
+      }
+    }
+
+    // scale the amplitude in place
+    const thrust::complex<data_t> q = vec[i];
+    const thrust::complex<double> m = mat[im];
+    vec[i] = m * q;
+  }
+
+  const char *name(void) { return "BatchedDiagonalMatrixMultNxN"; }
+};
+
 //------------------------------------------------------------------------------
 // Permutation
 //------------------------------------------------------------------------------
@@ -1797,6 +2020,7 @@ class norm_func : public GateFuncBase<data_t> {
 public:
   norm_func(void) {}
   bool is_diagonal(void) { return true; }
+  bool batch_enable(void) { return true; }
 
   __host__ __device__ double operator()(const uint_t &i) const {
     thrust::complex<data_t> q;
@@ -2107,7 +2331,7 @@ class expval_pauli_Z_func : public GateFuncBase<data_t> {
   expval_pauli_Z_func(uint_t z) { z_mask_ = z; }
 
   bool is_diagonal(void) { return true; }
-  bool batch_enable(void) { return false; }
+  bool batch_enable(void) { return true; }
 
   __host__ __device__ double operator()(const uint_t &i) const {
     thrust::complex<data_t> *vec;
@@ -2148,7 +2372,7 @@ class expval_pauli_XYZ_func : public GateFuncBase<data_t> {
     mask_u_ = ~((1ull << (x_max + 1)) - 1);
     mask_l_ = (1ull << x_max) - 1;
   }
-  bool batch_enable(void) { return false; }
+  bool batch_enable(void) { return true; }
 
   __host__ __device__ double operator()(const uint_t &i) const {
     thrust::complex<data_t> *vec;
@@ -2248,6 +2472,155 @@ class expval_pauli_inter_chunk_func : public GateFuncBase<data_t> {
   const char *name(void) { return "expval_pauli_inter_chunk"; }
 };
 
+// Per-amplitude functor returning |amp|^2 scaled by par.real() as the real
+// part; the imaginary part is |amp|^2 scaled by par.imag() when `var` is
+// set, and 0 otherwise.
+template <typename data_t>
+class batched_expval_I_func : public GateFuncBase<data_t> {
+protected:
+  bool variance_;
+  double param_;
+  double param_var_;
+
+public:
+  batched_expval_I_func(bool var, thrust::complex<double> par) {
+    variance_ = var;
+    param_ = par.real();
+    param_var_ = par.imag();
+  }
+  bool is_diagonal(void) { return true; }
+  bool batch_enable(void) { return true; }
+
+  __host__ __device__ thrust::complex<double>
+  operator()(const uint_t &i) const {
+    const thrust::complex<data_t> q = this->data_[i];
+    const double d = (double)(q.real() * q.real() + q.imag() * q.imag());
+
+    // start from zero: dv is returned even when variance_ is false, and
+    // leaving it uninitialized would hand back an indeterminate value
+    double dv = 0.0;
+    if (variance_)
+      dv = d * param_var_;
+    return thrust::complex<double>(d * param_, dv);
+  }
+  const char *name(void) { return "batched_expval_I_func"; }
+};
+
+// Per-amplitude functor for a Z-type Pauli term of a batched expectation
+// value: |amp|^2 with its sign flipped when the bits selected by z_mask_
+// have odd parity. The real part is scaled by par.real(); the imaginary
+// part is the same value scaled by par.imag() if `var` is set, else 0.
+template <typename data_t>
+class batched_expval_pauli_Z_func : public GateFuncBase<data_t> {
+protected:
+  uint_t z_mask_;
+  bool variance_;
+  double param_;
+  double param_var_;
+
+public:
+  batched_expval_pauli_Z_func(bool var, thrust::complex<double> par, uint_t z) {
+    variance_ = var;
+    param_ = par.real();
+    param_var_ = par.imag();
+    z_mask_ = z;
+  }
+
+  bool is_diagonal(void) { return true; }
+  bool batch_enable(void) { return true; }
+
+  __host__ __device__ thrust::complex<double>
+  operator()(const uint_t &i) const {
+    const thrust::complex<data_t> q0 = this->data_[i];
+    double d = q0.real() * q0.real() + q0.imag() * q0.imag();
+
+    // odd parity of the masked index bits flips the sign
+    if (z_mask_ != 0 && (pop_count_kernel(i & z_mask_) & 1))
+      d = -d;
+
+    // start from zero: dv is returned even when variance_ is false, and
+    // leaving it uninitialized would hand back an indeterminate value
+    double dv = 0.0;
+    if (variance_)
+      dv = d * param_var_;
+    d *= param_;
+    return thrust::complex<double>(d, dv);
+  }
+  const char *name(void) { return "batched_expval_pauli_Z_func"; }
+};
+
+// Functor for a Pauli term with X/Y components in a batched expectation
+// value. Index i is widened around bit x_max into idx0 and paired with
+// idx1 = idx0 ^ x_mask_. Real part: signed sum scaled by par.real();
+// imaginary part: the unscaled sum times par.imag() if `var`, else 0.
+template <typename data_t>
+class batched_expval_pauli_XYZ_func : public GateFuncBase<data_t> {
+protected:
+  uint_t x_mask_;
+  uint_t z_mask_;
+  uint_t mask_l_;
+  uint_t mask_u_;
+  thrust::complex<data_t> phase_;
+  bool variance_;
+  double param_;
+  double param_var_;
+
+public:
+  batched_expval_pauli_XYZ_func(bool var, thrust::complex<double> par, uint_t x,
+                                uint_t z, uint_t x_max,
+                                std::complex<data_t> p) {
+    variance_ = var;
+    param_ = par.real();
+    param_var_ = par.imag();
+
+    x_mask_ = x;
+    z_mask_ = z;
+    phase_ = p;
+
+    mask_u_ = ~((1ull << (x_max + 1)) - 1);
+    mask_l_ = (1ull << x_max) - 1;
+  }
+  bool batch_enable(void) { return true; }
+
+  __host__ __device__ thrust::complex<double>
+  operator()(const uint_t &i) const {
+    thrust::complex<data_t> *vec = this->data_;
+    // insert a zero bit at position x_max, then flip the x_mask_ bits
+    const uint_t idx0 = ((i << 1) & mask_u_) | (i & mask_l_);
+    const uint_t idx1 = idx0 ^ x_mask_;
+
+    const thrust::complex<data_t> q0 = vec[idx0];
+    const thrust::complex<data_t> q1 = vec[idx1];
+    const thrust::complex<data_t> q0p = q1 * phase_;
+    const thrust::complex<data_t> q1p = q0 * phase_;
+    const double d0 = q0.real() * q0p.real() + q0.imag() * q0p.imag();
+    const double d1 = q1.real() * q1p.real() + q1.imag() * q1p.imag();
+
+    double ret;
+    if (z_mask_ != 0) {
+      if (pop_count_kernel(idx0 & z_mask_) & 1)
+        ret = -d0;
+      else
+        ret = d0;
+      if (pop_count_kernel(idx1 & z_mask_) & 1)
+        ret -= d1;
+      else
+        ret += d1;
+    } else {
+      ret = d0 + d1;
+    }
+
+    // start from zero: ret_v is returned even when variance_ is false, and
+    // leaving it uninitialized would hand back an indeterminate value
+    double ret_v = 0.0;
+    if (variance_)
+      ret_v = ret * param_var_;
+    ret *= param_;
+    return thrust::complex<double>(ret, ret_v);
+  }
+  const char *name(void) { return "batched_expval_pauli_XYZ_func"; }
+};
+
 //------------------------------------------------------------------------------
 // Pauli application
 //------------------------------------------------------------------------------
diff --git a/src/simulators/statevector/indexes.hpp b/src/simulators/statevector/indexes.hpp
index c1f617f49f..f1b6122864 100644
--- a/src/simulators/statevector/indexes.hpp
+++ b/src/simulators/statevector/indexes.hpp
@@ -263,9 +263,15 @@ template <typename Lambda>
 inline void apply_lambda(const size_t start, const size_t stop,
                          const uint_t omp_threads, Lambda &&func) {
 
-#pragma omp parallel if (omp_threads > 1) num_threads(omp_threads)
-  {
+  if (omp_threads > 1) {
+#pragma omp parallel num_threads(omp_threads)
+    {
 #pragma omp for
+      for (int_t k = int_t(start); k < int_t(stop); k++) {
+        std::forward<Lambda>(func)(k);
+      }
+    }
+  } else {
     for (int_t k = int_t(start); k < int_t(stop); k++) {
       std::forward<Lambda>(func)(k);
     }
@@ -281,9 +287,15 @@ inline void apply_lambda(const size_t start, const size_t stop,
   const int_t END = stop >> NUM_QUBITS;
   auto qubits_sorted = qubits;
   std::sort(qubits_sorted.begin(), qubits_sorted.end());
-#pragma omp parallel if (omp_threads > 1) num_threads(omp_threads)
-  {
-#pragma omp for
+
+  if (omp_threads > 1) {
+#pragma omp parallel for num_threads(omp_threads)
+    for (int_t k = int_t(start); k < END; k++) {
+      // store entries touched by U
+      const auto inds = indexes(qubits, qubits_sorted, k);
+      std::forward<Lambda>(func)(inds);
+    }
+  } else {
     for (int_t k = int_t(start); k < END; k++) {
       // store entries touched by U
       const auto inds = indexes(qubits, qubits_sorted, k);
@@ -303,9 +315,16 @@ inline void apply_lambda(const size_t start, const size_t stop,
   auto qubits_sorted = qubits;
   std::sort(qubits_sorted.begin(), qubits_sorted.end());
 
-#pragma omp parallel if (omp_threads > 1) num_threads(omp_threads)
-  {
+  if (omp_threads > 1) {
+#pragma omp parallel num_threads(omp_threads)
+    {
 #pragma omp for
+      for (int_t k = int_t(start); k < END; k += gap) {
+        const auto inds = indexes(qubits, qubits_sorted, k);
+        std::forward<Lambda>(func)(inds, params);
+      }
+    }
+  } else {
     for (int_t k = int_t(start); k < END; k += gap) {
       const auto inds = indexes(qubits, qubits_sorted, k);
       std::forward<Lambda>(func)(inds, params);
@@ -331,13 +350,19 @@ apply_reduction_lambda(const size_t start, const size_t stop,
   // Reduction variables
   double val_re = 0.;
   double val_im = 0.;
-#pragma omp parallel reduction(+:val_re, val_im) if (omp_threads > 1) num_threads(omp_threads)
-  {
+  if (omp_threads > 1) {
+#pragma omp parallel reduction(+ : val_re, val_im) num_threads(omp_threads)
+    {
 #pragma omp for
+      for (int_t k = int_t(start); k < int_t(stop); k++) {
+        std::forward<Lambda>(func)(k, val_re, val_im);
+      }
+    } // end omp parallel
+  } else {
     for (int_t k = int_t(start); k < int_t(stop); k++) {
       std::forward<Lambda>(func)(k, val_re, val_im);
     }
-  } // end omp parallel
+  }
   return std::complex<double>(val_re, val_im);
 }
 
@@ -355,14 +380,21 @@ apply_reduction_lambda(const size_t start, const size_t stop,
   // Reduction variables
   double val_re = 0.;
   double val_im = 0.;
-#pragma omp parallel reduction(+:val_re, val_im) if (omp_threads > 1) num_threads(omp_threads)
-  {
+  if (omp_threads > 1) {
+#pragma omp parallel reduction(+ : val_re, val_im) num_threads(omp_threads)
+    {
 #pragma omp for
+      for (int_t k = int_t(start); k < END; k++) {
+        const auto inds = indexes(qubits, qubits_sorted, k);
+        std::forward<Lambda>(func)(inds, val_re, val_im);
+      }
+    } // end omp parallel
+  } else {
     for (int_t k = int_t(start); k < END; k++) {
       const auto inds = indexes(qubits, qubits_sorted, k);
       std::forward<Lambda>(func)(inds, val_re, val_im);
     }
-  } // end omp parallel
+  }
   return std::complex<double>(val_re, val_im);
 }
 
@@ -381,14 +413,21 @@ apply_reduction_lambda(const size_t start, const size_t stop,
   // Reduction variables
   double val_re = 0.;
   double val_im = 0.;
-#pragma omp parallel reduction(+:val_re, val_im) if (omp_threads > 1) num_threads(omp_threads)
-  {
+  if (omp_threads > 1) {
+#pragma omp parallel reduction(+ : val_re, val_im) num_threads(omp_threads)
+    {
 #pragma omp for
+      for (int_t k = int_t(start); k < END; k++) {
+        const auto inds = indexes(qubits, qubits_sorted, k);
+        std::forward<Lambda>(func)(inds, params, val_re, val_im);
+      }
+    } // end omp parallel
+  } else {
     for (int_t k = int_t(start); k < END; k++) {
       const auto inds = indexes(qubits, qubits_sorted, k);
       std::forward<Lambda>(func)(inds, params, val_re, val_im);
     }
-  } // end omp parallel
+  }
   return std::complex<double>(val_re, val_im);
 }
 
diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp
index a686899358..4039c7c5f3 100755
--- a/src/simulators/statevector/qubitvector.hpp
+++ b/src/simulators/statevector/qubitvector.hpp
@@ -144,9 +144,9 @@ class QubitVector {
                             const cvector_t<double> &state);
 
   // setup chunk
-  bool chunk_setup(int chunk_bits, int num_qubits, uint_t chunk_index,
-                   uint_t num_local_chunks);
-  bool chunk_setup(QubitVector<data_t> &base, const uint_t chunk_index);
+  uint_t chunk_setup(int chunk_bits, int num_qubits, uint_t chunk_index,
+                     uint_t num_local_chunks);
+  uint_t chunk_setup(QubitVector<data_t> &base, const uint_t chunk_index);
   uint_t chunk_index(void) { return chunk_index_; }
 
   // cache control for chunks on host
@@ -165,6 +165,7 @@ class QubitVector {
   void release_recv_buffer(void) const;
 
   void set_max_matrix_bits(int_t bits) {}
+  void set_max_sampling_shots(int_t shots) {}
 
   void synchronize(void) {}
 
@@ -348,6 +349,14 @@ class QubitVector {
   void apply_batched_kraus(const reg_t &qubits,
                            const std::vector<cmatrix_t> &kmats,
                            std::vector<RngEngine> &rng) {}
+  // apply matrices to each chunk in a batch
+  void apply_batched_matrix(const reg_t &qubits, const cvector_t<double> &mat,
+                            const uint_t num_matrices,
+                            const uint_t num_shots_per_matrix) {}
+  void apply_batched_diagonal_matrix(const reg_t &qubits,
+                                     const cvector_t<double> &mat,
+                                     const uint_t num_matrices,
+                                     const uint_t num_shots_per_matrix) {}
 
   //-----------------------------------------------------------------------
   // Norms
@@ -401,6 +410,10 @@ class QubitVector {
                       const uint_t z_count, const uint_t z_count_pair,
                       const complex_t initial_phase = 1.0) const;
 
+  void batched_expval_pauli(std::vector<double> &val, const reg_t &qubits,
+                            const std::string &pauli, bool variance,
+                            std::complex<double> param, bool last,
+                            const complex_t initial_phase = 1.0) const {}
   //-----------------------------------------------------------------------
   // JSON configuration settings
   //-----------------------------------------------------------------------
@@ -1026,18 +1039,18 @@ std::complex<double> QubitVector<data_t>::inner_product() const {
 
 // setup chunk
 template <typename data_t>
-bool QubitVector<data_t>::chunk_setup(int chunk_bits, int num_qubits,
-                                      uint_t chunk_index,
-                                      uint_t num_local_chunks) {
+uint_t QubitVector<data_t>::chunk_setup(int chunk_bits, int num_qubits,
+                                        uint_t chunk_index,
+                                        uint_t num_local_chunks) {
   chunk_index_ = chunk_index;
-  return true;
+  return num_local_chunks;
 }
 
 template <typename data_t>
-bool QubitVector<data_t>::chunk_setup(QubitVector<data_t> &base,
-                                      const uint_t chunk_index) {
+uint_t QubitVector<data_t>::chunk_setup(QubitVector<data_t> &base,
+                                        const uint_t chunk_index) {
   chunk_index_ = chunk_index;
-  return true;
+  return 0;
 }
 
 // prepare buffer for MPI send/recv
diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp
index 31431ae8bf..da96761fc0 100644
--- a/src/simulators/statevector/qubitvector_thrust.hpp
+++ b/src/simulators/statevector/qubitvector_thrust.hpp
@@ -142,10 +142,10 @@ class QubitVectorThrust {
                             const cvector_t<double> &state);
 
   // chunk setup
-  bool chunk_setup(int chunk_bits, int num_qubits, uint_t chunk_index,
-                   uint_t num_local_chunks);
-  bool chunk_setup(const QubitVectorThrust<data_t> &base,
-                   const uint_t chunk_index);
+  uint_t chunk_setup(int chunk_bits, int num_qubits, uint_t chunk_index,
+                     uint_t num_local_chunks);
+  uint_t chunk_setup(const QubitVectorThrust<data_t> &base,
+                     const uint_t chunk_index);
   uint_t chunk_index(void) { return chunk_index_; }
 
   // cache control for chunks on host
@@ -164,6 +164,7 @@ class QubitVectorThrust {
   void release_recv_buffer(void) const;
 
   void set_max_matrix_bits(int_t bits);
+  void set_max_sampling_shots(int_t shots);
 
   void synchronize(void) { chunk_.synchronize(); }
 
@@ -373,6 +374,15 @@ class QubitVectorThrust {
                                    const std::vector<cmatrix_t> &kmats,
                                    std::vector<RngEngine> &rng);
 
+  // apply matrices to each chunk in a batch
+  void apply_batched_matrix(const reg_t &qubits, const cvector_t<double> &mat,
+                            const uint_t num_matrices,
+                            const uint_t num_shots_per_matrix);
+  void apply_batched_diagonal_matrix(const reg_t &qubits,
+                                     const cvector_t<double> &mat,
+                                     const uint_t num_matrices,
+                                     const uint_t num_shots_per_matrix);
+
   //-----------------------------------------------------------------------
   // Norms
   //-----------------------------------------------------------------------
@@ -409,6 +419,10 @@ class QubitVectorThrust {
                       const uint_t z_count, const uint_t z_count_pair,
                       const complex_t initial_phase = 1.0) const;
 
+  void batched_expval_pauli(std::vector<double> &val, const reg_t &qubits,
+                            const std::string &pauli, bool variance,
+                            std::complex<double> param, bool last,
+                            const complex_t initial_phase = 1.0) const;
   //-----------------------------------------------------------------------
   // JSON configuration settings
   //-----------------------------------------------------------------------
@@ -482,6 +496,7 @@ class QubitVectorThrust {
   uint_t num_cmem_bits_ = 0;
 
   int_t max_matrix_bits_ = 0;
+  int_t max_sampling_shots_ = 0;
 
   //-----------------------------------------------------------------------
   // Config settings
@@ -684,10 +699,8 @@ void QubitVectorThrust<data_t>::copy_qv(const QubitVectorThrust<data_t> &obj) {
   num_threads_per_group_ = obj.num_threads_per_group_;
   max_matrix_bits_ = obj.max_matrix_bits_;
 
-  if (!chunk_setup(obj, obj.chunk_index_)) {
-    throw std::runtime_error(
-        "QubitVectorThrust: can not allocate chunk for copy");
-  }
+  chunk_setup(obj, obj.chunk_index_);
+
   set_num_qubits(obj.num_qubits());
 
   chunk_.set_device();
@@ -844,9 +857,10 @@ void QubitVectorThrust<data_t>::zero() {
 }
 
 template <typename data_t>
-bool QubitVectorThrust<data_t>::chunk_setup(int chunk_bits, int num_qubits,
-                                            uint_t chunk_index,
-                                            uint_t num_local_chunks) {
+uint_t QubitVectorThrust<data_t>::chunk_setup(int chunk_bits, int num_qubits,
+                                              uint_t chunk_index,
+                                              uint_t num_local_chunks) {
+  uint_t num_chunks_allocated = 0;
   // set global chunk ID / shot ID
   chunk_index_ = chunk_index;
 
@@ -860,7 +874,7 @@ bool QubitVectorThrust<data_t>::chunk_setup(int chunk_bits, int num_qubits,
         chunk_manager_->num_qubits() == num_qubits) {
       bool mapped = chunk_manager_->MapChunk(chunk_, 0);
       chunk_.set_chunk_index(chunk_index_);
-      return mapped;
+      return num_local_chunks;
     }
     chunk_manager_.reset();
   }
@@ -870,10 +884,10 @@ bool QubitVectorThrust<data_t>::chunk_setup(int chunk_bits, int num_qubits,
     chunk_manager_ = std::make_shared<Chunk::ChunkManager<data_t>>();
     chunk_manager_->set_num_threads_per_group(num_threads_per_group_);
     chunk_manager_->set_num_creg_bits(num_creg_bits_ + num_cmem_bits_);
-    chunk_manager_->Allocate(chunk_bits, num_qubits, num_local_chunks,
-                             chunk_index_, max_matrix_bits_,
-                             is_density_matrix(), target_gpus_,
-                             cuStateVec_enable_);
+    num_chunks_allocated = chunk_manager_->Allocate(
+        chunk_bits, num_qubits, num_local_chunks, chunk_index_,
+        max_matrix_bits_, max_sampling_shots_, is_density_matrix(),
+        target_gpus_, cuStateVec_enable_);
   }
 
   multi_chunk_distribution_ = false;
@@ -892,12 +906,13 @@ bool QubitVectorThrust<data_t>::chunk_setup(int chunk_bits, int num_qubits,
   bool mapped = chunk_manager_->MapChunk(chunk_, 0);
   chunk_.set_chunk_index(chunk_index_);
 
-  return mapped;
+  return num_chunks_allocated;
 }
 
 template <typename data_t>
-bool QubitVectorThrust<data_t>::chunk_setup(
-    const QubitVectorThrust<data_t> &base, const uint_t chunk_index) {
+uint_t
+QubitVectorThrust<data_t>::chunk_setup(const QubitVectorThrust<data_t> &base,
+                                       const uint_t chunk_index) {
   multi_chunk_distribution_ = base.multi_chunk_distribution_;
   cuStateVec_enable_ = base.cuStateVec_enable_;
   target_gpus_ = base.target_gpus_;
@@ -912,14 +927,14 @@ bool QubitVectorThrust<data_t>::chunk_setup(
   recv_chunk_.unmap();
 
   if (chunk_.is_mapped()) {
-    return true;
+    return 0;
   }
 
   // mapping/setting chunk
   chunk_manager_ = base.chunk_manager_;
   bool mapped = chunk_manager_->MapChunk(chunk_, 0);
 
-  return mapped;
+  return 0;
 }
 
 template <typename data_t>
@@ -928,6 +943,12 @@ void QubitVectorThrust<data_t>::set_max_matrix_bits(int_t bits) {
     max_matrix_bits_ = bits;
   }
 }
+
+template <typename data_t>
+void QubitVectorThrust<data_t>::set_max_sampling_shots(int_t shots) {
+  max_sampling_shots_ = shots; // later passed to chunk_manager_->Allocate
+}
+
 template <typename data_t>
 void QubitVectorThrust<data_t>::set_num_qubits(size_t num_qubits) {
   num_qubits_ = num_qubits;
@@ -1168,9 +1189,9 @@ template <typename data_t>
 bool QubitVectorThrust<data_t>::enable_batch(bool flg) const {
   bool prev = enable_batch_;
 
-  //  if(flg != prev){
-  //    chunk_.synchronize();
-  //  }
+  if (flg != prev) {
+    chunk_.synchronize();
+  }
   enable_batch_ = flg;
 
   return prev;
@@ -1347,7 +1368,7 @@ void QubitVectorThrust<data_t>::apply_function(Function func,
   chunk_.Execute(func, chunk_count);
 
 #ifdef AER_DEBUG
-  DebugMsg(func.name(), chunk_count);
+  DebugMsg(func.name(), (int)chunk_count);
   DebugDump();
 #endif
 }
@@ -1362,8 +1383,8 @@ void QubitVectorThrust<data_t>::apply_function(
     if (!cuStateVec_enable_ && func.batch_enable() &&
         ((multi_chunk_distribution_ && chunk_.device() >= 0) ||
          enable_batch_)) {
-      if (chunk_.pos() ==
-          0) // only first chunk on device calculates all the chunks
+      // only first chunk on device calculates all the chunks
+      if (chunk_.pos() == 0)
         chunk_count = chunk_.container()->num_chunks();
       else
         return;
@@ -1378,7 +1399,7 @@ void QubitVectorThrust<data_t>::apply_function(
   chunk_.Execute(func, chunk_count);
 
 #ifdef AER_DEBUG
-  DebugMsg(func.name(), chunk_count);
+  DebugMsg(func.name(), (int)chunk_count);
   DebugDump();
 #endif
 }
@@ -1554,6 +1575,42 @@ void QubitVectorThrust<data_t>::apply_permutation_matrix(
   chunk_.apply_permutation(qubits, pairs, count);
 }
 
+// Applies num_matrices matrices (stored together in `mat`) to the batch.
+template <typename data_t>
+void QubitVectorThrust<data_t>::apply_batched_matrix(
+    const reg_t &qubits, const cvector_t<double> &mat,
+    const uint_t num_matrices, const uint_t num_shots_per_matrix) {
+  const uint_t count = get_chunk_count();
+  // nothing to apply; the second test also protects the division below
+  if (count == 0 || num_matrices == 0)
+    return;
+  // element count of one matrix
+  const uint_t matrix_size = mat.size() / num_matrices;
+  // unless it is 4^n for n qubits, all qubits but one count as control bits
+  const uint_t num_control_bits =
+      ((1ull << (qubits.size() * 2)) != matrix_size) ? qubits.size() - 1 : 0;
+  chunk_.apply_batched_matrix(qubits, num_control_bits, mat,
+                              num_shots_per_matrix, count);
+}
+
+// Applies num_matrices diagonal matrices (stored together in `mat`).
+template <typename data_t>
+void QubitVectorThrust<data_t>::apply_batched_diagonal_matrix(
+    const reg_t &qubits, const cvector_t<double> &mat,
+    const uint_t num_matrices, const uint_t num_shots_per_matrix) {
+  const uint_t count = get_chunk_count();
+  // nothing to apply; the second test also protects the division below
+  if (count == 0 || num_matrices == 0)
+    return;
+  // element count of one diagonal
+  const uint_t matrix_size = mat.size() / num_matrices;
+  // unless it is 2^n for n qubits, all qubits but one count as control bits
+  const uint_t num_control_bits =
+      ((1ull << qubits.size()) != matrix_size) ? qubits.size() - 1 : 0;
+  chunk_.apply_batched_diagonal_matrix(qubits, num_control_bits, mat,
+                                       num_shots_per_matrix, count);
+}
+
 /*******************************************************************************
  *
  * APPLY OPTIMIZED GATES
@@ -2624,6 +2681,39 @@ QubitVectorThrust<data_t>::expval_pauli(const reg_t &qubits,
   return ret;
 }
 
+// Batched Pauli expectation value. Forwards the request for `count` chunks
+// to chunk_ and, once `last` is set, copies the reduce buffer into `val`.
+// With batching enabled only the chunk at position 0 does any work.
+template <typename data_t>
+void QubitVectorThrust<data_t>::batched_expval_pauli(
+    std::vector<double> &val, const reg_t &qubits, const std::string &pauli,
+    bool variance, std::complex<double> param, bool last,
+    const complex_t initial_phase) const {
+  // entries per chunk in `val`: two with variance, one otherwise
+  const int num_val = variance ? 2 : 1;
+  uint_t count = 1;
+  if (enable_batch_) {
+    if (chunk_.pos() != 0) {
+      return; // the first chunk executes all chunks of the batch
+    }
+    count = chunk_.container()->num_chunks();
+  }
+
+  // an empty `val` marks the first call: size it and pass init = true on
+  const bool init = val.empty();
+  if (init) {
+    val.resize(count * num_val);
+  }
+
+  chunk_.batched_expval_pauli(count, qubits, pauli, variance, param, init,
+                              initial_phase);
+  if (last) {
+    // copy the reduce buffer into the caller's vector
+    chunk_.copy_reduce_buffer(val, num_val);
+    chunk_.synchronize();
+  }
+}
+
 template <typename data_t>
 double QubitVectorThrust<data_t>::expval_pauli(
     const reg_t &qubits, const std::string &pauli,
diff --git a/src/simulators/statevector/statevector_executor.hpp b/src/simulators/statevector/statevector_executor.hpp
index 6c2071bcea..6cd6877211 100644
--- a/src/simulators/statevector/statevector_executor.hpp
+++ b/src/simulators/statevector/statevector_executor.hpp
@@ -30,6 +30,8 @@ namespace AER {
 
 namespace Statevector {
 
+using ResultItr = std::vector<ExperimentResult>::iterator;
+
 //-------------------------------------------------------------------------
 // Executor for statevector
 //-------------------------------------------------------------------------
@@ -48,8 +50,6 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
 protected:
   void set_config(const Config &config) override;
 
-  void apply_global_phase() override;
-
   bool shot_branching_supported(void) override { return true; }
 
   // apply parallel operations
@@ -59,11 +59,11 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
   // apply op to multiple shots , return flase if op is not supported to execute
   // in a batch
   bool apply_batched_op(const int_t istate, const Operations::Op &op,
-                        ExperimentResult &result, std::vector<RngEngine> &rng,
+                        ResultItr result, std::vector<RngEngine> &rng,
                         bool final_op = false) override;
 
   bool apply_branching_op(CircuitExecutor::Branch &root,
-                          const Operations::Op &op, ExperimentResult &result,
+                          const Operations::Op &op, ResultItr result,
                           bool final_op) override;
 
   // Initializes an n-qubit state to the all |0> state
@@ -72,9 +72,13 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
   auto move_to_vector(void);
   auto copy_to_vector(void);
 
+  void run_circuit_with_sampling(Circuit &circ, const Config &config,
+                                 RngEngine &init_rng,
+                                 ResultItr result) override;
+
   void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
                          const Config &config, RngEngine &init_rng,
-                         ExperimentResult &result, bool sample_noise) override;
+                         ResultItr result_it, bool sample_noise) override;
 
   bool allocate_states(uint_t num_states, const Config &config) override {
     return BasePar::allocate_states(num_states, config);
@@ -140,15 +144,18 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
   void apply_save_amplitudes(const Operations::Op &op,
                              ExperimentResult &result);
 
+  // Helper functions for shot-branching
+  void apply_save_density_matrix(CircuitExecutor::Branch &root,
+                                 const Operations::Op &op, ResultItr result);
+  void apply_save_probs(CircuitExecutor::Branch &root, const Operations::Op &op,
+                        ResultItr result);
   void apply_save_statevector(CircuitExecutor::Branch &root,
-                              const Operations::Op &op,
-                              ExperimentResult &result, bool last_op);
+                              const Operations::Op &op, ResultItr result,
+                              bool last_op);
   void apply_save_statevector_dict(CircuitExecutor::Branch &root,
-                                   const Operations::Op &op,
-                                   ExperimentResult &result);
+                                   const Operations::Op &op, ResultItr result);
   void apply_save_amplitudes(CircuitExecutor::Branch &root,
-                             const Operations::Op &op,
-                             ExperimentResult &result);
+                             const Operations::Op &op, ResultItr result);
 
   // Helper function for computing expectation value
   double expval_pauli(const reg_t &qubits, const std::string &pauli) override;
@@ -206,36 +213,30 @@ void Executor<state_t>::set_config(const Config &config) {
 }
 
 template <class state_t>
-void Executor<state_t>::apply_global_phase() {
-  if (Base::has_global_phase_) {
-    int_t i;
-    if (Base::shot_omp_parallel_ && Base::num_groups_ > 1) {
-#pragma omp parallel for
-      for (int_t ig = 0; ig < Base::num_groups_; ig++) {
-        for (int_t iChunk = Base::top_state_of_group_[ig];
-             iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
-          Base::states_[iChunk].apply_diagonal_matrix(
-              {0}, {Base::global_phase_, Base::global_phase_});
-      }
-    } else {
-      for (i = 0; i < Base::states_.size(); i++)
-        Base::states_[i].apply_diagonal_matrix(
-            {0}, {Base::global_phase_, Base::global_phase_});
-    }
+void Executor<state_t>::run_circuit_with_sampling(Circuit &circ,
+                                                  const Config &config,
+                                                  RngEngine &init_rng,
+                                                  ResultItr result_it) {
+  Noise::NoiseModel dummy_noise; // default model: none is passed in here
+  if (BasePar::multiple_chunk_required(config, circ, dummy_noise)) {
+    return BasePar::run_circuit_with_sampling(circ, config, init_rng,
+                                              result_it);
+  } else { // otherwise defer to the BaseBatch implementation
+    return BaseBatch::run_circuit_with_sampling(circ, config, init_rng,
+                                                result_it);
   }
 }
 
 template <class state_t>
 void Executor<state_t>::run_circuit_shots(
     Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
-    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
-  state_t dummy_state;
+    RngEngine &init_rng, ResultItr result_it, bool sample_noise) {
   if (BasePar::multiple_chunk_required(config, circ, noise)) {
-    return BasePar::run_circuit_shots(circ, noise, config, init_rng, result,
+    return BasePar::run_circuit_shots(circ, noise, config, init_rng, result_it,
                                       sample_noise);
   } else {
-    return BaseBatch::run_circuit_shots(circ, noise, config, init_rng, result,
-                                        sample_noise);
+    return BaseBatch::run_circuit_shots(circ, noise, config, init_rng,
+                                        result_it, sample_noise);
   }
 }
 
@@ -299,13 +300,26 @@ bool Executor<state_t>::apply_parallel_op(const Operations::Op &op,
 template <class state_t>
 bool Executor<state_t>::apply_batched_op(const int_t istate,
                                          const Operations::Op &op,
-                                         ExperimentResult &result,
+                                         ResultItr result,
                                          std::vector<RngEngine> &rng,
                                          bool final_op) {
   if (op.conditional) {
     Base::states_[istate].qreg().set_conditional(op.conditional_reg);
   }
 
+  // parameterization
+  if (op.has_bind_params) {
+    if (op.type == Operations::OpType::diagonal_matrix)
+      Base::states_[istate].qreg().apply_batched_diagonal_matrix(
+          op.qubits, op.params, Base::num_bind_params_,
+          Base::num_shots_per_bind_param_);
+    else
+      Base::states_[istate].qreg().apply_batched_matrix(
+          op.qubits, op.params, Base::num_bind_params_,
+          Base::num_shots_per_bind_param_);
+    return true;
+  }
+
   switch (op.type) {
   case Operations::OpType::barrier:
   case Operations::OpType::nop:
@@ -345,6 +359,10 @@ bool Executor<state_t>::apply_batched_op(const int_t istate,
   case Operations::OpType::kraus:
     Base::states_[istate].qreg().apply_batched_kraus(op.qubits, op.mats, rng);
     break;
+  case Operations::OpType::save_expval:
+  case Operations::OpType::save_expval_var:
+    BaseBatch::apply_batched_expval(istate, op, result);
+    break;
   case Operations::OpType::sim_op:
     if (op.name == "begin_register_blocking") {
       Base::states_[istate].qreg().enter_register_blocking(op.qubits);
@@ -367,8 +385,7 @@ bool Executor<state_t>::apply_batched_op(const int_t istate,
 template <class state_t>
 bool Executor<state_t>::apply_branching_op(CircuitExecutor::Branch &root,
                                            const Operations::Op &op,
-                                           ExperimentResult &result,
-                                           bool final_op) {
+                                           ResultItr result, bool final_op) {
   RngEngine dummy;
   if (Base::states_[root.state_index()].creg().check_conditional(op)) {
     switch (op.type) {
@@ -388,11 +405,14 @@ bool Executor<state_t>::apply_branching_op(CircuitExecutor::Branch &root,
     // save ops
     case Operations::OpType::save_expval:
     case Operations::OpType::save_expval_var:
+      Base::apply_save_expval(root, op, result);
+      break;
     case Operations::OpType::save_densmat:
+      apply_save_density_matrix(root, op, result);
+      break;
     case Operations::OpType::save_probs:
     case Operations::OpType::save_probs_ket:
-      // call save functions in state class
-      Base::states_[root.state_index()].apply_op(op, result, dummy, final_op);
+      apply_save_probs(root, op, result);
       break;
     case Operations::OpType::save_state:
     case Operations::OpType::save_statevec:
@@ -428,6 +448,7 @@ void Executor<state_t>::initialize_qreg(uint_t num_qubits) {
         if (Base::global_state_index_ + iChunk == 0 ||
             this->num_qubits_ == this->chunk_bits_) {
           Base::states_[iChunk].qreg().initialize();
+          Base::states_[iChunk].apply_global_phase();
         } else {
           Base::states_[iChunk].qreg().zero();
         }
@@ -438,13 +459,12 @@ void Executor<state_t>::initialize_qreg(uint_t num_qubits) {
       if (Base::global_state_index_ + i == 0 ||
           this->num_qubits_ == this->chunk_bits_) {
         Base::states_[i].qreg().initialize();
+        Base::states_[i].apply_global_phase();
       } else {
         Base::states_[i].qreg().zero();
       }
     }
   }
-
-  BasePar::apply_global_phase();
 }
 
 template <class state_t>
@@ -1685,11 +1705,74 @@ void Executor<state_t>::apply_kraus(CircuitExecutor::Branch &root,
   }
 }
 
+// Saves the reduced density matrix of the branch state once for every bind
+// parameter that has a shot in this branch.
+template <class state_t>
+void Executor<state_t>::apply_save_density_matrix(CircuitExecutor::Branch &root,
+                                                  const Operations::Op &op,
+                                                  ResultItr result) {
+  auto &state = Base::states_[root.state_index()];
+  cmatrix_t rho;
+  if (!op.qubits.empty()) {
+    rho = state.density_matrix(op.qubits);
+  } else {
+    // tracing over all qubits: 1x1 matrix holding the norm
+    rho = cmatrix_t(1, 1);
+    rho[0] = state.qreg().norm();
+  }
+  // marks the parameter results that have already been written
+  std::vector<bool> saved(Base::num_bind_params_, false);
+  for (int_t shot = 0; shot < root.num_shots(); shot++) {
+    const uint_t ip = root.param_index(shot);
+    if (saved[ip])
+      continue;
+    (result + ip)
+        ->save_data_average(state.creg(), op.string_params[0], rho, op.type,
+                            op.save_type);
+    saved[ip] = true;
+  }
+}
+
+// Saves the probabilities of op.qubits for the branch state, either directly
+// (save_probs) or converted with Utils::vec2ket (save_probs_ket). Every bind
+// parameter that has a shot in this branch receives the data exactly once.
+template <class state_t>
+void Executor<state_t>::apply_save_probs(CircuitExecutor::Branch &root,
+                                         const Operations::Op &op,
+                                         ResultItr result) {
+  auto &state = Base::states_[root.state_index()];
+  // probabilities are computed once and shared by all parameters
+  auto probs = state.qreg().probabilities(op.qubits);
+  const bool as_ket = (op.type == Operations::OpType::save_probs_ket);
+
+  // marks the parameter results that have already been written
+  std::vector<bool> saved(Base::num_bind_params_, false);
+  for (int_t shot = 0; shot < root.num_shots(); shot++) {
+    const uint_t ip = root.param_index(shot);
+    if (saved[ip])
+      continue;
+
+    if (as_ket) {
+      // ket dictionary; 16 is presumably the key base (hexadecimal)
+      (result + ip)
+          ->save_data_average(
+              state.creg(), op.string_params[0],
+              Utils::vec2ket(probs, Base::json_chop_threshold_, 16), op.type,
+              op.save_type);
+    } else {
+      // raw probabilities as returned by the register
+      (result + ip)
+          ->save_data_average(state.creg(), op.string_params[0], probs,
+                              op.type, op.save_type);
+    }
+    saved[ip] = true;
+  }
+}
+
 template <class state_t>
 void Executor<state_t>::apply_save_statevector(CircuitExecutor::Branch &root,
                                                const Operations::Op &op,
-                                               ExperimentResult &result,
-                                               bool last_op) {
+                                               ResultItr result, bool last_op) {
   if (op.qubits.size() != Base::num_qubits_) {
     throw std::invalid_argument(op.name +
                                 " was not applied to all qubits."
@@ -1701,22 +1784,25 @@ void Executor<state_t>::apply_save_statevector(CircuitExecutor::Branch &root,
   if (last_op) {
     const auto v = Base::states_[root.state_index()].move_to_vector();
     for (int_t i = 0; i < root.num_shots(); i++) {
-      result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v,
-                               OpType::save_statevec, op.save_type);
+      uint_t ip = root.param_index(i);
+      (result + ip)
+          ->save_data_pershot(Base::states_[root.state_index()].creg(), key, v,
+                              OpType::save_statevec, op.save_type);
     }
   } else {
     const auto v = Base::states_[root.state_index()].copy_to_vector();
     for (int_t i = 0; i < root.num_shots(); i++) {
-      result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v,
-                               OpType::save_statevec, op.save_type);
+      uint_t ip = root.param_index(i);
+      (result + ip)
+          ->save_data_pershot(Base::states_[root.state_index()].creg(), key, v,
+                              OpType::save_statevec, op.save_type);
     }
   }
 }
 
 template <class state_t>
 void Executor<state_t>::apply_save_statevector_dict(
-    CircuitExecutor::Branch &root, const Operations::Op &op,
-    ExperimentResult &result) {
+    CircuitExecutor::Branch &root, const Operations::Op &op, ResultItr result) {
   if (op.qubits.size() != Base::num_qubits_) {
     throw std::invalid_argument(op.name +
                                 " was not applied to all qubits."
@@ -1729,17 +1815,19 @@ void Executor<state_t>::apply_save_statevector_dict(
     result_state_ket[it.first] = it.second;
   }
   for (int_t i = 0; i < root.num_shots(); i++) {
-    result.save_data_pershot(
-        Base::states_[root.state_index()].creg(), op.string_params[0],
-        (const std::map<std::string, complex_t> &)result_state_ket, op.type,
-        op.save_type);
+    uint_t ip = root.param_index(i);
+    (result + ip)
+        ->save_data_pershot(
+            Base::states_[root.state_index()].creg(), op.string_params[0],
+            (const std::map<std::string, complex_t> &)result_state_ket, op.type,
+            op.save_type);
   }
 }
 
 template <class state_t>
 void Executor<state_t>::apply_save_amplitudes(CircuitExecutor::Branch &root,
                                               const Operations::Op &op,
-                                              ExperimentResult &result) {
+                                              ResultItr result) {
   if (op.int_params.empty()) {
     throw std::invalid_argument(
         "Invalid save_amplitudes instructions (empty params).");
@@ -1752,9 +1840,11 @@ void Executor<state_t>::apply_save_amplitudes(CircuitExecutor::Branch &root,
           Base::states_[root.state_index()].qreg().get_state(op.int_params[i]);
     }
     for (int_t i = 0; i < root.num_shots(); i++) {
-      result.save_data_pershot(
-          Base::states_[root.state_index()].creg(), op.string_params[0],
-          (const Vector<complex_t> &)amps, op.type, op.save_type);
+      uint_t ip = root.param_index(i);
+      (result + ip)
+          ->save_data_pershot(
+              Base::states_[root.state_index()].creg(), op.string_params[0],
+              (const Vector<complex_t> &)amps, op.type, op.save_type);
     }
   } else {
     rvector_t amps_sq(size, 0);
@@ -1762,9 +1852,17 @@ void Executor<state_t>::apply_save_amplitudes(CircuitExecutor::Branch &root,
       amps_sq[i] = Base::states_[root.state_index()].qreg().probability(
           op.int_params[i]);
     }
-    result.save_data_average(Base::states_[root.state_index()].creg(),
-                             op.string_params[0], amps_sq, op.type,
-                             op.save_type);
+    std::vector<bool> copied(Base::num_bind_params_, false);
+    for (int_t i = 0; i < root.num_shots(); i++) {
+      uint_t ip = root.param_index(i);
+      if (!copied[ip]) {
+        (result + ip)
+            ->save_data_average(Base::states_[root.state_index()].creg(),
+                                op.string_params[0], amps_sq, op.type,
+                                op.save_type);
+        copied[ip] = true;
+      }
+    }
   }
 }
 
diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp
index 6746cd897a..0922705ade 100755
--- a/src/simulators/statevector/statevector_state.hpp
+++ b/src/simulators/statevector/statevector_state.hpp
@@ -156,6 +156,9 @@ class State : public QuantumState::State<statevec_t> {
   virtual std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
                                             RngEngine &rng) override;
 
+  // Helper function for computing expectation value
+  virtual double expval_pauli(const reg_t &qubits,
+                              const std::string &pauli) override;
   //-----------------------------------------------------------------------
   // Additional methods
   //-----------------------------------------------------------------------
@@ -222,6 +225,9 @@ class State : public QuantumState::State<statevec_t> {
   // Return the reduced density matrix for the simulator
   cmatrix_t density_matrix(const reg_t &qubits);
 
+  // Apply the global phase
+  void apply_global_phase();
+
 protected:
   //-----------------------------------------------------------------------
   // Save data instructions
@@ -249,9 +255,6 @@ class State : public QuantumState::State<statevec_t> {
   void apply_save_amplitudes(const Operations::Op &op,
                              ExperimentResult &result);
 
-  // Helper function for computing expectation value
-  virtual double expval_pauli(const reg_t &qubits,
-                              const std::string &pauli) override;
   //-----------------------------------------------------------------------
   // Measurement Helpers
   //-----------------------------------------------------------------------
@@ -303,9 +306,6 @@ class State : public QuantumState::State<statevec_t> {
   // Config Settings
   //-----------------------------------------------------------------------
 
-  // Apply the global phase
-  void apply_global_phase();
-
   // OpenMP qubit threshold
   int omp_qubit_threshold_ = 14;
 
@@ -438,6 +438,8 @@ bool State<statevec_t>::allocate(uint_t num_qubits, uint_t block_bits,
                                  uint_t num_parallel_shots) {
   if (BaseState::max_matrix_qubits_ > 0)
     BaseState::qreg_.set_max_matrix_bits(BaseState::max_matrix_qubits_);
+  if (BaseState::max_sampling_shots_ > 0)
+    BaseState::qreg_.set_max_sampling_shots(BaseState::max_sampling_shots_);
 
   BaseState::qreg_.set_target_gpus(BaseState::target_gpus_);
   BaseState::qreg_.chunk_setup(block_bits, num_qubits, 0, 1);
diff --git a/src/simulators/tensor_network/tensor_net_executor.hpp b/src/simulators/tensor_network/tensor_net_executor.hpp
index 74be04051e..102fb22c2f 100644
--- a/src/simulators/tensor_network/tensor_net_executor.hpp
+++ b/src/simulators/tensor_network/tensor_net_executor.hpp
@@ -29,6 +29,8 @@ namespace AER {
 
 namespace TensorNetwork {
 
+using ResultItr = std::vector<ExperimentResult>::iterator;
+
 //-------------------------------------------------------------------------
 // Batched-shots executor for statevector
 //-------------------------------------------------------------------------
@@ -47,7 +49,7 @@ class Executor : public CircuitExecutor::MultiStateExecutor<state_t> {
   bool shot_branching_supported(void) override { return true; }
 
   bool apply_branching_op(CircuitExecutor::Branch &root,
-                          const Operations::Op &op, ExperimentResult &result,
+                          const Operations::Op &op, ResultItr result,
                           bool final_op) override;
 
   rvector_t sample_measure_with_prob(CircuitExecutor::Branch &root,
@@ -68,15 +70,18 @@ class Executor : public CircuitExecutor::MultiStateExecutor<state_t> {
                                     uint_t shots,
                                     std::vector<RngEngine> &rng) const override;
 
+  // Helper functions for shot-branching
+  void apply_save_density_matrix(CircuitExecutor::Branch &root,
+                                 const Operations::Op &op, ResultItr result);
+  void apply_save_probs(CircuitExecutor::Branch &root, const Operations::Op &op,
+                        ResultItr result);
   void apply_save_statevector(CircuitExecutor::Branch &root,
-                              const Operations::Op &op,
-                              ExperimentResult &result, bool last_op);
+                              const Operations::Op &op, ResultItr result,
+                              bool last_op);
   void apply_save_statevector_dict(CircuitExecutor::Branch &root,
-                                   const Operations::Op &op,
-                                   ExperimentResult &result);
+                                   const Operations::Op &op, ResultItr result);
   void apply_save_amplitudes(CircuitExecutor::Branch &root,
-                             const Operations::Op &op,
-                             ExperimentResult &result);
+                             const Operations::Op &op, ResultItr result);
 };
 
 template <class state_t>
@@ -87,8 +92,7 @@ void Executor<state_t>::set_config(const Config &config) {
 template <class state_t>
 bool Executor<state_t>::apply_branching_op(CircuitExecutor::Branch &root,
                                            const Operations::Op &op,
-                                           ExperimentResult &result,
-                                           bool final_op) {
+                                           ResultItr result, bool final_op) {
   RngEngine dummy;
   if (Base::states_[root.state_index()].creg().check_conditional(op)) {
     switch (op.type) {
@@ -108,11 +112,14 @@ bool Executor<state_t>::apply_branching_op(CircuitExecutor::Branch &root,
       break;
     case OpType::save_expval:
     case OpType::save_expval_var:
+      Base::apply_save_expval(root, op, result);
+      break;
     case OpType::save_densmat:
+      apply_save_density_matrix(root, op, result);
+      break;
     case OpType::save_probs:
     case OpType::save_probs_ket:
-      // call save functions in state class
-      Base::states_[root.state_index()].apply_op(op, result, dummy, final_op);
+      apply_save_probs(root, op, result);
       break;
     case OpType::save_state:
     case OpType::save_statevec:
@@ -346,11 +353,76 @@ void Executor<state_t>::apply_kraus(CircuitExecutor::Branch &root,
   }
 }
 
+// Saves the reduced density matrix of the branch state once for every bind
+// parameter that has a shot in this branch. The matrix is computed a single
+// time and reused for each of those parameters.
+template <class state_t>
+void Executor<state_t>::apply_save_density_matrix(CircuitExecutor::Branch &root,
+                                                  const Operations::Op &op,
+                                                  ResultItr result) {
+  auto &state = Base::states_[root.state_index()];
+  cmatrix_t rho;
+  if (!op.qubits.empty()) {
+    rho = state.qreg().reduced_density_matrix(op.qubits);
+  } else {
+    // tracing over all qubits: 1x1 matrix holding the norm
+    rho = cmatrix_t(1, 1);
+    rho[0] = state.qreg().norm();
+  }
+
+  // marks the parameter results that have already been written
+  std::vector<bool> saved(Base::num_bind_params_, false);
+  for (int_t shot = 0; shot < root.num_shots(); shot++) {
+    const uint_t ip = root.param_index(shot);
+    if (saved[ip])
+      continue;
+    (result + ip)
+        ->save_data_average(state.creg(), op.string_params[0], rho, op.type,
+                            op.save_type);
+    saved[ip] = true;
+  }
+}
+
+// Saves the probabilities of op.qubits for the branch state, either directly
+// (save_probs) or converted with Utils::vec2ket (save_probs_ket). Every bind
+// parameter that has a shot in this branch receives the data exactly once.
+template <class state_t>
+void Executor<state_t>::apply_save_probs(CircuitExecutor::Branch &root,
+                                         const Operations::Op &op,
+                                         ResultItr result) {
+  auto &state = Base::states_[root.state_index()];
+  // probabilities are computed once and shared by all parameters
+  auto probs = state.qreg().probabilities(op.qubits);
+  const bool as_ket = (op.type == Operations::OpType::save_probs_ket);
+
+  // marks the parameter results that have already been written
+  std::vector<bool> saved(Base::num_bind_params_, false);
+  for (int_t shot = 0; shot < root.num_shots(); shot++) {
+    const uint_t ip = root.param_index(shot);
+    if (saved[ip])
+      continue;
+
+    if (as_ket) {
+      // ket dictionary; 16 is presumably the key base (hexadecimal)
+      (result + ip)
+          ->save_data_average(
+              state.creg(), op.string_params[0],
+              Utils::vec2ket(probs, Base::json_chop_threshold_, 16), op.type,
+              op.save_type);
+    } else {
+      // raw probabilities as returned by the register
+      (result + ip)
+          ->save_data_average(state.creg(), op.string_params[0], probs,
+                              op.type, op.save_type);
+    }
+    saved[ip] = true;
+  }
+}
+
 template <class state_t>
 void Executor<state_t>::apply_save_statevector(CircuitExecutor::Branch &root,
                                                const Operations::Op &op,
-                                               ExperimentResult &result,
-                                               bool last_op) {
+                                               ResultItr result, bool last_op) {
   if (op.qubits.size() != Base::num_qubits_) {
     throw std::invalid_argument(op.name +
                                 " was not applied to all qubits."
@@ -362,22 +434,25 @@ void Executor<state_t>::apply_save_statevector(CircuitExecutor::Branch &root,
   if (last_op) {
     const auto v = Base::states_[root.state_index()].move_to_vector();
     for (int_t i = 0; i < root.num_shots(); i++) {
-      result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v,
-                               OpType::save_statevec, op.save_type);
+      uint_t ip = root.param_index(i);
+      (result + ip)
+          ->save_data_pershot(Base::states_[root.state_index()].creg(), key, v,
+                              OpType::save_statevec, op.save_type);
     }
   } else {
     const auto v = Base::states_[root.state_index()].copy_to_vector();
     for (int_t i = 0; i < root.num_shots(); i++) {
-      result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v,
-                               OpType::save_statevec, op.save_type);
+      uint_t ip = root.param_index(i);
+      (result + ip)
+          ->save_data_pershot(Base::states_[root.state_index()].creg(), key, v,
+                              OpType::save_statevec, op.save_type);
     }
   }
 }
 
 template <class state_t>
 void Executor<state_t>::apply_save_statevector_dict(
-    CircuitExecutor::Branch &root, const Operations::Op &op,
-    ExperimentResult &result) {
+    CircuitExecutor::Branch &root, const Operations::Op &op, ResultItr result) {
   if (op.qubits.size() != Base::num_qubits_) {
     throw std::invalid_argument(op.name +
                                 " was not applied to all qubits."
@@ -390,17 +465,19 @@ void Executor<state_t>::apply_save_statevector_dict(
     result_state_ket[it.first] = it.second;
   }
   for (int_t i = 0; i < root.num_shots(); i++) {
-    result.save_data_pershot(
-        Base::states_[root.state_index()].creg(), op.string_params[0],
-        (const std::map<std::string, complex_t> &)result_state_ket, op.type,
-        op.save_type);
+    uint_t ip = root.param_index(i);
+    (result + ip)
+        ->save_data_pershot(
+            Base::states_[root.state_index()].creg(), op.string_params[0],
+            (const std::map<std::string, complex_t> &)result_state_ket, op.type,
+            op.save_type);
   }
 }
 
 template <class state_t>
 void Executor<state_t>::apply_save_amplitudes(CircuitExecutor::Branch &root,
                                               const Operations::Op &op,
-                                              ExperimentResult &result) {
+                                              ResultItr result) {
   if (op.int_params.empty()) {
     throw std::invalid_argument(
         "Invalid save_amplitudes instructions (empty params).");
@@ -413,9 +490,11 @@ void Executor<state_t>::apply_save_amplitudes(CircuitExecutor::Branch &root,
           Base::states_[root.state_index()].qreg().get_state(op.int_params[i]);
     }
     for (int_t i = 0; i < root.num_shots(); i++) {
-      result.save_data_pershot(
-          Base::states_[root.state_index()].creg(), op.string_params[0],
-          (const Vector<complex_t> &)amps, op.type, op.save_type);
+      uint_t ip = root.param_index(i);
+      (result + ip)
+          ->save_data_pershot(
+              Base::states_[root.state_index()].creg(), op.string_params[0],
+              (const Vector<complex_t> &)amps, op.type, op.save_type);
     }
   } else {
     rvector_t amps_sq(size, 0);
@@ -423,9 +502,17 @@ void Executor<state_t>::apply_save_amplitudes(CircuitExecutor::Branch &root,
       amps_sq[i] = Base::states_[root.state_index()].qreg().probability(
           op.int_params[i]);
     }
-    result.save_data_average(Base::states_[root.state_index()].creg(),
-                             op.string_params[0], amps_sq, op.type,
-                             op.save_type);
+    std::vector<bool> copied(Base::num_bind_params_, false);
+    for (int_t i = 0; i < root.num_shots(); i++) {
+      uint_t ip = root.param_index(i);
+      if (!copied[ip]) {
+        (result + ip)
+            ->save_data_average(Base::states_[root.state_index()].creg(),
+                                op.string_params[0], amps_sq, op.type,
+                                op.save_type);
+        copied[ip] = true;
+      }
+    }
   }
 }
 
diff --git a/src/simulators/tensor_network/tensor_net_state.hpp b/src/simulators/tensor_network/tensor_net_state.hpp
index a1004a2312..f302e8a470 100644
--- a/src/simulators/tensor_network/tensor_net_state.hpp
+++ b/src/simulators/tensor_network/tensor_net_state.hpp
@@ -154,6 +154,10 @@ class State : public QuantumState::State<tensor_net_t> {
 
   void initialize_from_vector(const cvector_t<double> &params);
 
+  // Helper function for computing expectation value
+  virtual double expval_pauli(const reg_t &qubits,
+                              const std::string &pauli) override;
+
   //-----------------------------------------------------------------------
   // Additional methods
   //-----------------------------------------------------------------------
@@ -248,9 +252,6 @@ class State : public QuantumState::State<tensor_net_t> {
   void apply_save_amplitudes(const Operations::Op &op,
                              ExperimentResult &result);
 
-  // Helper function for computing expectation value
-  virtual double expval_pauli(const reg_t &qubits,
-                              const std::string &pauli) override;
   //-----------------------------------------------------------------------
   // Measurement Helpers
   //-----------------------------------------------------------------------
diff --git a/src/simulators/unitary/unitary_executor.hpp b/src/simulators/unitary/unitary_executor.hpp
index 240d806870..3066e0d619 100644
--- a/src/simulators/unitary/unitary_executor.hpp
+++ b/src/simulators/unitary/unitary_executor.hpp
@@ -99,9 +99,10 @@ void Executor<state_t>::initialize_qreg(uint_t num_qubits) {
                ((Base::num_qubits_ - Base::chunk_bits_));
         icol = (Base::global_state_index_ + iChunk) -
                (irow << ((Base::num_qubits_ - Base::chunk_bits_)));
-        if (irow == icol)
+        if (irow == icol) {
           Base::states_[iChunk].qreg().initialize();
-        else
+          Base::states_[iChunk].apply_global_phase();
+        } else
           Base::states_[iChunk].qreg().zero();
       }
     }
@@ -112,14 +113,13 @@ void Executor<state_t>::initialize_qreg(uint_t num_qubits) {
              ((Base::num_qubits_ - Base::chunk_bits_));
       icol = (Base::global_state_index_ + iChunk) -
              (irow << ((Base::num_qubits_ - Base::chunk_bits_)));
-      if (irow == icol)
+      if (irow == icol) {
         Base::states_[iChunk].qreg().initialize();
-      else
+        Base::states_[iChunk].apply_global_phase();
+      } else
         Base::states_[iChunk].qreg().zero();
     }
   }
-
-  Base::apply_global_phase();
 }
 
 template <class state_t>
diff --git a/src/simulators/unitary/unitary_state.hpp b/src/simulators/unitary/unitary_state.hpp
index e7352b84c2..8f33e43ce1 100755
--- a/src/simulators/unitary/unitary_state.hpp
+++ b/src/simulators/unitary/unitary_state.hpp
@@ -136,6 +136,9 @@ class State : public virtual QuantumState::State<unitary_matrix_t> {
   auto move_to_matrix();
   auto copy_to_matrix();
 
+  // Apply the global phase
+  void apply_global_phase();
+
 protected:
   //-----------------------------------------------------------------------
   // Apply Instructions
@@ -189,9 +192,6 @@ class State : public virtual QuantumState::State<unitary_matrix_t> {
   // Config Settings
   //-----------------------------------------------------------------------
 
-  // Apply the global phase
-  void apply_global_phase();
-
   // OpenMP qubit threshold
   int omp_qubit_threshold_ = 6;
 
diff --git a/src/transpile/batch_converter.hpp b/src/transpile/batch_converter.hpp
new file mode 100644
index 0000000000..40e1b65537
--- /dev/null
+++ b/src/transpile/batch_converter.hpp
@@ -0,0 +1,247 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+/*
+This transpiler converts a circuit into a form suitable for the batched-shots
+executor on GPU. It is called after gate fusion, because parameterized gates
+may already have been fused and converted to matrix operations there.
+
+This transpiler stores matrices in the Operations::Op.params array (cvector_t
+format) rather than in Operations::Op.mats, for efficient transfer to GPU
+memory. Matrices of Operations::OpType::matrix ops are moved to Op.params too.
+
+The GPU simulator supports matrix multiplication with control qubits, but the
+CPU simulator does not, so there is an option to build matrices that include
+the control qubits for CPU.
+*/
+
+#ifndef _aer_batche_converter_hpp_
+#define _aer_batche_converter_hpp_
+
+#include "framework/config.hpp"
+#include "framework/utils.hpp"
+#include "transpile/circuitopt.hpp"
+
+namespace AER {
+namespace Transpile {
+
+enum class ParamGates { // selects the switch case in gate_to_matrix()
+  rxx,                  // NOTE(review): parameter2matrix.hpp declares an
+  ryy,                  // identical AER::Transpile::ParamGates under another
+  rzz,                  // include guard; including both headers in one
+  rzx,                  // translation unit would redefine it - confirm usage
+  mcr,
+  mcrx,
+  mcry,
+  mcrz,
+  mcp,
+  mcu2,
+  mcu3,
+  mcu,
+};
+
+class BatchConverter : public CircuitOptimization {
+public:
+  BatchConverter() {}
+  ~BatchConverter() {}
+  // Rewrites ops flagged has_bind_params and may prepend a global-phase op
+  void optimize_circuit(Circuit &circ, Noise::NoiseModel &noise,
+                        const opset_t &allowed_opset,
+                        ExperimentResult &result) const override;
+  // Forwards config to CircuitOptimization::set_config
+  void set_config(const Config &config) override;
+  // Stores flg in include_control_qubits_in_matrix_
+  void include_control_qubits(bool flg) {
+    include_control_qubits_in_matrix_ = flg;
+  }
+
+protected:
+  void gate_to_matrix(Operations::Op &op, uint_t num_params) const;
+  // NOTE(review): set by include_control_qubits(), never read in this header
+  bool include_control_qubits_in_matrix_ = false;
+
+  // Gate name -> ParamGates member; names not listed are left unconverted
+  const static stringmap_t<ParamGates> gateset_;
+};
+
+const stringmap_t<ParamGates> BatchConverter::gateset_( // name -> gate family
+    {{"p", ParamGates::mcp},       {"r", ParamGates::mcr},
+     {"rx", ParamGates::mcrx},     {"ry", ParamGates::mcry},
+     {"rz", ParamGates::mcrz},     {"u1", ParamGates::mcp},
+     {"u2", ParamGates::mcu2},     {"u3", ParamGates::mcu3},
+     {"u", ParamGates::mcu3},      {"U", ParamGates::mcu3},
+     {"cp", ParamGates::mcp},      {"cu1", ParamGates::mcp},
+     {"cu2", ParamGates::mcu2},    {"cu3", ParamGates::mcu3},
+     {"cu", ParamGates::mcu},      {"cp", ParamGates::mcp}, // NOTE(review): cp is listed twice
+     {"rxx", ParamGates::rxx},     {"ryy", ParamGates::ryy},
+     {"rzz", ParamGates::rzz},     {"rzx", ParamGates::rzx},
+     {"mcr", ParamGates::mcr},     {"mcrx", ParamGates::mcrx},
+     {"mcry", ParamGates::mcry},   {"mcrz", ParamGates::mcrz},
+     {"mcphase", ParamGates::mcp}, {"mcp", ParamGates::mcp},
+     {"mcu1", ParamGates::mcp},    {"mcu2", ParamGates::mcu2},
+     {"mcu3", ParamGates::mcu3},   {"mcu", ParamGates::mcu}});
+
+void BatchConverter::set_config(const Config &config) {
+  CircuitOptimization::set_config(config); // base-class handling only
+}
+
+void BatchConverter::optimize_circuit(Circuit &circ, Noise::NoiseModel &noise,
+                                      const opset_t &allowed_opset,
+                                      ExperimentResult &result) const {
+  // pack ops flagged has_bind_params (noise/allowed_opset/result are unused)
+  for (int_t i = 0; i < circ.ops.size(); i++) {
+    if (circ.ops[i].has_bind_params) {
+      if (circ.ops[i].type == Operations::OpType::gate) {
+        gate_to_matrix(circ.ops[i], circ.num_bind_params);
+      } else if (circ.ops[i].type == Operations::OpType::matrix) {
+        // flatten mats[j] of every parameter set j into params, back to back
+        uint_t matrix_size = circ.ops[i].mats[0].size(); // needs mats[0]
+        circ.ops[i].params.resize(matrix_size * circ.num_bind_params);
+        for (int_t j = 0; j < circ.num_bind_params; j++) {
+          for (int_t k = 0; k < matrix_size; k++)
+            circ.ops[i].params[j * matrix_size + k] = circ.ops[i].mats[j][k];
+        }
+        circ.ops[i].mats.clear(); // the data now lives in params only
+      }
+    }
+  }
+
+  // global phase becomes a diagonal_matrix op holding (t, t), t = exp(i*phase)
+  if (circ.global_phase_for_params.size() == circ.num_bind_params) {
+    bool has_global_phase = false;
+    for (int_t j = 0; j < circ.num_bind_params; j++) {
+      if (!Linalg::almost_equal(circ.global_phase_for_params[j], 0.0)) {
+        has_global_phase = true;
+        break;
+      }
+    }
+    if (has_global_phase) {
+      // one (t, t) pair per parameter set; the op is inserted at the front
+      Operations::Op phase_op; // NOTE(review): qubits never set - confirm
+      phase_op.type = Operations::OpType::diagonal_matrix;
+      phase_op.has_bind_params = true;
+      phase_op.params.resize(2 * circ.num_bind_params);
+      for (int_t j = 0; j < circ.num_bind_params; j++) {
+        auto t = std::exp(complex_t(0.0, circ.global_phase_for_params[j]));
+        phase_op.params[j * 2] = t;
+        phase_op.params[j * 2 + 1] = t;
+      }
+      circ.ops.insert(circ.ops.begin(), phase_op);
+    }
+  } else { // sizes differ: fall back to the single circ.global_phase_angle
+    if (!Linalg::almost_equal(circ.global_phase_angle, 0.0)) {
+      Operations::Op phase_op; // NOTE(review): qubits never set - confirm
+      phase_op.type = Operations::OpType::diagonal_matrix;
+      phase_op.params.resize(2);
+      auto t = std::exp(complex_t(0.0, circ.global_phase_angle));
+      phase_op.params[0] = t;
+      phase_op.params[1] = t;
+      circ.ops.insert(circ.ops.begin(), phase_op);
+    }
+  }
+
+  circ.set_params(); // presumably recomputes circuit metadata - TODO confirm
+}
+
+void BatchConverter::gate_to_matrix(Operations::Op &op,
+                                    uint_t num_params) const {
+  auto it = gateset_.find(op.name);
+  if (it == gateset_.end()) // unknown gate name: op is left untouched
+    return;
+
+  uint_t matrix_size; // entries copied per parameter set by store_matrix
+  if (it->second == ParamGates::mcrz || it->second == ParamGates::rzz ||
+      it->second == ParamGates::mcp) {
+    matrix_size = 2ull; // NOTE(review): rzz is two-qubit; is 2 enough?
+    op.type = Operations::OpType::diagonal_matrix;
+  } else {
+    matrix_size = 4ull; // NOTE(review): also used for rxx/ryy/rzx - confirm
+    op.type = Operations::OpType::matrix;
+  }
+  cvector_t matrix_array(num_params * matrix_size);
+  // store_matrix(p, m): copy m[0..matrix_size) into the slot of parameter p
+  auto store_matrix = [&matrix_array, matrix_size](int_t iparam,
+                                                   cvector_t mat) {
+    for (int_t j = 0; j < matrix_size; j++)
+      matrix_array[iparam * matrix_size + j] = mat[j];
+  };
+  // each case reads the angles of parameter set i from op.params
+  switch (it->second) {
+  case ParamGates::mcr: // NOTE(review): no std::real() here, unlike below
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i,
+                   Linalg::VMatrix::r(op.params[i * 2], op.params[i * 2 + 1]));
+    break;
+  case ParamGates::mcrx:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::rx(std::real(op.params[i])));
+    break;
+  case ParamGates::mcry:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::ry(std::real(op.params[i])));
+    break;
+  case ParamGates::mcrz:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::rz_diag(std::real(op.params[i])));
+    break;
+  case ParamGates::rxx:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::rxx(std::real(op.params[i])));
+    break;
+  case ParamGates::ryy:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::ryy(std::real(op.params[i])));
+    break;
+  case ParamGates::rzz:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::rzz_diag(std::real(op.params[i])));
+    break;
+  case ParamGates::rzx:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::rzx(std::real(op.params[i])));
+    break;
+  case ParamGates::mcu3: // three angles per parameter set
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::u3(std::real(op.params[i * 3]),
+                                          std::real(op.params[i * 3 + 1]),
+                                          std::real(op.params[i * 3 + 2])));
+    break;
+  case ParamGates::mcu: // four angles per parameter set
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::u4(std::real(op.params[i * 4]),
+                                          std::real(op.params[i * 4 + 1]),
+                                          std::real(op.params[i * 4 + 2]),
+                                          std::real(op.params[i * 4 + 3])));
+    break;
+  case ParamGates::mcu2: // two angles per parameter set
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::u2(std::real(op.params[i * 2]),
+                                          std::real(op.params[i * 2 + 1])));
+    break;
+  case ParamGates::mcp:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::phase_diag(std::real(op.params[i])));
+    break;
+  default:
+    break;
+  }
+
+  op.params = matrix_array; // angles are replaced by the packed matrices
+}
+
+//-------------------------------------------------------------------------
+} // end namespace Transpile
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/transpile/fusion.hpp b/src/transpile/fusion.hpp
index a3a1c8b59d..23a48a6d5b 100644
--- a/src/transpile/fusion.hpp
+++ b/src/transpile/fusion.hpp
@@ -54,21 +54,58 @@ class FusionMethod {
       for (size_t i = 0; i < op.qubits.size(); i++)
         op.qubits[i] = orig2remapped[op.qubits[i]];
 
-    auto fusioned_op = generate_operation_internal(fusioned_ops, arg_qubits);
+    op_t fusioned_op;
+    if (num_params_ == 0) {
+      fusioned_op = generate_operation_internal(fusioned_ops, arg_qubits);
+      if (diagonal) {
+        std::vector<complex_t> vec;
+        vec.assign((1UL << fusioned_op.qubits.size()), 0);
+        for (size_t i = 0; i < vec.size(); ++i)
+          vec[i] = fusioned_op.mats[0](i, i);
+        fusioned_op = Operations::make_diagonal(
+            fusioned_op.qubits, std::move(vec), std::string("fusion"));
+      }
+    } else {
+      // loop for runtime parameter binding
+      for (int_t p = 0; p < num_params_; p++) {
+        std::vector<op_t> ops;
+        ops.reserve(fusioned_ops.size());
+        for (auto &op : fusioned_ops) {
+          if (op.has_bind_params)
+            ops.push_back(bind_parameter(op, p, num_params_));
+          else
+            ops.push_back(op);
+        }
+        auto new_op = generate_operation_internal(ops, arg_qubits);
+
+        if (diagonal) {
+          std::vector<complex_t> vec;
+          vec.assign((1UL << new_op.qubits.size()), 0);
+          for (size_t i = 0; i < vec.size(); ++i)
+            vec[i] = new_op.mats[0](i, i);
+          new_op = Operations::make_diagonal(new_op.qubits, std::move(vec),
+                                             std::string("fusion"));
+        }
+
+        if (p == 0)
+          fusioned_op = new_op;
+        else {
+          fusioned_op.has_bind_params = true;
+          if (fusioned_op.type == Operations::OpType::diagonal_matrix)
+            fusioned_op.params.insert(fusioned_op.params.end(),
+                                      new_op.params.begin(),
+                                      new_op.params.end());
+          else
+            fusioned_op.mats.insert(fusioned_op.mats.end(), new_op.mats.begin(),
+                                    new_op.mats.end());
+        }
+      }
+    }
 
     // Revert qubits
     for (size_t i = 0; i < fusioned_op.qubits.size(); i++)
       fusioned_op.qubits[i] = remapped2orig[fusioned_op.qubits[i]];
 
-    if (diagonal) {
-      std::vector<complex_t> vec;
-      vec.assign((1UL << fusioned_op.qubits.size()), 0);
-      for (size_t i = 0; i < vec.size(); ++i)
-        vec[i] = fusioned_op.mats[0](i, i);
-      fusioned_op = Operations::make_diagonal(
-          fusioned_op.qubits, std::move(vec), std::string("fusion"));
-    }
-
     return fusioned_op;
   };
 
@@ -101,8 +138,11 @@ class FusionMethod {
     return false;
   };
 
+  void set_num_params(uint_t n) { num_params_ = n; }
+
 private:
   const static Operations::OpSet noise_opset_;
+  uint_t num_params_ = 1;
 };
 
 const Operations::OpSet FusionMethod::noise_opset_({Operations::OpType::kraus,
@@ -837,6 +877,8 @@ void Fusion::optimize_circuit(Circuit &circ, Noise::NoiseModel &noise,
                                                    allow_superop, allow_kraus);
   result.metadata.add(method.name(), "fusion", "method");
 
+  method.set_num_params(circ.num_bind_params);
+
   bool applied = false;
   for (const std::shared_ptr<Fuser> &fuser : fusers) {
     fuser->set_metadata(result);
diff --git a/src/transpile/parameter2matrix.hpp b/src/transpile/parameter2matrix.hpp
new file mode 100644
index 0000000000..a5bae36e26
--- /dev/null
+++ b/src/transpile/parameter2matrix.hpp
@@ -0,0 +1,215 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+/*
+This transpiler converts parameterized gates into matrix operations.
+It is called after gate fusion, because parameterized gates may already
+have been fused and converted to matrix operations there.
+
+This transpiler stores matrices in the Operations::Op.params array (cvector_t
+format) rather than in Operations::Op.mats, for efficient transfer to GPU
+memory. Matrices of Operations::OpType::matrix ops are moved to Op.params too.
+
+The GPU simulator supports matrix multiplication with control qubits, but the
+CPU simulator does not, so there is an option to build matrices that include
+the control qubits for CPU.
+*/
+
+#ifndef _aer_parameter2matrix_hpp_
+#define _aer_parameter2matrix_hpp_
+
+#include "framework/config.hpp"
+#include "framework/utils.hpp"
+#include "transpile/circuitopt.hpp"
+
+namespace AER {
+namespace Transpile {
+
+enum class ParamGates { // selects the switch case in gate_to_matrix()
+  rxx,                  // NOTE(review): batch_converter.hpp declares an
+  ryy,                  // identical AER::Transpile::ParamGates under another
+  rzz,                  // include guard; including both headers in one
+  rzx,                  // translation unit would redefine it - confirm usage
+  mcr,
+  mcrx,
+  mcry,
+  mcrz,
+  mcp,
+  mcu2,
+  mcu3,
+  mcu,
+};
+
+class Parameter2Matrix : public CircuitOptimization {
+public:
+  Parameter2Matrix() {}
+  ~Parameter2Matrix() {}
+  // Rewrites ops flagged has_bind_params when num_bind_params exceeds 1
+  void optimize_circuit(Circuit &circ, Noise::NoiseModel &noise,
+                        const opset_t &allowed_opset,
+                        ExperimentResult &result) const override;
+  // Forwards config to CircuitOptimization::set_config
+  void set_config(const Config &config) override;
+  // Stores flg in include_control_qubits_in_matrix_
+  void include_control_qubits(bool flg) {
+    include_control_qubits_in_matrix_ = flg;
+  }
+
+protected:
+  void gate_to_matrix(Operations::Op &op, uint_t num_params) const;
+  // NOTE(review): set by include_control_qubits(), never read in this header
+  bool include_control_qubits_in_matrix_ = false;
+
+  // Gate name -> ParamGates member; names not listed are left unconverted
+  const static stringmap_t<ParamGates> gateset_;
+};
+
+const stringmap_t<ParamGates> Parameter2Matrix::gateset_( // name -> family
+    {{"p", ParamGates::mcp},       {"r", ParamGates::mcr},
+     {"rx", ParamGates::mcrx},     {"ry", ParamGates::mcry},
+     {"rz", ParamGates::mcrz},     {"u1", ParamGates::mcp},
+     {"u2", ParamGates::mcu2},     {"u3", ParamGates::mcu3},
+     {"u", ParamGates::mcu3},      {"U", ParamGates::mcu3},
+     {"cp", ParamGates::mcp},      {"cu1", ParamGates::mcp},
+     {"cu2", ParamGates::mcu2},    {"cu3", ParamGates::mcu3},
+     {"cu", ParamGates::mcu},      {"cp", ParamGates::mcp}, // NOTE(review): cp is listed twice
+     {"rxx", ParamGates::rxx},     {"ryy", ParamGates::ryy},
+     {"rzz", ParamGates::rzz},     {"rzx", ParamGates::rzx},
+     {"mcr", ParamGates::mcr},     {"mcrx", ParamGates::mcrx},
+     {"mcry", ParamGates::mcry},   {"mcrz", ParamGates::mcrz},
+     {"mcphase", ParamGates::mcp}, {"mcp", ParamGates::mcp},
+     {"mcu1", ParamGates::mcp},    {"mcu2", ParamGates::mcu2},
+     {"mcu3", ParamGates::mcu3},   {"mcu", ParamGates::mcu}});
+
+void Parameter2Matrix::set_config(const Config &config) {
+  CircuitOptimization::set_config(config); // base-class handling only
+}
+
+void Parameter2Matrix::optimize_circuit(Circuit &circ, Noise::NoiseModel &noise,
+                                        const opset_t &allowed_opset,
+                                        ExperimentResult &result) const {
+  if (circ.num_bind_params <= 1) // circuit is left as is for 0 or 1 sets
+    return;
+  // pack ops flagged has_bind_params (noise/allowed_opset/result are unused)
+  for (int_t i = 0; i < circ.ops.size(); i++) {
+    if (circ.ops[i].has_bind_params) {
+      if (circ.ops[i].type == Operations::OpType::gate) {
+        gate_to_matrix(circ.ops[i], circ.num_bind_params);
+      } else if (circ.ops[i].type == Operations::OpType::matrix) {
+        // flatten mats[j] of every parameter set j into params, back to back
+        uint_t matrix_size = circ.ops[i].mats[0].size(); // needs mats[0]
+        circ.ops[i].params.resize(matrix_size * circ.num_bind_params);
+        for (int_t j = 0; j < circ.num_bind_params; j++) {
+          for (int_t k = 0; k < matrix_size; k++)
+            circ.ops[i].params[j * matrix_size + k] = circ.ops[i].mats[j][k];
+        }
+        circ.ops[i].mats.clear(); // the data now lives in params only
+      }
+    }
+  }
+
+  circ.set_params(); // presumably recomputes circuit metadata - TODO confirm
+}
+
+void Parameter2Matrix::gate_to_matrix(Operations::Op &op,
+                                      uint_t num_params) const {
+  auto it = gateset_.find(op.name);
+  if (it == gateset_.end()) // unknown gate name: op is left untouched
+    return;
+
+  uint_t matrix_size; // entries copied per parameter set by store_matrix
+  if (it->second == ParamGates::mcrz || it->second == ParamGates::rzz ||
+      it->second == ParamGates::mcp) {
+    matrix_size = 2ull; // NOTE(review): rzz is two-qubit; is 2 enough?
+    op.type = Operations::OpType::diagonal_matrix;
+  } else {
+    matrix_size = 4ull; // NOTE(review): also used for rxx/ryy/rzx - confirm
+    op.type = Operations::OpType::matrix;
+  }
+  cvector_t matrix_array(num_params * matrix_size);
+  // store_matrix(p, m): copy m[0..matrix_size) into the slot of parameter p
+  auto store_matrix = [&matrix_array, matrix_size](int_t iparam,
+                                                   cvector_t mat) {
+    for (int_t j = 0; j < matrix_size; j++)
+      matrix_array[iparam * matrix_size + j] = mat[j];
+  };
+  // each case reads the angles of parameter set i from op.params
+  switch (it->second) {
+  case ParamGates::mcr: // NOTE(review): no std::real() here, unlike below
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i,
+                   Linalg::VMatrix::r(op.params[i * 2], op.params[i * 2 + 1]));
+    break;
+  case ParamGates::mcrx:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::rx(std::real(op.params[i])));
+    break;
+  case ParamGates::mcry:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::ry(std::real(op.params[i])));
+    break;
+  case ParamGates::mcrz:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::rz_diag(std::real(op.params[i])));
+    break;
+  case ParamGates::rxx:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::rxx(std::real(op.params[i])));
+    break;
+  case ParamGates::ryy:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::ryy(std::real(op.params[i])));
+    break;
+  case ParamGates::rzz:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::rzz_diag(std::real(op.params[i])));
+    break;
+  case ParamGates::rzx:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::rzx(std::real(op.params[i])));
+    break;
+  case ParamGates::mcu3: // three angles per parameter set
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::u3(std::real(op.params[i * 3]),
+                                          std::real(op.params[i * 3 + 1]),
+                                          std::real(op.params[i * 3 + 2])));
+    break;
+  case ParamGates::mcu: // four angles per parameter set
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::u4(std::real(op.params[i * 4]),
+                                          std::real(op.params[i * 4 + 1]),
+                                          std::real(op.params[i * 4 + 2]),
+                                          std::real(op.params[i * 4 + 3])));
+    break;
+  case ParamGates::mcu2: // two angles per parameter set
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::u2(std::real(op.params[i * 2]),
+                                          std::real(op.params[i * 2 + 1])));
+    break;
+  case ParamGates::mcp:
+    for (int_t i = 0; i < num_params; i++)
+      store_matrix(i, Linalg::VMatrix::phase_diag(std::real(op.params[i])));
+    break;
+  default:
+    break;
+  }
+
+  op.params = matrix_array; // angles are replaced by the packed matrices
+}
+
+//-------------------------------------------------------------------------
+} // end namespace Transpile
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/test/terra/backends/test_runtime_parameterization.py b/test/terra/backends/test_runtime_parameterization.py
new file mode 100644
index 0000000000..353d7178da
--- /dev/null
+++ b/test/terra/backends/test_runtime_parameterization.py
@@ -0,0 +1,838 @@
+# This code is part of Qiskit.
+#
+# (C) Copyright IBM 2018, 2019.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+"""
+Integration tests for runtime parameter binding (``runtime_parameter_bind_enable``)
+in AerSimulator, covering parameterized qobj execution and the backend.run() path.
+"""
+
+import unittest
+from math import pi
+from ddt import ddt
+import numpy as np
+
+from test.terra import common
+
+from qiskit.compiler import assemble, transpile
+from qiskit.circuit import QuantumCircuit, Parameter
+from test.terra.reference.ref_save_expval import (
+    save_expval_circuits,
+    save_expval_counts,
+    save_expval_labels,
+    save_expval_pre_meas_values,
+    save_expval_circuit_parameterized,
+    save_expval_final_statevecs,
+)
+from qiskit_aer.library import SaveStatevector
+from qiskit_aer import AerSimulator, AerError
+
+from qiskit_aer.noise import NoiseModel
+from qiskit_aer.noise.errors.standard_errors import pauli_error, amplitude_damping_error
+
+
+from test.terra.backends.simulator_test_case import SimulatorTestCase, supported_methods
+
+SUPPORTED_METHODS = [  # passed to @supported_methods on the backend.run() tests below
+    "statevector",
+]
+
+
+@ddt
+class TestRuntimeParameterization(SimulatorTestCase):
+    """Runtime Parameterization tests"""
+
+    BACKEND_OPTS = {  # run options unpacked into backend.run() by the two qobj-based tests
+        "seed_simulator": 2113,
+        "shot_branching_enable": False,
+        "runtime_parameter_bind_enable": True,
+    }
+
+    @staticmethod
+    def runtime_parameterization(
+        backend,
+        shots=1000,
+        measure=True,
+        snapshot=False,
+        save_state=False,
+    ):
+        """Assemble a qobj whose first and last circuits carry parameterizations."""
+        single_shot = shots == 1
+        head_circ, head_params = save_expval_circuit_parameterized(
+            pershot=single_shot,
+            measure=measure,
+            snapshot=snapshot,
+        )
+        fixed_circs = save_expval_circuits(
+            pauli=True,
+            skip_measure=(not measure),
+            pershot=single_shot,
+        )
+        tail_circ, tail_params = save_expval_circuit_parameterized(
+            pershot=single_shot,
+            measure=measure,
+            snapshot=snapshot,
+        )
+        all_circs = [head_circ] + fixed_circs + [tail_circ]
+        if save_state:
+            for circ in all_circs:
+                circ.save_statevector(pershot=single_shot)
+        bindings = [head_params, [], [], [], tail_params]
+        # The empty lists are the parameterizations given for the circuits in between.
+        return assemble(all_circs, backend=backend, shots=shots, parameterizations=bindings)
+
+    def test_runtime_parameterization_qasm_save_expval(self):
+        """Run the measured parameterized qobj and check counts and expectation values."""
+        shots = 1000
+        expval_labels = save_expval_labels() * 3
+        expected_counts = save_expval_counts(shots) * 3
+        expected_values = save_expval_pre_meas_values() * 3
+
+        simulator = AerSimulator()
+        qobj = self.runtime_parameterization(
+            backend=simulator, shots=shots, measure=True, snapshot=True
+        )
+        self.assertIn("parameterizations", qobj.to_dict()["config"])
+        with self.assertWarns(DeprecationWarning):
+            result = simulator.run(qobj, **self.BACKEND_OPTS).result()
+            succeeded = getattr(result, "success", False)
+            circuit_indices = range(len(result.to_dict()["results"]))
+            self.assertTrue(succeeded)
+            self.compare_counts(result, circuit_indices, expected_counts, delta=0.1 * shots)
+            # Every saved expectation value must match its reference.
+            for index, reference in enumerate(expected_values):
+                saved = result.data(index)
+                # Compare label by label against the reference values.
+                for label in expval_labels:
+                    self.assertAlmostEqual(saved[label], reference[label], delta=1e-7)
+
+    def test_runtime_parameterization_statevector(self):
+        """Run the parameterized qobj without measurement and compare saved statevectors."""
+        expected_states = save_expval_final_statevecs() * 3
+
+        simulator = AerSimulator(method="statevector")
+        qobj = self.runtime_parameterization(
+            backend=simulator,
+            measure=False,
+            snapshot=False,
+            save_state=True,
+        )
+        self.assertIn("parameterizations", qobj.to_dict()["config"])
+        with self.assertWarns(DeprecationWarning):
+            result = simulator.run(qobj, **self.BACKEND_OPTS).result()
+            succeeded = getattr(result, "success", False)
+            circuit_count = len(result.to_dict()["results"])
+            self.assertTrue(succeeded)
+
+            # Each circuit's final state must agree with its reference to 7 decimals.
+            for index in range(circuit_count):
+                simulated = result.get_statevector(index)
+                np.testing.assert_array_almost_equal(
+                    simulated, expected_states[index].data, decimal=7
+                )
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path(self, method, device):
+        """Bind one rx angle at runtime through backend.run() and check the counts."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        qc = QuantumCircuit(2)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+        job = backend.run(
+            qc,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        )
+        expected = [{"00": shots}, {"11": shots}, {"00": shots}]
+        self.assertEqual(job.result().get_counts(), expected)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_already_bound_parameter_expression(self, method, device):
+        """Runtime binding must tolerate an rx whose expression was bound beforehand."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        scratch = Parameter("x")
+        theta = Parameter("theta")
+        # The x - x expression is fully bound before it is placed on the circuit.
+        prebound = (scratch - scratch).bind({scratch: 1})
+        qc = QuantumCircuit(2)
+        qc.rx(theta, 0)
+        qc.rx(prebound, 0)
+        qc.cx(0, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+        job = backend.run(
+            qc,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        )
+        expected = [{"00": shots}, {"11": shots}, {"00": shots}]
+        self.assertEqual(job.result().get_counts(), expected)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_already_transpiled_parameter_expression(self, method, device):
+        """Runtime binding of a circuit that was transpiled to u3 before being run."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        qc = QuantumCircuit(1)
+        qc.rx(theta, 0)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+        transpiled = transpile(qc, basis_gates=["u3"])
+        job = backend.run(
+            transpiled,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        )
+        expected = [{"0": shots}, {"1": shots}, {"0": shots}]
+        self.assertEqual(job.result().get_counts(), expected)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_with_expressions(self, method, device):
+        """Runtime binding where one gate angle is the expression theta * theta."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        qc = QuantumCircuit(2)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.rz(theta_sq, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+        job = backend.run(
+            qc,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        )
+        expected = [{"00": shots}, {"11": shots}, {"00": shots}]
+        self.assertEqual(job.result().get_counts(), expected)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_with_expressions_multiple_params_per_instruction(self, method, device):
+        """Runtime binding of a u gate whose three angles all depend on theta."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        qc = QuantumCircuit(2)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.rz(theta_sq, 1)
+        qc.u(theta, theta_sq, theta, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+        job = backend.run(
+            qc,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        )
+        expected = [{"00": shots}, {"01": shots}, {"00": shots}]
+        self.assertEqual(job.result().get_counts(), expected)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_with_more_params_than_expressions(self, method, device):
+        """Runtime binding with two free parameters (theta, phi) and one derived expression."""
+        shots = 2000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        phi = Parameter("phi")
+        qc = QuantumCircuit(2)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.rz(theta_sq, 1)
+        qc.ry(phi, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi], phi: [0, 1, pi]}]
+        job = backend.run(
+            qc,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        )
+        expected = [{"00": shots}, {"01": 0.25 * shots, "11": 0.75 * shots}, {"10": shots}]
+        counts = job.result().get_counts()
+        # Counts are compared with a tolerance of 5% of the shots.
+        for position, target in enumerate(expected):
+            self.assertDictAlmostEqual(counts[position], target, delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_multiple_circuits(self, method, device):
+        """Runtime binding of the same circuit submitted three times in one job."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        qc = QuantumCircuit(2)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}] * 3
+        job = backend.run(
+            [qc] * 3,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        )
+        expected = [{"00": shots}, {"11": shots}, {"00": shots}] * 3
+        self.assertEqual(job.result().get_counts(), expected)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_multiple_different_circuits(self, method, device):
+        """Runtime binding for three distinct circuits, each with its own parameters."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+
+        theta1 = Parameter("theta1")
+        first = QuantumCircuit(2)
+        first.rx(theta1, 0)
+        first.cx(0, 1)
+        first.measure_all()
+
+        theta2 = Parameter("theta2")
+        second = QuantumCircuit(2)
+        second.rx(theta2, 0)
+        second.cx(0, 1)
+        second.measure_all()
+
+        theta3_1 = Parameter("theta3_1")
+        theta3_2 = Parameter("theta3_2")
+        third = QuantumCircuit(2)
+        third.rx(theta3_1, 0)
+        third.rx(theta3_2, 0)
+        third.cx(0, 1)
+        third.measure_all()
+
+        binds = [
+            {theta1: [0, pi, 2 * pi]},
+            {theta2: [0, pi, 2 * pi]},
+            {theta3_1: [0, pi / 2, pi], theta3_2: [0, pi / 2, pi]},
+        ]
+        job = backend.run(
+            [first, second, third],
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        )
+        expected = [{"00": shots}, {"11": shots}, {"00": shots}] * 3
+        self.assertEqual(job.result().get_counts(), expected)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_with_expressions_multiple_circuits(self, method, device):
+        """Runtime binding of a theta * theta expression across three circuit copies."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        qc = QuantumCircuit(2)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.rz(theta_sq, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}] * 3
+        job = backend.run(
+            [qc] * 3,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        )
+        expected = [{"00": shots}, {"11": shots}, {"00": shots}] * 3
+        self.assertEqual(job.result().get_counts(), expected)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_with_expressions_multiple_params_per_instruction_multiple_circuits(
+        self, method, device
+    ):
+        """Multi-parameter u gate bound at runtime across three copies of the circuit."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        circuit = QuantumCircuit(2)
+        theta = Parameter("theta")
+        theta_squared = theta * theta
+        circuit.rx(theta, 0)
+        circuit.cx(0, 1)
+        circuit.rz(theta_squared, 1)
+        circuit.u(theta, theta_squared, theta, 1)
+        circuit.measure_all()
+        res = backend.run(
+            [circuit] * 3,
+            shots=shots,
+            parameter_binds=[{theta: [0, pi, 2 * pi]}] * 3,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        ).result()
+        self.assertEqual(res.get_counts(), [{"00": shots}, {"01": shots}, {"00": shots}] * 3)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_with_more_params_than_expressions_multiple_circuits(self, method, device):
+        """Two free parameters plus a derived expression, run as three circuit copies."""
+        shots = 2000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        phi = Parameter("phi")
+        qc = QuantumCircuit(2)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.rz(theta_sq, 1)
+        qc.ry(phi, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi], phi: [0, 1, pi]}] * 3
+        job = backend.run(
+            [qc] * 3,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        )
+        expected = [{"00": shots}, {"01": 0.25 * shots, "11": 0.75 * shots}, {"10": shots}] * 3
+        counts = job.result().get_counts()
+        # Counts are compared with a tolerance of 5% of the shots.
+        for position, target in enumerate(expected):
+            self.assertDictAlmostEqual(counts[position], target, delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_multiple_circuits_mismatch_length(self, method, device):
+        """Three circuits paired with a single-entry parameter_binds must raise AerError."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        qc = QuantumCircuit(2)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.measure_all()
+        lone_bind = [{theta: [0, pi, 2 * pi]}]
+        with self.assertRaises(AerError):
+            backend.run(
+                [qc] * 3,
+                shots=shots,
+                parameter_binds=[lone_bind],
+                shot_branching_enable=False,
+                runtime_parameter_bind_enable=True,
+            ).result()
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_path_with_truncation(self, method, device):
+        """Runtime binding keeps qubits 0-2 active and matches pre-bound statevectors."""
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        circuit = QuantumCircuit(5, 2)
+        for q in range(5):
+            circuit.ry(theta, q)
+        circuit.cx(0, 1)
+        circuit.cx(1, 2)
+        for q in range(5):
+            circuit.ry(theta, q)
+        circuit.cx(0, 1)
+        circuit.cx(1, 2)
+        circuit.append(SaveStatevector(3, label="sv", pershot=False, conditional=False), range(3))
+
+        param_map = {theta: [0.1 * i for i in range(3)]}
+        param_sets = [{theta: 0.1 * i} for i in range(3)]
+        # assign_parameters returns a bound copy; bind_parameters is deprecated in Qiskit.
+        resolved_circuits = [circuit.assign_parameters(param_set) for param_set in param_sets]
+
+        result = backend.run(
+            circuit,
+            parameter_binds=[param_map],
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        ).result()
+        self.assertSuccess(result)
+
+        result_without_parameters = backend.run(resolved_circuits).result()
+        self.assertSuccess(result_without_parameters)
+
+        for actual_result in result.results:
+            metadata = actual_result.metadata
+            self.assertEqual(metadata["active_input_qubits"], list(range(3)))
+        for i in range(3):
+            self.assertEqual(result.data(i)["sv"], result_without_parameters.data(i)["sv"])
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_different_seed(self, method, device):
+        """Seeds differ per parameter set and recur when rerun with the first seed."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        qc = QuantumCircuit(2)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+        first_run = backend.run(
+            qc,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        ).result()
+        seeds = [entry.seed_simulator for entry in first_run.results]
+        self.assertEqual(len(seeds), len(np.unique(seeds)))
+
+        rerun = backend.run(
+            qc,
+            shots=shots,
+            parameter_binds=binds,
+            seed_simulator=seeds[0],
+        ).result()
+        self.assertEqual(seeds, [entry.seed_simulator for entry in rerun.results])
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_run_empty(self, method, device):
+        """Running a parameterized circuit with an empty bind dict must raise AerError."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        circuit = QuantumCircuit(2)
+        theta = Parameter("theta")
+        circuit.rx(theta, 0)
+        circuit.cx(0, 1)
+        circuit.measure_all()
+        parameter_binds = [{}]
+        with self.assertRaises(AerError):
+            backend.run(
+                circuit,
+                shots=shots,
+                parameter_binds=parameter_binds,
+                shot_branching_enable=False,
+                runtime_parameter_bind_enable=True,
+            ).result()
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_parameters_with_barrier(self, method, device):
+        """Runtime binding on a circuit whose parameterized layers are split by barriers."""
+        shots = 1024
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        phi = Parameter("phi")
+        qc = QuantumCircuit(3)
+        for qubit in range(3):
+            qc.rx(theta, qubit)
+        qc.barrier()
+        for qubit in range(3):
+            qc.rx(phi, qubit)
+        qc.barrier()
+        qc.measure_all()
+
+        # Both angles are bound to pi / 2, so every qubit receives two rx(pi / 2) gates.
+        binds = [{theta: [pi / 2], phi: [pi / 2]}]
+        result = backend.run(
+            [qc],
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        ).result()
+
+        self.assertSuccess(result)
+        self.assertEqual(result.get_counts(), {"111": shots})
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_dynamic_circuit(self, method, device):
+        """Dynamic circuit: counts with runtime binding on must equal counts with it off."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        qc = QuantumCircuit(2)
+        qc.h(0)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.reset(0)
+        qc.rz(theta_sq, 1)
+        qc.u(theta, theta_sq, theta, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+
+        runtime_result = backend.run(
+            qc,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        ).result()
+        self.assertSuccess(runtime_result)
+        runtime_counts = runtime_result.get_counts()
+
+        prebound_result = backend.run(
+            qc,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=False,
+        ).result()
+        self.assertSuccess(prebound_result)
+        prebound_counts = prebound_result.get_counts()
+
+        self.assertEqual(runtime_counts, prebound_counts)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_dynamic_circuit_with_shot_branching(self, method, device):
+        """Dynamic circuit: runtime binding plus shot branching must match both options off."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        qc = QuantumCircuit(2)
+        qc.h(0)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.reset(0)
+        qc.rz(theta_sq, 1)
+        qc.u(theta, theta_sq, theta, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+
+        branched_result = backend.run(
+            qc,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=True,
+            runtime_parameter_bind_enable=True,
+        ).result()
+        self.assertSuccess(branched_result)
+        branched_counts = branched_result.get_counts()
+
+        baseline_result = backend.run(
+            qc,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=False,
+        ).result()
+        self.assertSuccess(baseline_result)
+        baseline_counts = baseline_result.get_counts()
+
+        self.assertEqual(branched_counts, baseline_counts)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_fusion(self, method, device):
+        """Runtime binding with fusion enabled at fusion_threshold=1 over three circuit copies."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        qc = QuantumCircuit(2)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.rz(theta_sq, 1)
+        qc.u(theta, theta_sq, theta, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}] * 3
+        job = backend.run(
+            [qc] * 3,
+            shots=shots,
+            parameter_binds=binds,
+            fusion_enable=True,
+            fusion_threshold=1,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        )
+        expected = [{"00": shots}, {"01": shots}, {"00": shots}] * 3
+        self.assertEqual(job.result().get_counts(), expected)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_pauli_noise(self, method, device):
+        """Pauli noise: counts with runtime binding on must equal counts with it off."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        qc = QuantumCircuit(2)
+        qc.h(0)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.rz(theta_sq, 1)
+        qc.u(theta, theta_sq, theta, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+
+        pauli_channel = pauli_error([("X", 0.2), ("Y", 0.2), ("Z", 0.2), ("I", 0.4)])
+        noise = NoiseModel()
+        noise.add_all_qubit_quantum_error(pauli_channel, ["h", "rx", "rz", "u"])
+
+        runtime_result = backend.run(
+            qc,
+            noise_model=noise,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        ).result()
+        self.assertSuccess(runtime_result)
+        runtime_counts = runtime_result.get_counts()
+
+        prebound_result = backend.run(
+            qc,
+            noise_model=noise,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=False,
+        ).result()
+        self.assertSuccess(prebound_result)
+        prebound_counts = prebound_result.get_counts()
+
+        self.assertEqual(runtime_counts, prebound_counts)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_kraus_noise(self, method, device):
+        """Kraus noise: counts with runtime binding on must equal counts with it off."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        qc = QuantumCircuit(2)
+        qc.h(0)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.rz(theta_sq, 1)
+        qc.u(theta, theta_sq, theta, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+
+        damping_channel = amplitude_damping_error(0.75, 0.25)
+        noise = NoiseModel()
+        noise.add_all_qubit_quantum_error(damping_channel, ["h", "rx", "rz", "u"])
+
+        runtime_result = backend.run(
+            qc,
+            noise_model=noise,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=True,
+        ).result()
+        self.assertSuccess(runtime_result)
+        runtime_counts = runtime_result.get_counts()
+
+        prebound_result = backend.run(
+            qc,
+            noise_model=noise,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=False,
+        ).result()
+        self.assertSuccess(prebound_result)
+        prebound_counts = prebound_result.get_counts()
+
+        self.assertEqual(runtime_counts, prebound_counts)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_pauli_noise_with_shot_branching(self, method, device):
+        """Pauli noise: runtime binding plus shot branching must match both options off."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        qc = QuantumCircuit(2)
+        qc.h(0)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.rz(theta_sq, 1)
+        qc.u(theta, theta_sq, theta, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+
+        pauli_channel = pauli_error([("X", 0.2), ("Y", 0.2), ("Z", 0.2), ("I", 0.4)])
+        noise = NoiseModel()
+        noise.add_all_qubit_quantum_error(pauli_channel, ["h", "rx", "rz", "u"])
+
+        branched_result = backend.run(
+            qc,
+            noise_model=noise,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=True,
+            runtime_parameter_bind_enable=True,
+        ).result()
+        self.assertSuccess(branched_result)
+        branched_counts = branched_result.get_counts()
+
+        baseline_result = backend.run(
+            qc,
+            noise_model=noise,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=False,
+        ).result()
+        self.assertSuccess(baseline_result)
+        baseline_counts = baseline_result.get_counts()
+
+        self.assertEqual(branched_counts, baseline_counts)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_kraus_noise_with_shot_branching(self, method, device):
+        """Kraus noise: runtime binding plus shot branching must match both options off."""
+        shots = 1000
+        backend = self.backend(method=method, device=device)
+        theta = Parameter("theta")
+        theta_sq = theta * theta
+        qc = QuantumCircuit(2)
+        qc.h(0)
+        qc.rx(theta, 0)
+        qc.cx(0, 1)
+        qc.rz(theta_sq, 1)
+        qc.u(theta, theta_sq, theta, 1)
+        qc.measure_all()
+        binds = [{theta: [0, pi, 2 * pi]}]
+
+        damping_channel = amplitude_damping_error(0.75, 0.25)
+        noise = NoiseModel()
+        noise.add_all_qubit_quantum_error(damping_channel, ["h", "rx", "rz", "u"])
+
+        branched_result = backend.run(
+            qc,
+            noise_model=noise,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=True,
+            runtime_parameter_bind_enable=True,
+        ).result()
+        self.assertSuccess(branched_result)
+        branched_counts = branched_result.get_counts()
+
+        baseline_result = backend.run(
+            qc,
+            noise_model=noise,
+            shots=shots,
+            parameter_binds=binds,
+            shot_branching_enable=False,
+            runtime_parameter_bind_enable=False,
+        ).result()
+        self.assertSuccess(baseline_result)
+        baseline_counts = baseline_result.get_counts()
+
+        self.assertEqual(branched_counts, baseline_counts)
+
+
+if __name__ == "__main__":  # run the suite when this file is executed directly
+    unittest.main()