Implementing runtime parameter binding (Qiskit#1901)
Optimizes GPU simulation of a single circuit with multiple parameter sets by binding parameters
to each gate at runtime and simulating the single circuit with multiple shots,
instead of constructing one circuit per parameter set.
This feature is enabled by the new option ``runtime_parameter_bind_enable=True`` (default: ``False``).
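
A minimal usage sketch of the new option (the circuit, parameter values, and shot count below are illustrative assumptions, not taken from this commit):

    import numpy as np
    from qiskit import QuantumCircuit, transpile
    from qiskit.circuit import Parameter
    from qiskit_aer import AerSimulator

    theta = Parameter("theta")
    circuit = QuantumCircuit(2, 2)
    circuit.rx(theta, 0)
    circuit.cx(0, 1)
    circuit.measure([0, 1], [0, 1])

    backend = AerSimulator(
        method="statevector",
        runtime_parameter_bind_enable=True,     # bind parameters at runtime
        # device="GPU", batched_shots_gpu=True, # uncomment to batch parameter sets on GPU
    )
    transpiled = transpile(circuit, backend)
    job = backend.run(
        transpiled,
        shots=1024,
        parameter_binds=[{theta: list(np.linspace(0.0, np.pi, 10))}],
    )
    print(job.result().get_counts())  # one counts dict per parameter value

Each parameter value produces its own set of counts in the returned result, since the controller allocates one experiment result per parameter binding.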

* Implementing runtime parameter binding
* remove old files
* fix seg fault caused by global phase for parameters
* delete duplicate max_matrix_qubits
* Correct metadata for runtime param bind configs and move time_taken to metadata so that we can read time info from primitives
* performance improvement of sampling measure for runtime parameter binding
* fix error for MPI
* Improve batched sampling measure
* format
* fix OpenMP nested parallel
* reflecting review comments
* fix lint
* fix lint
doichanj authored Oct 6, 2023
1 parent 73f0847 commit e1332f8
Showing 43 changed files with 4,389 additions and 823 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -512,6 +512,7 @@ endif()

if(AER_DEBUG)
set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_DEBUG)
set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -g")
endif()

if(TEST_JSON)
1 change: 1 addition & 0 deletions qiskit_aer/backends/aer_compiler.py
@@ -491,6 +491,7 @@ def compile_circuit(circuits, basis_gates=None, optypes=None):
"parameterizations": (list),
"fusion_parallelization_threshold": (int, np.integer),
"target_gpus": (list),
"runtime_parameter_bind_enable": (bool, np.bool_),
}


8 changes: 8 additions & 0 deletions qiskit_aer/backends/aer_simulator.py
@@ -318,6 +318,12 @@ class AerSimulator(AerBackend):
* ``accept_distributed_results`` (bool): This option enables storing
results independently in each process (Default: None).
* ``runtime_parameter_bind_enable`` (bool): If this option is True,
parameters are bound at runtime using multi-shot simulation, without
constructing a circuit for each parameter set. For GPU simulation, this
option can be combined with ``batched_shots_gpu`` to run multiple
parameter sets in a batch (Default: False).
These backend options only apply when using the ``"statevector"``
simulation method:
@@ -765,6 +771,8 @@ def _default_options(cls):
# tensor network options
tensor_network_num_sampling_qubits=10,
use_cuTensorNet_autotuning=False,
# parameter binding
runtime_parameter_bind_enable=False,
)

def __repr__(self):
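The GPU combination described in the docstring above can also be configured after construction with ``set_options`` (a sketch; it assumes a CUDA-enabled Aer build with at least one GPU):

    from qiskit_aer import AerSimulator

    # Equivalent to passing the options as constructor keyword arguments.
    backend = AerSimulator(method="statevector", device="GPU")
    backend.set_options(
        runtime_parameter_bind_enable=True,  # bind parameters at runtime
        batched_shots_gpu=True,              # batch parameter sets / shots on the GPU
    )
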
14 changes: 12 additions & 2 deletions qiskit_aer/backends/wrappers/aer_controller_binding.hpp
@@ -412,6 +412,14 @@ void bind_aer_controller(MODULE m) {
"target_gpus",
[](const Config &config) { return config.target_gpus.val; },
[](Config &config, reg_t val) { config.target_gpus.value(val); });
aer_config.def_property(
"runtime_parameter_bind_enable",
[](const Config &config) {
return config.runtime_parameter_bind_enable.val;
},
[](Config &config, bool val) {
config.runtime_parameter_bind_enable.value(val);
});

aer_config.def(py::pickle(
[](const AER::Config &config) {
@@ -500,11 +508,12 @@ void bind_aer_controller(MODULE m) {
79, config.extended_stabilizer_norm_estimation_default_samples),
write_value(80, config.shot_branching_enable),
write_value(81, config.shot_branching_sampling_enable),
write_value(82, config.target_gpus));
write_value(82, config.target_gpus),
write_value(83, config.runtime_parameter_bind_enable));
},
[](py::tuple t) {
AER::Config config;
if (t.size() != 82)
if (t.size() != 84)
throw std::runtime_error("Invalid serialization format.");

read_value(t, 0, config.shots);
@@ -594,6 +603,7 @@ void bind_aer_controller(MODULE m) {
read_value(t, 80, config.shot_branching_enable);
read_value(t, 81, config.shot_branching_sampling_enable);
read_value(t, 82, config.target_gpus);
read_value(t, 83, config.runtime_parameter_bind_enable);
return config;
}));
}
30 changes: 30 additions & 0 deletions releasenotes/notes/add_executor-ba4870f86ed5d8ec.yaml
@@ -0,0 +1,30 @@
---
features:
  - |
    This release restructures the ``State`` classes.
    Circuit executor classes are added that run a circuit and manage multiple
    states for multi-shot simulations or multi-chunk simulations of large
    numbers of qubits.
    Previously the ``StateChunk`` class managed multiple chunks for multi-shot
    or multi-chunk simulations; now a ``State`` class holds only a single
    state, and all parallelization code has been moved to ``Executor`` classes.
    All ``State`` classes are now independent of parallelization.
    Some functions of the ``Aer::Controller`` class have also been moved to
    the ``CircuitExecutor::Executor`` class.
  - |
    A shot-branching technique that accelerates simulation of dynamic circuits
    is implemented on top of the restructured ``Executor`` classes.
    Shot-branching is currently applicable to the ``statevector``,
    ``density_matrix`` and ``tensor_network`` methods.
    Shot-branching distributes multiple shots dynamically by branching states
    when dynamic operations (measure, reset, initialize, noise) are applied.
    ``shot_branching_enable`` is disabled by default.
    When ``shot_branching_sampling_enable`` is set, final measurements are
    done by sampling measure, which speeds up obtaining counts for multiple
    shots that share the same state.
  - |
    A new option for GPU simulation, ``target_gpus``, is added.
    This option takes a list of GPUs to be used for the simulation.
    Without this option, all available GPUs are used.
    For example, if there are 4 GPUs, ``target_gpus=[0, 2]`` will use 2 of them.
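
The shot-branching and ``target_gpus`` options from this release note can be enabled on a backend as follows (a sketch; the GPU indices are illustrative and assume a multi-GPU, CUDA-enabled build):

    from qiskit_aer import AerSimulator

    backend = AerSimulator(method="statevector", device="GPU")
    backend.set_options(
        shot_branching_enable=True,           # branch states on dynamic operations
        shot_branching_sampling_enable=True,  # sample final measurements
        target_gpus=[0, 2],                   # restrict the simulation to GPUs 0 and 2
    )
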
@@ -0,0 +1,8 @@
---
features:
  - |
    A runtime parameter binding option is implemented that binds parameters to
    a single circuit at runtime, instead of running multiple bound circuits as
    input.
    The option ``runtime_parameter_bind_enable=True`` enables this feature,
    and for GPU simulation ``batched_shots_gpu=True`` should also be set to
    speed up simulating parameterized circuits.
80 changes: 52 additions & 28 deletions src/controllers/aer_controller.hpp
@@ -194,6 +194,9 @@ class Controller {
int myrank_ = 0;
int num_processes_ = 1;
int num_process_per_experiment_ = 1;

// runtime parameter binding
bool runtime_parameter_bind_ = false;
};

//=========================================================================
@@ -329,6 +332,10 @@ void Controller::set_config(const Config &config) {
throw std::runtime_error(std::string("Invalid simulation precision (") +
precision + std::string(")."));
}

// check if runtime binding is enabled
if (config.runtime_parameter_bind_enable.has_value())
runtime_parameter_bind_ = config.runtime_parameter_bind_enable.value();
}

void Controller::clear_config() {
@@ -502,7 +509,14 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,
auto methods = simulation_methods(config, circuits, noise_model);

// Initialize Result object for the given number of experiments
Result result(circuits.size());
uint_t result_size;
reg_t result_offset(circuits.size());
result_size = 0;
for (int_t i = 0; i < circuits.size(); i++) {
result_offset[i] = result_size;
result_size += circuits[i]->num_bind_params;
}
Result result(result_size);
// Initialize circuit executors for each circuit
std::vector<std::shared_ptr<CircuitExecutor::Base>> executors(
circuits.size());
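
The bookkeeping above allocates one ``ExperimentResult`` per parameter binding and records where each circuit's block of results begins. An illustrative Python model of the same computation (the binding counts are made up):

    # One result slot per parameter binding, laid out contiguously per circuit.
    num_bind_params = [1, 4, 2]            # hypothetical circuits' binding counts
    result_offset, result_size = [], 0
    for n in num_bind_params:
        result_offset.append(result_size)  # first result slot for this circuit
        result_size += n
    assert result_offset == [0, 1, 5] and result_size == 7
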
@@ -514,12 +528,15 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,

// set parallelization for experiments
try {
uint_t res_pos = 0;
for (int i = 0; i < circuits.size(); i++) {
executors[i] = make_circuit_executor(methods[i]);
required_memory_mb_list[i] =
executors[i]->required_memory_mb(config, *circuits[i], noise_model);
result.results[i].metadata.add(required_memory_mb_list[i],
"required_memory_mb");
for (int j = 0; j < circuits[i]->num_bind_params; j++) {
result.results[res_pos++].metadata.add(required_memory_mb_list[i],
"required_memory_mb");
}
}
set_parallelization_experiments(required_memory_mb_list);
} catch (std::exception &e) {
@@ -565,41 +582,48 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,
// average random seed to set the same seed to each process (when
// seed_simulator is not set)
if (num_processes_ > 1) {
reg_t seeds(circuits.size());
reg_t avg_seeds(circuits.size());
for (int_t i = 0; i < circuits.size(); i++)
seeds[i] = circuits[i]->seed;
MPI_Allreduce(seeds.data(), avg_seeds.data(), circuits.size(),
MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
for (int_t i = 0; i < circuits.size(); i++)
circuits[i]->seed = avg_seeds[i] / num_processes_;
}
#endif

const int NUM_RESULTS = result.results.size();
// following looks very similar but we have to separate them to avoid omp
// nested loops that causes performance degradation (DO NOT use if statement
// in #pragma omp)
if (parallel_experiments_ == 1) {
for (int i = 0; i < NUM_RESULTS; i++) {
executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i],
sim_device_, result.results[i]);
reg_t seeds(result_size);
reg_t avg_seeds(result_size);
int_t iseed = 0;
for (int_t i = 0; i < circuits.size(); i++) {
if (circuits[i]->num_bind_params > 1) {
for (int_t j = 0; j < circuits[i]->num_bind_params; j++)
seeds[iseed++] = circuits[i]->seed_for_params[j];
} else
seeds[iseed++] = circuits[i]->seed;
}
} else {
#pragma omp parallel for num_threads(parallel_experiments_)
for (int i = 0; i < NUM_RESULTS; i++) {
executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i],
sim_device_, result.results[i]);
MPI_Allreduce(seeds.data(), avg_seeds.data(), result_size, MPI_UINT64_T,
MPI_SUM, MPI_COMM_WORLD);
iseed = 0;
for (int_t i = 0; i < circuits.size(); i++) {
if (circuits[i]->num_bind_params > 1) {
for (int_t j = 0; j < circuits[i]->num_bind_params; j++)
circuits[i]->seed_for_params[j] =
avg_seeds[iseed++] / num_processes_;
} else
circuits[i]->seed = avg_seeds[iseed++] / num_processes_;
}
}
#endif

auto run_circuits = [this, &executors, &circuits, &noise_model, &config,
&methods, &result, &result_offset](int_t i) {
executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i],
sim_device_,
result.results.begin() + result_offset[i]);
};
Utils::apply_omp_parallel_for((parallel_experiments_ > 1), 0,
circuits.size(), run_circuits,
parallel_experiments_);

executors.clear();

// Check each experiment result for completed status.
// If only some experiments completed return partial completed status.

bool all_failed = true;
result.status = Result::Status::completed;
for (int i = 0; i < NUM_RESULTS; ++i) {
for (int i = 0; i < result.results.size(); ++i) {
auto &experiment = result.results[i];
if (experiment.status == ExperimentResult::Status::completed) {
all_failed = false;