diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ac3951fc7..517ce982e7 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -512,6 +512,7 @@ endif() if(AER_DEBUG) set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_DEBUG) + set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -g") endif() if(TEST_JSON) diff --git a/qiskit_aer/backends/aer_compiler.py b/qiskit_aer/backends/aer_compiler.py index 4909f73537..e4a3a4e9b6 100644 --- a/qiskit_aer/backends/aer_compiler.py +++ b/qiskit_aer/backends/aer_compiler.py @@ -491,6 +491,7 @@ def compile_circuit(circuits, basis_gates=None, optypes=None): "parameterizations": (list), "fusion_parallelization_threshold": (int, np.integer), "target_gpus": (list), + "runtime_parameter_bind_enable": (bool, np.bool_), } diff --git a/qiskit_aer/backends/aer_simulator.py b/qiskit_aer/backends/aer_simulator.py index d34cf1ef27..f845ecd6f0 100644 --- a/qiskit_aer/backends/aer_simulator.py +++ b/qiskit_aer/backends/aer_simulator.py @@ -318,6 +318,12 @@ class AerSimulator(AerBackend): * ``accept_distributed_results`` (bool): This option enables storing results independently in each process (Default: None). + * ``runtime_parameter_bind_enable`` (bool): If this option is True + parameters are bound at runtime by using multi-shots without constructing + circuits for each parameters. For GPU this option can be used with + ``batched_shots_gpu`` to run with multiple parameters in a batch. + (Default: False). 
+ These backend options only apply when using the ``"statevector"`` simulation method: @@ -765,6 +771,8 @@ def _default_options(cls): # tensor network options tensor_network_num_sampling_qubits=10, use_cuTensorNet_autotuning=False, + # parameter binding + runtime_parameter_bind_enable=False, ) def __repr__(self): diff --git a/qiskit_aer/backends/wrappers/aer_controller_binding.hpp b/qiskit_aer/backends/wrappers/aer_controller_binding.hpp index 67e057c74f..f614e4483d 100644 --- a/qiskit_aer/backends/wrappers/aer_controller_binding.hpp +++ b/qiskit_aer/backends/wrappers/aer_controller_binding.hpp @@ -412,6 +412,14 @@ void bind_aer_controller(MODULE m) { "target_gpus", [](const Config &config) { return config.target_gpus.val; }, [](Config &config, reg_t val) { config.target_gpus.value(val); }); + aer_config.def_property( + "runtime_parameter_bind_enable", + [](const Config &config) { + return config.runtime_parameter_bind_enable.val; + }, + [](Config &config, bool val) { + config.runtime_parameter_bind_enable.value(val); + }); aer_config.def(py::pickle( [](const AER::Config &config) { @@ -500,11 +508,12 @@ void bind_aer_controller(MODULE m) { 79, config.extended_stabilizer_norm_estimation_default_samples), write_value(80, config.shot_branching_enable), write_value(81, config.shot_branching_sampling_enable), - write_value(82, config.target_gpus)); + write_value(82, config.target_gpus), + write_value(83, config.runtime_parameter_bind_enable)); }, [](py::tuple t) { AER::Config config; - if (t.size() != 82) + if (t.size() != 84) throw std::runtime_error("Invalid serialization format."); read_value(t, 0, config.shots); @@ -594,6 +603,7 @@ void bind_aer_controller(MODULE m) { read_value(t, 80, config.shot_branching_enable); read_value(t, 81, config.shot_branching_sampling_enable); read_value(t, 82, config.target_gpus); + read_value(t, 83, config.runtime_parameter_bind_enable); return config; })); } diff --git a/releasenotes/notes/add_executor-ba4870f86ed5d8ec.yaml 
b/releasenotes/notes/add_executor-ba4870f86ed5d8ec.yaml new file mode 100644 index 0000000000..e1088061fa --- /dev/null +++ b/releasenotes/notes/add_executor-ba4870f86ed5d8ec.yaml @@ -0,0 +1,30 @@ +--- +features: + - | + This release restructures ``State`` classes. + Adding circuit executor classes that run a circuit and manage multiple + states for multi-shots simulations or multi-chunk simulations for large + number of qubits. + Previously ``StateChunk`` class managed multiple chunks for multi-shots or + multi-chunk simulations but now ``State`` class only has one state + and all the parallelization codes are moved to ``Executor`` classes. + Now all ``State`` classes are independent from parallelization. + Also some of the functions in ``Aer::Controller`` class are moved to + ``CircuitExecutor::Executor`` class. + - | + Shot-branching technique that accelerates dynamic circuits simulations + is implemented with restructured ``Executor`` classes. + Shot-branching is currently applicable to statevector, density_matrix + and tensor_network methods. + Shot-branching provides dynamic distribution of multi-shots + by branching states when applying dynamic operations + (measure, reset, initialize, noises). + By default ``shot_branching_enable`` is disabled. + And by setting ``shot_branching_sampling_enable``, final measures will be + done by sampling measure that will speed up to get counts for multiple shots + sharing the same state. + - | + New option for GPU simulation ``target_gpus`` is added. + A list of GPUs used for the simulation can be set by this option. + Without this option, all the available GPUs are used. + For example, if there are 4 GPUs, ``target_gpus=[0, 2]`` will use 2 GPUs. 
diff --git a/releasenotes/notes/runtime_parameter_binding-d2c57255f02729a1.yaml b/releasenotes/notes/runtime_parameter_binding-d2c57255f02729a1.yaml new file mode 100644 index 0000000000..04573f93f7 --- /dev/null +++ b/releasenotes/notes/runtime_parameter_binding-d2c57255f02729a1.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + A runtime parameter binding option is implemented to bind parameters at + runtime to a single circuit instead of running multiple circuits as input. + An option ``runtime_parameter_bind_enable=True`` enables this feature and + for GPU, ``batched_shots_gpu=True`` should also be set to speed up + simulating parameterized circuits. diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index e6005b9a62..d216b4ff9e 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -194,6 +194,9 @@ class Controller { int myrank_ = 0; int num_processes_ = 1; int num_process_per_experiment_ = 1; + + // runtime parameter binding + bool runtime_parameter_bind_ = false; }; //========================================================================= @@ -329,6 +332,10 @@ void Controller::set_config(const Config &config) { throw std::runtime_error(std::string("Invalid simulation precision (") + precision + std::string(").")); } + + // check if runtime binding is enabled + if (config.runtime_parameter_bind_enable.has_value()) + runtime_parameter_bind_ = config.runtime_parameter_bind_enable.value(); } void Controller::clear_config() { @@ -502,7 +509,14 @@ Result Controller::execute(std::vector> &circuits, auto methods = simulation_methods(config, circuits, noise_model); // Initialize Result object for the given number of experiments - Result result(circuits.size()); + uint_t result_size; + reg_t result_offset(circuits.size()); + result_size = 0; + for (int_t i = 0; i < circuits.size(); i++) { + result_offset[i] = result_size; + result_size += circuits[i]->num_bind_params; + } + Result result(result_size); // 
Initialize circuit executors for each circuit std::vector> executors( circuits.size()); @@ -514,12 +528,15 @@ Result Controller::execute(std::vector> &circuits, // set parallelization for experiments try { + uint_t res_pos = 0; for (int i = 0; i < circuits.size(); i++) { executors[i] = make_circuit_executor(methods[i]); required_memory_mb_list[i] = executors[i]->required_memory_mb(config, *circuits[i], noise_model); - result.results[i].metadata.add(required_memory_mb_list[i], - "required_memory_mb"); + for (int j = 0; j < circuits[i]->num_bind_params; j++) { + result.results[res_pos++].metadata.add(required_memory_mb_list[i], + "required_memory_mb"); + } } set_parallelization_experiments(required_memory_mb_list); } catch (std::exception &e) { @@ -565,33 +582,40 @@ Result Controller::execute(std::vector> &circuits, // average random seed to set the same seed to each process (when // seed_simulator is not set) if (num_processes_ > 1) { - reg_t seeds(circuits.size()); - reg_t avg_seeds(circuits.size()); - for (int_t i = 0; i < circuits.size(); i++) - seeds[i] = circuits[i]->seed; - MPI_Allreduce(seeds.data(), avg_seeds.data(), circuits.size(), - MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD); - for (int_t i = 0; i < circuits.size(); i++) - circuits[i]->seed = avg_seeds[i] / num_processes_; - } -#endif - - const int NUM_RESULTS = result.results.size(); - // following looks very similar but we have to separate them to avoid omp - // nested loops that causes performance degradation (DO NOT use if statement - // in #pragma omp) - if (parallel_experiments_ == 1) { - for (int i = 0; i < NUM_RESULTS; i++) { - executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i], - sim_device_, result.results[i]); + reg_t seeds(result_size); + reg_t avg_seeds(result_size); + int_t iseed = 0; + for (int_t i = 0; i < circuits.size(); i++) { + if (circuits[i]->num_bind_params > 1) { + for (int_t j = 0; j < circuits[i]->num_bind_params; j++) + seeds[iseed++] = 
circuits[i]->seed_for_params[j]; + } else + seeds[iseed++] = circuits[i]->seed; } - } else { -#pragma omp parallel for num_threads(parallel_experiments_) - for (int i = 0; i < NUM_RESULTS; i++) { - executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i], - sim_device_, result.results[i]); + MPI_Allreduce(seeds.data(), avg_seeds.data(), result_size, MPI_UINT64_T, + MPI_SUM, MPI_COMM_WORLD); + iseed = 0; + for (int_t i = 0; i < circuits.size(); i++) { + if (circuits[i]->num_bind_params > 1) { + for (int_t j = 0; j < circuits[i]->num_bind_params; j++) + circuits[i]->seed_for_params[j] = + avg_seeds[iseed++] / num_processes_; + } else + circuits[i]->seed = avg_seeds[iseed++] / num_processes_; } } +#endif + + auto run_circuits = [this, &executors, &circuits, &noise_model, &config, + &methods, &result, &result_offset](int_t i) { + executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i], + sim_device_, + result.results.begin() + result_offset[i]); + }; + Utils::apply_omp_parallel_for((parallel_experiments_ > 1), 0, + circuits.size(), run_circuits, + parallel_experiments_); + executors.clear(); // Check each experiment result for completed status. @@ -599,7 +623,7 @@ Result Controller::execute(std::vector> &circuits, bool all_failed = true; result.status = Result::Status::completed; - for (int i = 0; i < NUM_RESULTS; ++i) { + for (int i = 0; i < result.results.size(); ++i) { auto &experiment = result.results[i]; if (experiment.status == ExperimentResult::Status::completed) { all_failed = false; diff --git a/src/controllers/controller_execute.hpp b/src/controllers/controller_execute.hpp index 4c2015461f..f3128a7739 100644 --- a/src/controllers/controller_execute.hpp +++ b/src/controllers/controller_execute.hpp @@ -67,7 +67,12 @@ Result controller_execute(std::vector> &input_circs, // pars = [par0, par1, ...] 
is a list of different parameterizations using pos_t = std::pair; using exp_params_t = std::vector>>; - std::vector param_table = config.param_table; + std::vector &param_table = config.param_table; + + // check if runtime binding is enabled + bool runtime_parameter_bind = false; + if (config.runtime_parameter_bind_enable.has_value()) + runtime_parameter_bind = config.runtime_parameter_bind_enable.value(); // Validate parameterizations for number of circuis if (!param_table.empty() && param_table.size() != num_circs) { @@ -78,6 +83,8 @@ Result controller_execute(std::vector> &input_circs, std::vector> circs; std::vector> template_circs; + using myclock_t = std::chrono::high_resolution_clock; + auto timer_start = myclock_t::now(); try { // Load circuits for (size_t i = 0; i < num_circs; i++) { @@ -93,39 +100,41 @@ Result controller_execute(std::vector> &input_circs, circ->set_params(false); circ->set_metadata(config, truncate); // Load different parameterizations of the initial circuit - const auto circ_params = param_table[i]; + const auto &circ_params = param_table[i]; const size_t num_params = circ_params[0].second.size(); const size_t num_instr = circ->ops.size(); - for (size_t j = 0; j < num_params; j++) { + + if (runtime_parameter_bind && num_params > 1) { // Make a copy of the initial circuit auto param_circ = std::make_shared(*circ); + param_circ->num_bind_params = num_params; + for (const auto &params : circ_params) { const auto instr_pos = params.first.first; const auto param_pos = params.first.second; // Validation if (instr_pos == AER::Config::GLOBAL_PHASE_POS) { // negative position is for global phase - param_circ->global_phase_angle = params.second[j]; - } else { - if (instr_pos >= num_instr) { - std::cout << "Invalid parameterization: instruction position " - "out of range: " - << instr_pos << std::endl; - throw std::invalid_argument( - R"(Invalid parameterization: instruction position out of range)"); - } - auto &op = param_circ->ops[instr_pos]; + 
param_circ->global_phase_for_params.resize(num_params); + for (size_t j = 0; j < num_params; j++) + param_circ->global_phase_for_params[j] = params.second[j]; + } else if (instr_pos >= num_instr) { + throw std::invalid_argument( + R"(Invalid parameterized qobj: instruction position out of range)"); + } + auto &op = param_circ->ops[instr_pos]; + if (!op.has_bind_params) { if (param_pos >= op.params.size()) { throw std::invalid_argument( - R"(Invalid parameterization: instruction param position out of range)"); - } - if (j >= params.second.size()) { - throw std::invalid_argument( - R"(Invalid parameterization: parameterization value out of range)"); + R"(Invalid parameterized qobj: instruction param position out of range)"); } - // Update the param - op.params[param_pos] = params.second[j]; + // resize parameter array + op.params.resize(op.params.size() * num_params); + op.has_bind_params = true; } + uint_t stride = op.params.size() / num_params; + for (size_t j = 0; j < num_params; j++) + op.params[param_pos + stride * j] = params.second[j]; } // Run truncation. 
// TODO: Truncation should be performed and parameters should be @@ -137,7 +146,53 @@ Result controller_execute(std::vector> &input_circs, param_circ->set_metadata(config, true); } circs.push_back(param_circ); - template_circs.push_back(circ); + for (size_t j = 0; j < num_params; j++) + template_circs.push_back(circ); + } else { + for (size_t j = 0; j < num_params; j++) { + // Make a copy of the initial circuit + auto param_circ = std::make_shared(*circ); + for (const auto &params : circ_params) { + const auto instr_pos = params.first.first; + const auto param_pos = params.first.second; + // Validation + if (instr_pos == AER::Config::GLOBAL_PHASE_POS) { + // negative position is for global phase + param_circ->global_phase_angle = params.second[j]; + } else { + if (instr_pos >= num_instr) { + std::cout << "Invalid parameterization: instruction position " + "out of range: " + << instr_pos << std::endl; + throw std::invalid_argument( + R"(Invalid parameterization: instruction position out of range)"); + } + auto &op = param_circ->ops[instr_pos]; + if (param_pos >= op.params.size()) { + throw std::invalid_argument( + R"(Invalid parameterization: instruction param position out of range)"); + } + if (j >= params.second.size()) { + throw std::invalid_argument( + R"(Invalid parameterization: parameterization value out of range)"); + } + // Update the param + op.params[param_pos] = params.second[j]; + } + } + // Run truncation. + // TODO: Truncation should be performed and parameters should be + // resolved after it. However, parameters are associated with + // indices of instructions, which can be changed in truncation. + // Therefore, current implementation performs truncation for each + // parameter set. 
+ if (truncate) { + param_circ->set_params(true); + param_circ->set_metadata(config, true); + } + circs.push_back(param_circ); + template_circs.push_back(circ); + } } } } @@ -148,7 +203,6 @@ Result controller_execute(std::vector> &input_circs, result.message = std::string("Failed to load circuits: ") + e.what(); return result; } - int_t seed = -1; uint_t seed_shift = 0; @@ -157,10 +211,23 @@ Result controller_execute(std::vector> &input_circs, else seed = circs[0]->seed; - for (auto &circ : circs) { - circ->seed = seed + seed_shift; - seed_shift += 2113; + if (runtime_parameter_bind) { + for (auto &circ : circs) { + circ->seed = seed + seed_shift; + circ->seed_for_params.resize(circ->num_bind_params); + for (int_t i = 0; i < circ->num_bind_params; i++) { + circ->seed_for_params[i] = seed + seed_shift; + seed_shift += 2113; + } + } + } else { + for (auto &circ : circs) { + circ->seed = seed + seed_shift; + seed_shift += 2113; + } } + auto time_taken = + std::chrono::duration(myclock_t::now() - timer_start).count(); // Fix for MacOS and OpenMP library double initialization crash. 
// Issue: https://github.com/Qiskit/qiskit-aer/issues/1 @@ -170,6 +237,7 @@ Result controller_execute(std::vector> &input_circs, for (size_t i = 0; i < ret.results.size(); ++i) ret.results[i].circ_id = template_circs[i]->circ_id; + ret.metadata.add(time_taken, "time_taken_parameter_binding"); return ret; } diff --git a/src/framework/circuit.hpp b/src/framework/circuit.hpp index bc7645d694..a21a7fbd8c 100644 --- a/src/framework/circuit.hpp +++ b/src/framework/circuit.hpp @@ -63,6 +63,11 @@ class Circuit { double global_phase_angle = 0; bool remapped_qubits = false; // True if qubits have been remapped + // for runtime parameter bind, number of parameters per circuit + uint_t num_bind_params = 1; + reg_t seed_for_params; // random seed for each parameter + rvector_t global_phase_for_params; // global phase angles for each param + // Constructor // The constructor automatically calculates the num_qubits, num_memory, // num_registers parameters by scanning the input list of ops. diff --git a/src/framework/config.hpp b/src/framework/config.hpp index 60a5d7c313..1074f7acdf 100644 --- a/src/framework/config.hpp +++ b/src/framework/config.hpp @@ -171,6 +171,7 @@ struct Config { optional memory_blocking_bits; optional extended_stabilizer_norm_estimation_default_samples; optional target_gpus; + optional runtime_parameter_bind_enable; void clear() { shots = 1024; @@ -270,7 +271,9 @@ struct Config { unitary_parallel_threshold.clear(); memory_blocking_bits.clear(); extended_stabilizer_norm_estimation_default_samples.clear(); + target_gpus.clear(); + runtime_parameter_bind_enable.clear(); } void merge(const Config &other) { @@ -412,8 +415,12 @@ struct Config { if (other.extended_stabilizer_norm_estimation_default_samples.has_value()) extended_stabilizer_norm_estimation_default_samples.value( other.extended_stabilizer_norm_estimation_default_samples.value()); + if (other.target_gpus.has_value()) target_gpus.value(other.target_gpus.value()); + if 
(other.runtime_parameter_bind_enable.has_value()) + runtime_parameter_bind_enable.value( + other.runtime_parameter_bind_enable.value()); } }; @@ -529,6 +536,8 @@ inline void from_json(const json_t &js, Config &config) { get_value(config.extended_stabilizer_norm_estimation_default_samples, "extended_stabilizer_norm_estimation_default_samples", js); get_value(config.target_gpus, "target_gpus", js); + get_value(config.runtime_parameter_bind_enable, + "runtime_parameter_bind_enable", js); } } // namespace AER diff --git a/src/framework/operations.hpp b/src/framework/operations.hpp index 4ec55757ff..335528de59 100755 --- a/src/framework/operations.hpp +++ b/src/framework/operations.hpp @@ -308,6 +308,9 @@ struct Op { // Save DataSubType save_type = DataSubType::single; + + // runtime parameter bind + bool has_bind_params = false; }; inline std::ostream &operator<<(std::ostream &s, const Op &op) { @@ -940,6 +943,30 @@ inline Op make_qerror_loc(const reg_t &qubits, const std::string &label, return op; } +// make new op by parameter binding +inline Op bind_parameter(const Op &src, const uint_t iparam, + const uint_t num_params) { + Op op; + op.type = src.type; + op.name = src.name; + op.qubits = src.qubits; + op.conditional = src.conditional; + op.conditional_reg = src.conditional_reg; + + if (src.params.size() > 0) { + uint_t stride = src.params.size() / num_params; + op.params.resize(stride); + for (int_t i = 0; i < stride; i++) + op.params[i] = src.params[iparam * stride + i]; + } else if (src.mats.size() > 0) { + uint_t stride = src.mats.size() / num_params; + op.mats.resize(stride); + for (int_t i = 0; i < stride; i++) + op.mats[i] = src.mats[iparam * stride + i]; + } + return op; +} + //------------------------------------------------------------------------------ // JSON conversion //------------------------------------------------------------------------------ diff --git a/src/framework/results/data/metadata.hpp b/src/framework/results/data/metadata.hpp index 
cf7cb39bb1..789906b903 100644 --- a/src/framework/results/data/metadata.hpp +++ b/src/framework/results/data/metadata.hpp @@ -64,6 +64,8 @@ struct Metadata : public DataMap, // Combine stored data Metadata &combine(Metadata &&other); + + Metadata &copy(Metadata &other); }; //------------------------------------------------------------------------------ @@ -77,6 +79,13 @@ Metadata &Metadata::combine(Metadata &&other) { return *this; } +Metadata &Metadata::copy(Metadata &other) { + DataMap::copy(other); + DataMap::copy(other); + DataMap::copy(other); + return *this; +} + json_t Metadata::to_json() { json_t result = json_t::object(); DataMap::add_to_json(result); diff --git a/src/framework/results/data/subtypes/data_map.hpp b/src/framework/results/data/subtypes/data_map.hpp index 8c942ae0ac..2d46bd19f9 100644 --- a/src/framework/results/data/subtypes/data_map.hpp +++ b/src/framework/results/data/subtypes/data_map.hpp @@ -43,6 +43,9 @@ class DataMap { // Combine with another data object void combine(DataMap &&other); + // copy from another data object + void copy(DataMap &other); + // Clear all stored data void clear(); @@ -75,6 +78,9 @@ class DataMap { // Combine with another data object void combine(DataMap &&other); + // copy from another data object + void copy(DataMap &other); + // Clear all stored data void clear(); @@ -128,6 +134,22 @@ void DataMap::combine(DataMap &&other) { } } +template