Skip to content

Commit

Permalink
Fix OpenMP nested parallel (Qiskit#1880)
Browse files Browse the repository at this point in the history
* Fix OpenMP nested parallel

* add comment in release note

* fix true and false

* fix format

---------

Co-authored-by: Hiroshi Horii <hhorii@users.noreply.github.com>
  • Loading branch information
hhorii and hhorii committed Aug 9, 2023
1 parent 79b47fe commit a7b6a10
Show file tree
Hide file tree
Showing 11 changed files with 78 additions and 66 deletions.
11 changes: 11 additions & 0 deletions releasenotes/notes/fix_omp_nested-a554de2e7fd2a2d6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
fixes:
- |
OpenMP nested parallelism was not correctly enabled when the number of
input circuits was less than the number of threads.
Parallel state update was also not correctly configured when the number
of input circuits was more than 1.
This release fixes these settings to obtain more speed-up with OpenMP.
For a single-circuit, multiple-shot run in which nested parallelism is
used, `omp_nested=True` is set in the metadata of the circuit's result.
4 changes: 2 additions & 2 deletions src/controllers/aer_controller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -541,9 +541,9 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,

// nested should be set to zero if num_threads clause will be used
#if _OPENMP >= 200805
omp_set_max_active_levels(0);
omp_set_max_active_levels(2);
#else
omp_set_nested(0);
omp_set_nested(1);
#endif

result.metadata.add(parallel_nested_, "omp_nested");
Expand Down
2 changes: 1 addition & 1 deletion src/framework/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ struct Config {
optional<uint_t> unitary_parallel_threshold;
optional<uint_t> memory_blocking_bits;
optional<uint_t> extended_stabilizer_norm_estimation_default_samples;
optional<reg_t> target_gpus;
optional<reg_t> target_gpus;

void clear() {
shots = 1024;
Expand Down
16 changes: 7 additions & 9 deletions src/simulators/circuit_executor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ class Executor : public Base {
int max_parallel_shots_;
size_t max_memory_mb_;
size_t max_gpu_memory_mb_;
int num_gpus_; // max number of GPU per process
reg_t target_gpus_; //GPUs to be used
int num_gpus_; // max number of GPU per process
reg_t target_gpus_; // GPUs to be used

// use explicit parallelization
bool explicit_parallelization_;
Expand Down Expand Up @@ -293,7 +293,7 @@ void Executor<state_t>::set_config(const Config &config) {
sim_precision_ = Precision::Single;
}

//set target GPUs
// set target GPUs
#ifdef AER_THRUST_CUDA
int nDev = 0;
if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
Expand All @@ -303,16 +303,14 @@ void Executor<state_t>::set_config(const Config &config) {
#endif
if (config.target_gpus.has_value()) {
target_gpus_ = config.target_gpus.value();
if( nDev < target_gpus_.size()){
throw std::invalid_argument(
"target_gpus has more GPUs than available.");
if (nDev < target_gpus_.size()) {
throw std::invalid_argument("target_gpus has more GPUs than available.");
}
num_gpus_ = target_gpus_.size();
}
else{
} else {
num_gpus_ = nDev;
target_gpus_.resize(num_gpus_);
for(int_t i=0;i<num_gpus_;i++)
for (int_t i = 0; i < num_gpus_; i++)
target_gpus_[i] = i;
}
}
Expand Down
1 change: 0 additions & 1 deletion src/simulators/density_matrix/densitymatrix_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,6 @@ template <class densmat_t>
size_t State<densmat_t>::required_memory_mb(
uint_t num_qubits, const std::vector<Operations::Op> &ops) const {
(void)ops; // avoid unused variable compiler warning
(void)ops; // avoid unused variable compiler warning
densmat_t tmp;
return tmp.required_memory_mb(2 * num_qubits);
}
Expand Down
28 changes: 18 additions & 10 deletions src/simulators/multi_state_executor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,15 +234,15 @@ bool MultiStateExecutor<state_t>::allocate_states(uint_t num_shots,
num_active_states_ = num_shots;

// initialize groups
top_state_of_group_.resize(num_shots);
num_states_in_group_.resize(num_shots);
num_groups_ = num_shots;
top_state_of_group_.resize(1);
num_states_in_group_.resize(1);
num_groups_ = 1;
top_state_of_group_[0] = 0;
num_states_in_group_[0] = num_shots;

for (i = 0; i < num_shots; i++) {
states_[i].set_config(config);
states_[i].set_num_global_qubits(num_qubits_);

top_state_of_group_[i] = i;
num_states_in_group_[i] = 1;
}

return ret;
Expand Down Expand Up @@ -315,7 +315,7 @@ void MultiStateExecutor<state_t>::run_circuit_shots(

int_t par_shots;
if (Base::sim_device_ == Device::GPU) {
par_shots = std::min((int_t)Base::parallel_shots_, (int_t)num_groups_);
par_shots = num_groups_;
} else {
par_shots =
std::min((int_t)Base::parallel_shots_, (int_t)num_local_states_);
Expand All @@ -332,9 +332,15 @@ void MultiStateExecutor<state_t>::run_circuit_shots(
nshots -= ishot;

// state distribution
uint_t istate = i * num_active_states_ / par_shots;
uint_t nstates = (i + 1) * num_active_states_ / par_shots;
nstates -= istate;
uint_t istate, nstates;
if (Base::sim_device_ == Device::GPU) {
istate = top_state_of_group_[i];
nstates = num_states_in_group_[i];
} else {
istate = i * num_active_states_ / par_shots;
nstates = (i + 1) * num_active_states_ / par_shots;
nstates -= istate;
}

if (nshots > 0) {
if (sample_noise) {
Expand Down Expand Up @@ -422,6 +428,8 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
}

int_t par_shots = std::min(shot_branch_parallel_, (int_t)num_states);
if (par_shots == 0)
par_shots = 1;

// initialize local shots
std::vector<RngEngine> shots_storage(nshots);
Expand Down
25 changes: 12 additions & 13 deletions src/simulators/statevector/chunk/chunk_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class ChunkManager {
uint_t num_creg_bits_ = 0;

reg_t target_gpus_;

public:
ChunkManager();

Expand All @@ -72,7 +73,7 @@ class ChunkManager {

uint_t Allocate(int chunk_bits, int nqubits, uint_t nchunks,
uint_t chunk_index, int matrix_bit, bool density_mat,
reg_t& gpus, bool enable_cuStatevec);
reg_t &gpus, bool enable_cuStatevec);
void Free(void);

int num_devices(void) { return num_devices_; }
Expand Down Expand Up @@ -161,8 +162,7 @@ template <typename data_t>
uint_t ChunkManager<data_t>::Allocate(int chunk_bits, int nqubits,
uint_t nchunks, uint_t chunk_index,
int matrix_bit, bool density_mat,
reg_t& gpus, bool enable_cuStatevec)
{
reg_t &gpus, bool enable_cuStatevec) {
uint_t num_buffers;
int iDev;
uint_t is, ie, nc;
Expand All @@ -186,14 +186,13 @@ uint_t ChunkManager<data_t>::Allocate(int chunk_bits, int nqubits,

enable_cuStatevec_ = enable_cuStatevec;
target_gpus_ = gpus;
if(target_gpus_.size() > 0){
if (target_gpus_.size() > 0) {
num_devices_ = target_gpus_.size();
if(num_devices_ > 1)
if (num_devices_ > 1)
multi_gpu = true;
}
else{
} else {
target_gpus_.resize(num_devices_);
for(iDev=0;iDev<num_devices_;iDev++){
for (iDev = 0; iDev < num_devices_; iDev++) {
target_gpus_[iDev] = iDev;
}
}
Expand Down Expand Up @@ -309,12 +308,12 @@ uint_t ChunkManager<data_t>::Allocate(int chunk_bits, int nqubits,
chunk_index_ +
chunks_allocated); // set first chunk index for the container
chunks_[iDev]->set_num_creg_bits(num_creg_bits_);
if (num_devices_ > 0){
if (num_devices_ > 0) {
int id = target_gpus_[(iDev + idev_start) % num_devices_];
chunks_allocated += chunks_[iDev]->Allocate(id, chunk_bits, nqubits,
nc, num_buffers, multi_shots_, matrix_bit, density_matrix_);
}
else{
chunks_allocated +=
chunks_[iDev]->Allocate(id, chunk_bits, nqubits, nc, num_buffers,
multi_shots_, matrix_bit, density_matrix_);
} else {
chunks_allocated +=
chunks_[iDev]->Allocate(iDev, chunk_bits, nqubits, nc, num_buffers,
multi_shots_, matrix_bit, density_matrix_);
Expand Down
36 changes: 16 additions & 20 deletions src/simulators/statevector/chunk/device_chunk_container.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,28 +321,24 @@ uint_t DeviceChunkContainer<data_t>::Allocate(int idev, int chunk_bits,
set_device();

#ifdef AER_THRUST_CUDA
if (!multi_shots) {
int ip, nd;
cudaGetDeviceCount(&nd);
peer_access_.resize(nd);
for (i = 0; i < nd; i++) {
ip = 1;
if (i != device_id_) {
cudaDeviceCanAccessPeer(&ip, device_id_, i);
}
if (ip) {
if (cudaDeviceEnablePeerAccess(i, 0) != cudaSuccess)
cudaGetLastError();
peer_access_[i] = true;
} else
peer_access_[i] = false;
int ip, nd;
cudaGetDeviceCount(&nd);
peer_access_.resize(nd);
for (i = 0; i < nd; i++) {
ip = 1;
if (i != device_id_) {
cudaDeviceCanAccessPeer(&ip, device_id_, i);
}
} else {
#endif
peer_access_.resize(1);
peer_access_[0] = true;
#ifdef AER_THRUST_CUDA
if (ip) {
if (cudaDeviceEnablePeerAccess(i, 0) != cudaSuccess)
cudaGetLastError();
peer_access_[i] = true;
} else
peer_access_[i] = false;
}
#else
peer_access_.resize(1);
peer_access_[0] = true;
#endif

this->num_buffers_ = buffers;
Expand Down
5 changes: 3 additions & 2 deletions src/simulators/statevector/qubitvector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ class QubitVector {
// cuStateVec
void cuStateVec_enable(bool flg) {}

void set_target_gpus(reg_t& t) {}
void set_target_gpus(reg_t &t) {}

//-----------------------------------------------------------------------
// Optimization configuration settings
Expand All @@ -449,7 +449,6 @@ class QubitVector {
virtual bool enable_batch(bool flg) const { return false; }

bool support_global_indexing(void) { return false; }


protected:
//-----------------------------------------------------------------------
Expand Down Expand Up @@ -971,6 +970,8 @@ size_t QubitVector<data_t>::required_memory_mb(uint_t num_qubits) const {

size_t unit = std::log2(sizeof(std::complex<data_t>));
size_t shift_mb = std::max<int_t>(0, num_qubits + unit - 20);
if (shift_mb >= 63)
return SIZE_MAX;
size_t mem_mb = 1ULL << shift_mb;
return mem_mb;
}
Expand Down
9 changes: 6 additions & 3 deletions src/simulators/statevector/qubitvector_thrust.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ class QubitVectorThrust {

bool support_global_indexing(void) { return (!cuStateVec_enable_); }

void set_target_gpus(reg_t& t) { target_gpus_ = t;}
void set_target_gpus(reg_t &t) { target_gpus_ = t; }
//-----------------------------------------------------------------------
// Optimization configuration settings
//-----------------------------------------------------------------------
Expand Down Expand Up @@ -690,6 +690,7 @@ void QubitVectorThrust<data_t>::copy_qv(const QubitVectorThrust<data_t> &obj) {
}
set_num_qubits(obj.num_qubits());

chunk_.set_device();
chunk_.CopyIn(obj.chunk_);
}
//------------------------------------------------------------------------------
Expand Down Expand Up @@ -871,7 +872,8 @@ bool QubitVectorThrust<data_t>::chunk_setup(int chunk_bits, int num_qubits,
chunk_manager_->set_num_creg_bits(num_creg_bits_ + num_cmem_bits_);
chunk_manager_->Allocate(chunk_bits, num_qubits, num_local_chunks,
chunk_index_, max_matrix_bits_,
is_density_matrix(), target_gpus_, cuStateVec_enable_);
is_density_matrix(), target_gpus_,
cuStateVec_enable_);
}

multi_chunk_distribution_ = false;
Expand Down Expand Up @@ -959,8 +961,9 @@ size_t QubitVectorThrust<data_t>::required_memory_mb(uint_t num_qubits) const {

size_t unit = std::log2(sizeof(std::complex<data_t>));
size_t shift_mb = std::max<int_t>(0, num_qubits + unit - 20);
if (shift_mb >= 63)
return SIZE_MAX;
size_t mem_mb = 1ULL << shift_mb;

return mem_mb;
}

Expand Down
7 changes: 2 additions & 5 deletions src/simulators/unitary/unitary_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,12 +321,9 @@ void State<unitary_matrix_t>::apply_op(const Operations::Op &op,
template <class unitary_matrix_t>
size_t State<unitary_matrix_t>::required_memory_mb(
uint_t num_qubits, const std::vector<Operations::Op> &ops) const {
// An n-qubit unitary as 2^2n complex doubles
// where each complex double is 16 bytes
(void)ops; // avoid unused variable compiler warning
size_t shift_mb = std::max<int_t>(0, num_qubits + 4 - 20);
size_t mem_mb = 1ULL << (2 * shift_mb);
return mem_mb;
unitary_matrix_t tmp;
return tmp.required_memory_mb(2 * num_qubits);
}

template <class unitary_matrix_t>
Expand Down

0 comments on commit a7b6a10

Please sign in to comment.