Skip to content

Commit

Permalink
Fix OpenMP nested parallel (Qiskit#1880)
Browse files Browse the repository at this point in the history
* Fix OpenMP nested parallel

* add comment in release note

* fix true and false

* fix format

---------

Co-authored-by: Hiroshi Horii <hhorii@users.noreply.github.com>
  • Loading branch information
hhorii and hhorii committed Aug 9, 2023
1 parent 79b47fe commit a7b6a10
Show file tree
Hide file tree
Showing 11 changed files with 78 additions and 66 deletions.
11 changes: 11 additions & 0 deletions releasenotes/notes/fix_omp_nested-a554de2e7fd2a2d6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
fixes:
- |
OpenMP nested parallelism was not correctly enabled when the number of
input circuits was less than the number of threads.
Parallel state update was also not correctly configured when the number
of input circuits was more than 1.
This release fixes these settings to obtain more speed-up with OpenMP.
For a single-circuit, multiple-shot run in which nested parallelism is
used, `omp_nested=True` is set in the metadata of the circuit's result.
4 changes: 2 additions & 2 deletions src/controllers/aer_controller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -541,9 +541,9 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,

// nested should be set to zero if num_threads clause will be used
#if _OPENMP >= 200805
omp_set_max_active_levels(0);
omp_set_max_active_levels(2);
#else
omp_set_nested(0);
omp_set_nested(1);
#endif

result.metadata.add(parallel_nested_, "omp_nested");
Expand Down
2 changes: 1 addition & 1 deletion src/framework/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ struct Config {
optional<uint_t> unitary_parallel_threshold;
optional<uint_t> memory_blocking_bits;
optional<uint_t> extended_stabilizer_norm_estimation_default_samples;
optional<reg_t> target_gpus;
optional<reg_t> target_gpus;

void clear() {
shots = 1024;
Expand Down
16 changes: 7 additions & 9 deletions src/simulators/circuit_executor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ class Executor : public Base {
int max_parallel_shots_;
size_t max_memory_mb_;
size_t max_gpu_memory_mb_;
int num_gpus_; // max number of GPU per process
reg_t target_gpus_; //GPUs to be used
int num_gpus_; // max number of GPU per process
reg_t target_gpus_; // GPUs to be used

// use explicit parallelization
bool explicit_parallelization_;
Expand Down Expand Up @@ -293,7 +293,7 @@ void Executor<state_t>::set_config(const Config &config) {
sim_precision_ = Precision::Single;
}

//set target GPUs
// set target GPUs
#ifdef AER_THRUST_CUDA
int nDev = 0;
if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
Expand All @@ -303,16 +303,14 @@ void Executor<state_t>::set_config(const Config &config) {
#endif
if (config.target_gpus.has_value()) {
target_gpus_ = config.target_gpus.value();
if( nDev < target_gpus_.size()){
throw std::invalid_argument(
"target_gpus has more GPUs than available.");
if (nDev < target_gpus_.size()) {
throw std::invalid_argument("target_gpus has more GPUs than available.");
}
num_gpus_ = target_gpus_.size();
}
else{
} else {
num_gpus_ = nDev;
target_gpus_.resize(num_gpus_);
for(int_t i=0;i<num_gpus_;i++)
for (int_t i = 0; i < num_gpus_; i++)
target_gpus_[i] = i;
}
}
Expand Down
1 change: 0 additions & 1 deletion src/simulators/density_matrix/densitymatrix_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,6 @@ template <class densmat_t>
size_t State<densmat_t>::required_memory_mb(
uint_t num_qubits, const std::vector<Operations::Op> &ops) const {
(void)ops; // avoid unused variable compiler warning
(void)ops; // avoid unused variable compiler warning
densmat_t tmp;
return tmp.required_memory_mb(2 * num_qubits);
}
Expand Down
28 changes: 18 additions & 10 deletions src/simulators/multi_state_executor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,15 +234,15 @@ bool MultiStateExecutor<state_t>::allocate_states(uint_t num_shots,
num_active_states_ = num_shots;

// initialize groups
top_state_of_group_.resize(num_shots);
num_states_in_group_.resize(num_shots);
num_groups_ = num_shots;
top_state_of_group_.resize(1);
num_states_in_group_.resize(1);
num_groups_ = 1;
top_state_of_group_[0] = 0;
num_states_in_group_[0] = num_shots;

for (i = 0; i < num_shots; i++) {
states_[i].set_config(config);
states_[i].set_num_global_qubits(num_qubits_);

top_state_of_group_[i] = i;
num_states_in_group_[i] = 1;
}

return ret;
Expand Down Expand Up @@ -315,7 +315,7 @@ void MultiStateExecutor<state_t>::run_circuit_shots(

int_t par_shots;
if (Base::sim_device_ == Device::GPU) {
par_shots = std::min((int_t)Base::parallel_shots_, (int_t)num_groups_);
par_shots = num_groups_;
} else {
par_shots =
std::min((int_t)Base::parallel_shots_, (int_t)num_local_states_);
Expand All @@ -332,9 +332,15 @@ void MultiStateExecutor<state_t>::run_circuit_shots(
nshots -= ishot;

// state distribution
uint_t istate = i * num_active_states_ / par_shots;
uint_t nstates = (i + 1) * num_active_states_ / par_shots;
nstates -= istate;
uint_t istate, nstates;
if (Base::sim_device_ == Device::GPU) {
istate = top_state_of_group_[i];
nstates = num_states_in_group_[i];
} else {
istate = i * num_active_states_ / par_shots;
nstates = (i + 1) * num_active_states_ / par_shots;
nstates -= istate;
}

if (nshots > 0) {
if (sample_noise) {
Expand Down Expand Up @@ -422,6 +428,8 @@ void MultiStateExecutor<state_t>::run_circuit_with_shot_branching(
}

int_t par_shots = std::min(shot_branch_parallel_, (int_t)num_states);
if (par_shots == 0)
par_shots = 1;

// initialize local shots
std::vector<RngEngine> shots_storage(nshots);
Expand Down
25 changes: 12 additions & 13 deletions src/simulators/statevector/chunk/chunk_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class ChunkManager {
uint_t num_creg_bits_ = 0;

reg_t target_gpus_;

public:
ChunkManager();

Expand All @@ -72,7 +73,7 @@ class ChunkManager {

uint_t Allocate(int chunk_bits, int nqubits, uint_t nchunks,
uint_t chunk_index, int matrix_bit, bool density_mat,
reg_t& gpus, bool enable_cuStatevec);
reg_t &gpus, bool enable_cuStatevec);
void Free(void);

int num_devices(void) { return num_devices_; }
Expand Down Expand Up @@ -161,8 +162,7 @@ template <typename data_t>
uint_t ChunkManager<data_t>::Allocate(int chunk_bits, int nqubits,
uint_t nchunks, uint_t chunk_index,
int matrix_bit, bool density_mat,
reg_t& gpus, bool enable_cuStatevec)
{
reg_t &gpus, bool enable_cuStatevec) {
uint_t num_buffers;
int iDev;
uint_t is, ie, nc;
Expand All @@ -186,14 +186,13 @@ uint_t ChunkManager<data_t>::Allocate(int chunk_bits, int nqubits,

enable_cuStatevec_ = enable_cuStatevec;
target_gpus_ = gpus;
if(target_gpus_.size() > 0){
if (target_gpus_.size() > 0) {
num_devices_ = target_gpus_.size();
if(num_devices_ > 1)
if (num_devices_ > 1)
multi_gpu = true;
}
else{
} else {
target_gpus_.resize(num_devices_);
for(iDev=0;iDev<num_devices_;iDev++){
for (iDev = 0; iDev < num_devices_; iDev++) {
target_gpus_[iDev] = iDev;
}
}
Expand Down Expand Up @@ -309,12 +308,12 @@ uint_t ChunkManager<data_t>::Allocate(int chunk_bits, int nqubits,
chunk_index_ +
chunks_allocated); // set first chunk index for the container
chunks_[iDev]->set_num_creg_bits(num_creg_bits_);
if (num_devices_ > 0){
if (num_devices_ > 0) {
int id = target_gpus_[(iDev + idev_start) % num_devices_];
chunks_allocated += chunks_[iDev]->Allocate(id, chunk_bits, nqubits,
nc, num_buffers, multi_shots_, matrix_bit, density_matrix_);
}
else{
chunks_allocated +=
chunks_[iDev]->Allocate(id, chunk_bits, nqubits, nc, num_buffers,
multi_shots_, matrix_bit, density_matrix_);
} else {
chunks_allocated +=
chunks_[iDev]->Allocate(iDev, chunk_bits, nqubits, nc, num_buffers,
multi_shots_, matrix_bit, density_matrix_);
Expand Down
36 changes: 16 additions & 20 deletions src/simulators/statevector/chunk/device_chunk_container.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,28 +321,24 @@ uint_t DeviceChunkContainer<data_t>::Allocate(int idev, int chunk_bits,
set_device();

#ifdef AER_THRUST_CUDA
if (!multi_shots) {
int ip, nd;
cudaGetDeviceCount(&nd);
peer_access_.resize(nd);
for (i = 0; i < nd; i++) {
ip = 1;
if (i != device_id_) {
cudaDeviceCanAccessPeer(&ip, device_id_, i);
}
if (ip) {
if (cudaDeviceEnablePeerAccess(i, 0) != cudaSuccess)
cudaGetLastError();
peer_access_[i] = true;
} else
peer_access_[i] = false;
int ip, nd;
cudaGetDeviceCount(&nd);
peer_access_.resize(nd);
for (i = 0; i < nd; i++) {
ip = 1;
if (i != device_id_) {
cudaDeviceCanAccessPeer(&ip, device_id_, i);
}
} else {
#endif
peer_access_.resize(1);
peer_access_[0] = true;
#ifdef AER_THRUST_CUDA
if (ip) {
if (cudaDeviceEnablePeerAccess(i, 0) != cudaSuccess)
cudaGetLastError();
peer_access_[i] = true;
} else
peer_access_[i] = false;
}
#else
peer_access_.resize(1);
peer_access_[0] = true;
#endif

this->num_buffers_ = buffers;
Expand Down
5 changes: 3 additions & 2 deletions src/simulators/statevector/qubitvector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ class QubitVector {
// cuStateVec
void cuStateVec_enable(bool flg) {}

void set_target_gpus(reg_t& t) {}
void set_target_gpus(reg_t &t) {}

//-----------------------------------------------------------------------
// Optimization configuration settings
Expand All @@ -449,7 +449,6 @@ class QubitVector {
virtual bool enable_batch(bool flg) const { return false; }

bool support_global_indexing(void) { return false; }


protected:
//-----------------------------------------------------------------------
Expand Down Expand Up @@ -971,6 +970,8 @@ size_t QubitVector<data_t>::required_memory_mb(uint_t num_qubits) const {

size_t unit = std::log2(sizeof(std::complex<data_t>));
size_t shift_mb = std::max<int_t>(0, num_qubits + unit - 20);
if (shift_mb >= 63)
return SIZE_MAX;
size_t mem_mb = 1ULL << shift_mb;
return mem_mb;
}
Expand Down
9 changes: 6 additions & 3 deletions src/simulators/statevector/qubitvector_thrust.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ class QubitVectorThrust {

bool support_global_indexing(void) { return (!cuStateVec_enable_); }

void set_target_gpus(reg_t& t) { target_gpus_ = t;}
void set_target_gpus(reg_t &t) { target_gpus_ = t; }
//-----------------------------------------------------------------------
// Optimization configuration settings
//-----------------------------------------------------------------------
Expand Down Expand Up @@ -690,6 +690,7 @@ void QubitVectorThrust<data_t>::copy_qv(const QubitVectorThrust<data_t> &obj) {
}
set_num_qubits(obj.num_qubits());

chunk_.set_device();
chunk_.CopyIn(obj.chunk_);
}
//------------------------------------------------------------------------------
Expand Down Expand Up @@ -871,7 +872,8 @@ bool QubitVectorThrust<data_t>::chunk_setup(int chunk_bits, int num_qubits,
chunk_manager_->set_num_creg_bits(num_creg_bits_ + num_cmem_bits_);
chunk_manager_->Allocate(chunk_bits, num_qubits, num_local_chunks,
chunk_index_, max_matrix_bits_,
is_density_matrix(), target_gpus_, cuStateVec_enable_);
is_density_matrix(), target_gpus_,
cuStateVec_enable_);
}

multi_chunk_distribution_ = false;
Expand Down Expand Up @@ -959,8 +961,9 @@ size_t QubitVectorThrust<data_t>::required_memory_mb(uint_t num_qubits) const {

size_t unit = std::log2(sizeof(std::complex<data_t>));
size_t shift_mb = std::max<int_t>(0, num_qubits + unit - 20);
if (shift_mb >= 63)
return SIZE_MAX;
size_t mem_mb = 1ULL << shift_mb;

return mem_mb;
}

Expand Down
7 changes: 2 additions & 5 deletions src/simulators/unitary/unitary_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,12 +321,9 @@ void State<unitary_matrix_t>::apply_op(const Operations::Op &op,
template <class unitary_matrix_t>
size_t State<unitary_matrix_t>::required_memory_mb(
uint_t num_qubits, const std::vector<Operations::Op> &ops) const {
// An n-qubit unitary as 2^2n complex doubles
// where each complex double is 16 bytes
(void)ops; // avoid unused variable compiler warning
size_t shift_mb = std::max<int_t>(0, num_qubits + 4 - 20);
size_t mem_mb = 1ULL << (2 * shift_mb);
return mem_mb;
unitary_matrix_t tmp;
return tmp.required_memory_mb(2 * num_qubits);
}

template <class unitary_matrix_t>
Expand Down

0 comments on commit a7b6a10

Please sign in to comment.