Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix inconsistent calls to nvml::Init and nvml::Shutdown #5317

Merged
merged 8 commits into from
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions dali/core/mm/malloc_resource.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,8 @@ cuda_malloc_async_memory_resource::cuda_malloc_async_memory_resource(int device_
dummy_host_stream_ = CUDAStreamPool::instance().Get(device_id_);
#if NVML_ENABLED
static const float driverVersion = []() {
nvml::Init();
auto nvml_handle = nvml::NvmlInstance::CreateNvmlInstance();
auto ret = nvml::GetDriverVersion();
nvml::Shutdown();
return ret;
}();
if (driverVersion < 470.60) {
Expand Down
9 changes: 5 additions & 4 deletions dali/operators/decoder/nvjpeg/nvjpeg_decoder_decoupled_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ class nvJPEGDecoder : public StatelessOperator<MixedBackend>, CachedDecoderImpl
// disable HW decoder for drivers < 455.x as the memory pool for it is not available
// and multi GPU performance is far from perfect due to frequent memory allocations
#if NVML_ENABLED
nvml::Init();
nvml_handle_ = nvml::NvmlInstance::CreateNvmlInstance();
float driverVersion = nvml::GetDriverVersion();
if (driverVersion < 455) {
try_init_hw_decoder = false,
Expand Down Expand Up @@ -297,9 +297,6 @@ class nvJPEGDecoder : public StatelessOperator<MixedBackend>, CachedDecoderImpl
try {
DeviceGuard g(device_id_);

#if NVML_ENABLED
nvml::Shutdown();
#endif
if (hw_decode_stream_)
CUDA_CALL(cudaStreamSynchronize(hw_decode_stream_));

Expand Down Expand Up @@ -1186,6 +1183,10 @@ class nvJPEGDecoder : public StatelessOperator<MixedBackend>, CachedDecoderImpl
int64_t task_priority_seq_ = 0;
unsigned int num_hw_engines_ = 1;
unsigned int num_hw_cores_per_engine_ = 1;

#if NVML_ENABLED
nvml::NvmlInstance nvml_handle_;
#endif
};

} // namespace dali
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ TYPED_TEST(nvjpegDecodeDecoupledAPITest, TestSingleTiffDecode4T) {
#if NVJPEG_VER_MAJOR >= 11 && NVML_ENABLED
void PrintDeviceInfo() {
unsigned int device_count;
nvml::Init();
auto nvml_handle = nvml::NvmlInstance::CreateNvmlInstance();
CUDA_CALL(nvmlDeviceGetCount_v2(&device_count));
for (unsigned int device_idx = 0; device_idx < device_count; device_idx++) {
auto info = nvml::GetDeviceInfo(device_idx);
Expand All @@ -195,7 +195,6 @@ void PrintDeviceInfo() {
<< " cc_m " << info.cap_minor
<< std::endl;
}
nvml::Shutdown();
}

/**
Expand All @@ -204,9 +203,8 @@ void PrintDeviceInfo() {
/**
 * @brief Decides whether the HW JPEG decoder should be used on this machine.
 *
 * HW decoder is disabled for drivers < 455.x, see
 * dali/operators/decoder/nvjpeg/nvjpeg_decoder_decoupled_api.h for details.
 *
 * @return true when the device supports the HW decoder and the driver is
 *         recent enough (>= 455).
 */
bool ShouldUseHwDecoder() {
  // RAII handle keeps NVML initialized for the duration of the queries below;
  // nvml::Shutdown() runs automatically when the handle goes out of scope.
  auto nvml_handle = nvml::NvmlInstance::CreateNvmlInstance();
  // Both values are queried once per process — the statics cache the result.
  static float driver_version = nvml::GetDriverVersion();
  static bool device_supports_hw_decoder = nvml::isHWDecoderSupported();
  return device_supports_hw_decoder && driver_version >= 455;
}
Expand Down
3 changes: 1 addition & 2 deletions dali/operators/reader/gds_mem_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,8 @@ void SkipIfIncompatible(TestBody &&body) {
// skip test for aarch64 and CUDA < 12.2
#if NVML_ENABLED
static const int driverVersion = []() {
nvml::Init();
auto nvml_handle = nvml::NvmlInstance::CreateNvmlInstance();
auto ret = nvml::GetCudaDriverVersion();
nvml::Shutdown();
return ret;
}();
#if defined(__aarch64__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,8 @@ void VideoSampleGpu::Decode() {
void VideoLoaderDecoderGpu::InitCudaStream() {
#if NVML_ENABLED
{
nvml::Init();
auto nvml_handle = nvml::NvmlInstance::CreateNvmlInstance();
static float driver_version = nvml::GetDriverVersion();
nvml::Shutdown();
if (driver_version > 460 && driver_version < 470.21) {
DALI_WARN_ONCE("Warning: Decoding on a default stream. Performance may be affected.");
return;
Expand Down
10 changes: 2 additions & 8 deletions dali/operators/reader/nvdecoder/nvdecoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,6 @@
#include "dali/core/error_handling.h"
#include "dali/operators/reader/nvdecoder/imgproc.h"
#include "dali/core/device_guard.h"
#include "dali/util/nvml.h"

namespace dali {

static constexpr int kNvcuvid_success = 1;
Expand Down Expand Up @@ -64,7 +62,7 @@ NvDecoder::NvDecoder(int device_id,
bool use_default_stream = false;
#if NVML_ENABLED
{
nvml::Init();
nvml_handle_ = nvml::NvmlInstance::CreateNvmlInstance();
static float driver_version = nvml::GetDriverVersion();
if (driver_version > 460 && driver_version < 470.21)
use_default_stream = true;
Expand Down Expand Up @@ -135,11 +133,7 @@ bool NvDecoder::initialized() const {
return parser_.initialized();
}

NvDecoder::~NvDecoder() {
#if NVML_ENABLED
nvml::Shutdown();
#endif
}
NvDecoder::~NvDecoder() {}

VidReqStatus NvDecoder::decode_av_packet(AVPacket* avpkt, int64_t start_time,
AVRational stream_base) {
Expand Down
7 changes: 7 additions & 0 deletions dali/operators/reader/nvdecoder/nvdecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ extern "C" {
#include "dali/operators/reader/nvdecoder/cuvideodecoder.h"
#include "dali/operators/reader/nvdecoder/dynlink_nvcuvid.h"
#include "dali/util/thread_safe_queue.h"
#if NVML_ENABLED
#include "dali/util/nvml.h"
#endif


struct AVPacket;
#if HAVE_AVSTREAM_CODECPAR
Expand Down Expand Up @@ -230,6 +234,9 @@ class NvDecoder {
std::exception_ptr captured_exception_;

std::thread thread_convert_;
#if NVML_ENABLED
nvml::NvmlInstance nvml_handle_;
#endif
};

} // namespace dali
Expand Down
3 changes: 1 addition & 2 deletions dali/operators/reader/video_reader_op_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,8 @@ TEST_F(VideoReaderTest, MultipleVideoResolution) {
float driverVersion = 0;

#if NVML_ENABLED
nvml::Init();
auto nvml_handle = nvml::NvmlInstance::CreateNvmlInstance();
driverVersion = nvml::GetDriverVersion();
nvml::Shutdown();
#endif


Expand Down
13 changes: 7 additions & 6 deletions dali/operators/sequence/optical_flow/optical_flow.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,12 @@ class OpticalFlow : public StatelessOperator<Backend> {
std::to_string(spec.NumInput()));
sync_ = CUDAEvent::Create(device_id_);
#if NVML_ENABLED
nvml::Init();
nvml_handle_ = nvml::NvmlInstance::CreateNvmlInstance();
#endif
}

~OpticalFlow() {
#if NVML_ENABLED
nvml::Shutdown();
#endif
}
~OpticalFlow() {}

DISABLE_COPY_MOVE_ASSIGN(OpticalFlow);

protected:
Expand Down Expand Up @@ -230,6 +227,10 @@ class OpticalFlow : public StatelessOperator<Backend> {
std::vector<int> sequence_sizes_;
std::vector<DimsOrder> processing_order_;
CUDAEvent sync_;

#if NVML_ENABLED
nvml::NvmlInstance nvml_handle_;
#endif
};

} // namespace dali
Expand Down
5 changes: 1 addition & 4 deletions dali/pipeline/util/thread_pool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ ThreadPool::ThreadPool(int num_thread, int device_id, bool set_affinity, const c
#if NVML_ENABLED
// only for the CPU pipeline
if (device_id != CPU_ONLY_DEVICE_ID) {
nvml::Init();
nvml_handle_ = nvml::NvmlInstance::CreateNvmlInstance();
}
#endif
// Start the threads in the main loop
Expand All @@ -54,9 +54,6 @@ ThreadPool::~ThreadPool() {
for (auto &thread : threads_) {
thread.join();
}
#if NVML_ENABLED
nvml::Shutdown();
#endif
}

void ThreadPool::AddWork(Work work, int64_t priority, bool start_immediately) {
Expand Down
6 changes: 6 additions & 0 deletions dali/pipeline/util/thread_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
#include <vector>
#include <string>
#include "dali/core/common.h"
#if NVML_ENABLED
#include "dali/util/nvml.h"
#endif


namespace dali {
Expand Down Expand Up @@ -93,6 +96,9 @@ class DLL_PUBLIC ThreadPool {

// Stored error strings for each thread
vector<std::queue<string>> tl_errors_;
#if NVML_ENABLED
nvml::NvmlInstance nvml_handle_;
#endif
};

} // namespace dali
Expand Down
8 changes: 4 additions & 4 deletions dali/pipeline/util/worker_thread.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class WorkerThread {
running_(true), work_complete_(true), barrier_(2) {
#if NVML_ENABLED
if (device_id != CPU_ONLY_DEVICE_ID) {
nvml::Init();
nvml_handle_ = nvml::NvmlInstance::CreateNvmlInstance();
}
#endif
thread_ = std::thread(&WorkerThread::ThreadMain,
Expand All @@ -80,9 +80,6 @@ class WorkerThread {

inline ~WorkerThread() {
Shutdown();
#if NVML_ENABLED
nvml::Shutdown();
#endif
}

/*
Expand Down Expand Up @@ -236,6 +233,9 @@ class WorkerThread {
std::queue<string> errors_;

Barrier barrier_;
#if NVML_ENABLED
nvml::NvmlInstance nvml_handle_;
#endif
};

} // namespace dali
Expand Down
39 changes: 39 additions & 0 deletions dali/util/nvml.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,45 @@ inline void Shutdown() {
CUDA_CALL(nvmlShutdown());
}


/**
 * @brief RAII wrapper for the NVML library lifetime.
 *
 * An instance created via CreateNvmlInstance() calls Init() on construction
 * and guarantees a single matching Shutdown() call when the owning instance
 * is destroyed. The class is move-only, so ownership of the Init/Shutdown
 * pairing can be transferred but never duplicated.
 */
class NvmlInstance {
 public:
  /// Creates an instance that owns an initialized NVML session.
  static NvmlInstance CreateNvmlInstance() {
    return NvmlInstance(true);
  }

  /// @param init when true, initializes NVML and takes ownership of the session
  explicit NvmlInstance(bool init = false) {
    if (init) {
      Init();
      is_created_ = true;
    }
  }

  NvmlInstance(const NvmlInstance &) = delete;

  NvmlInstance &operator=(const NvmlInstance &) = delete;

  // Move transfers ownership of the session; the source is left empty.
  inline NvmlInstance(NvmlInstance &&other) noexcept
      : is_created_(std::exchange(other.is_created_, false)) {}

  // Releases our own session (if any), then takes over the other one.
  // NOTE(review): the previous implementation swapped the flags and then
  // explicitly invoked other.~NvmlInstance(), which ran the destructor twice
  // on `other` (once explicitly, once at end of its scope) and dropped the
  // session on self-move-assignment. Not marked noexcept because Shutdown()
  // may report errors through CUDA_CALL.
  inline NvmlInstance &operator=(NvmlInstance &&other) {
    if (this != &other) {
      if (is_created_) {
        Shutdown();
      }
      is_created_ = std::exchange(other.is_created_, false);
    }
    return *this;
  }

  ~NvmlInstance() {
    if (is_created_) {
      Shutdown();
      is_created_ = false;
    }
  }

 private:
  // True when this instance owns a successful Init() call that must be
  // balanced by Shutdown().
  bool is_created_ = false;
};

/**
* Checks, whether CUDA11-proper NVML functions have been successfully loaded
*/
Expand Down
1 change: 1 addition & 0 deletions qa/leak.sup
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ leak:dali::kernels::DynamicScratchpad::~DynamicScratchpad
leak:dali::kernels::DynamicScratchpad::AllocImpl
leak:dali::mm::GPUHog::init()
leak:dali::mm::detail::fixed_size_allocator
leak:nvmlInitWithFlags
# no idea how to suppress them other than as below but they are not caused by DALI
# still there is some danger that any of the below functions appear in a valid leak
leak:std::string::_Rep::_S_create
Expand Down
Loading