From e93c18a3910dc7761ff2b445cead04b434d2940e Mon Sep 17 00:00:00 2001
From: Kaipeng Deng
Date: Thu, 16 Sep 2021 17:52:30 +0800
Subject: [PATCH] fix dataloader exit terminate error (#34501)

* fix DataLoader exit with SIGABRT/SIGSEGV. test=develop
---
 .../fluid/dataloader/dataloader_iter.py       | 155 ++++++++++++------
 python/paddle/fluid/dataloader/fetcher.py     |  33 +++-
 .../unittests/test_dataloader_dataset.py      |  15 +-
 3 files changed, 141 insertions(+), 62 deletions(-)

diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py
index cc98d378f1489..70c7b01b05ba3 100644
--- a/python/paddle/fluid/dataloader/dataloader_iter.py
+++ b/python/paddle/fluid/dataloader/dataloader_iter.py
@@ -43,6 +43,36 @@
 __all__ = ['get_worker_info']
 
+# NOTE: fix `terminate called without an active exception`.
+# If the for loop breaks and the program exits immediately (with no
+# model layers processing) after iterating only the first few batches
+# in distributed launch mode, distributed launch will call terminate()
+# to kill the main process on each device, but the reader thread is
+# still iterating to fill the blocking queue caches. This may raise
+# the thread error `terminate called without an active exception`,
+# because terminate is a strong signal and `__del__` of DataLoader
+# may not be called, so we keep a global link to the last DataLoader
+# instance and call its `__del__` here to clean up resources.
+# NOTE: we cannot simply register `__del__` with CleanupFuncRegistrar,
+# because that would keep a global link to every DataLoader instance,
+# which prevents GC from collecting DataLoader instances automatically
+# and causes a memory leak.
+_loader = None
+
+
+def _clear_loader():
+    global _loader
+    if _loader is not None:
+        try:
+            _loader.__del__()
+            del _loader
+        except:
+            pass
+
+
+CleanupFuncRegistrar.register(_clear_loader)
+
 
 class _DataLoaderIterBase(object):
     """
@@ -100,6 +130,16 @@ def __iter__(self):
     def __len__(self):
         return len(self._batch_sampler)
 
+    def _exit_thread_expectedly(self):
+        self._thread_done_event.set()
+        if self._blocking_queue:
+            self._blocking_queue.close()
+
+    def _exit_thread_unexpectedly(self):
+        self._thread_done_event.set()
+        if self._blocking_queue:
+            self._blocking_queue.kill()
+
 
 class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
     """
@@ -125,9 +165,13 @@ def __init__(self, loader):
         # NOTE: len(self._places) batch data compose as an output
         # iteration, set blocking_queue can cache 2 iteration datas
         # at most here
-        self._blocking_queue_capacity = 2 * len(self._places)
+        self._blocking_queue_capacity = 1 * len(self._places)
 
         self._init_thread()
+        self._shutdown = False
+
+        global _loader
+        _loader = self
 
     def _init_thread(self):
         self._var_names = [v.name for v in self._feed_list]
@@ -151,22 +195,35 @@ def _init_thread(self):
         self._thread.start()
 
     def _thread_loop(self, legacy_expected_place):
-        try:
-            #NOTE(zhiqiu): Set the expected place for new thread as the same as father thread,
-            # and it will call platform::SetDeviceId() in c++ internally.
-            # If we do not set cudaDeviceId in new thread, the default cudaDeviceId will be 0,
-            # Which may cost hundreds of MB of GPU memory on CUDAPlace(0) if calling some cuda
-            # APIs in this thread.
-            _set_expected_place(legacy_expected_place)
-
-            for indices in self._sampler_iter:
+        #NOTE(zhiqiu): Set the expected place for new thread as the same as father thread,
+        # and it will call platform::SetDeviceId() in c++ internally.
+        # If we do not set cudaDeviceId in new thread, the default cudaDeviceId will be 0,
+        # Which may cost hundreds of MB of GPU memory on CUDAPlace(0) if calling some cuda
+        # APIs in this thread.
+        _set_expected_place(legacy_expected_place)
+
+        while not self._thread_done_event.is_set():
+            try:
+                indices = next(self._sampler_iter)
+
                 # read data from dataset in mini-batch
-                batch = self._dataset_fetcher.fetch(indices)
+                # with paddle.fluid.dygraph.guard(place=paddle.CPUPlace()):
+                # read data from dataset in mini-batch
+                batch = self._dataset_fetcher.fetch(indices,
+                                                    self._thread_done_event)
+            except StopIteration:
+                self._exit_thread_expectedly()
+                return
+
+            if batch is None or self._thread_done_event.is_set(): break
+
+            # flat batch and record structure infos
+            batch, structure = _flatten_batch(batch)
+            self._structure_infos.append(structure)
 
-            # flat batch and record structure infos
-            batch, structure = _flatten_batch(batch)
-            self._structure_infos.append(structure)
+            if self._thread_done_event.is_set(): break
 
+            try:
                 # pack as LoDTensorArray
                 array = core.LoDTensorArray()
                 for slot in batch:
@@ -179,21 +236,18 @@ def _thread_loop(self, legacy_expected_place):
 
                     array.append(slot)
 
-                if not self._blocking_queue.push(array):
-                    break
+                if self._thread_done_event.is_set(): break
 
-                if self._thread_done_event.is_set():
-                    break
+                try:
+                    self._blocking_queue.push(array)
+                except:
+                    self._exit_thread_expectedly()
 
-            self._blocking_queue.close()
-            self._shutdown_thread()
-        except StopIteration:
-            self._blocking_queue.close()
-        except Exception:
-            self._blocking_queue.kill()
-            self._shutdown_thread()
-            logging.warning("DataLoader reader thread raised an exception.")
-            six.reraise(*sys.exc_info())
+            except:
+                self._exit_thread_unexpectedly()
+                six.reraise(*sys.exc_info())
+
+        self._exit_thread_expectedly()
 
     def __next__(self):
         try:
@@ -221,28 +275,46 @@ def __next__(self):
             return data
         except StopIteration:
             self._reader.shutdown()
+            self._try_shutdown_all()
             six.reraise(*sys.exc_info())
 
     def _shutdown_thread(self):
         if self._thread:
             self._thread_done_event.set()
-            if self._thread is not threading.current_thread():
-                self._thread.join()
+            # NOTE: we wait for _thread to exit for 3 seconds; if the
+            # thread does not exit normally, force kill it
+            for _ in range(3):
+                if self._thread.is_alive():
+                    time.sleep(1)
+                else:
+                    break
+            else:
+                if self._thread is not threading.current_thread():
+                    self._thread.join()
+
+        self._thread = None
 
     # python2 compatibility
     def next(self):
        return self.__next__()
 
+    def _try_shutdown_all(self):
+        if not self._shutdown:
+            try:
+                # # _blocking_queue in keep order mode holds sub-threads
+                # # need to release thread resources on unexpected exit
+                if self._blocking_queue:
+                    self._blocking_queue.close()
+                    self._blocking_queue = None
+                # NOTE: blocking queue should be closed firstly for
+                # blocking queue read may hang and _thread_done_event
+                # cannot be checked
+                self._shutdown_thread()
+            finally:
+                self._shutdown = True
+
     def __del__(self):
-        # _blocking_queue in keep order mode holds sub-threads
-        # need to release thread resources on unexpected exit
-        if self._blocking_queue:
-            self._blocking_queue.close()
-        # NOTE: blocking queue should be closed firstly for
-        # blocking queue read may hang and _thread_done_event
-        # cannot be checked
-        self._shutdown_thread()
+        self._try_shutdown_all()
 
 
 class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
@@ -421,15 +493,6 @@ def _try_shutdown_all(self, timeout=None):
             core._erase_process_pids(id(self))
             self._shutdown = True
 
-    def _exit_thread_expectedly(self):
-        self._thread_done_event.set()
-        self._blocking_queue.close()
-
-    def _exit_thread_unexpectedly(self):
-        self._thread_done_event.set()
-        self._blocking_queue.kill()
-        logging.error("DataLoader reader thread raised an exception!")
-
     def _thread_loop(self, legacy_expected_place):
         #NOTE(zhiqiu): Set the expected place for new thread as the same as father thread,
         # and it will call platform::SetDeviceId() in c++ internally.
diff --git a/python/paddle/fluid/dataloader/fetcher.py b/python/paddle/fluid/dataloader/fetcher.py
index 8ccec81810a0a..ec3240a326b8e 100644
--- a/python/paddle/fluid/dataloader/fetcher.py
+++ b/python/paddle/fluid/dataloader/fetcher.py
@@ -26,7 +26,16 @@ def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last):
         self.collate_fn = collate_fn
         self.drop_last = drop_last
 
-    def fetch(self, batch_indices):
+    # NOTE: the fetch function here performs the whole pipeline of dataset
+    # reading and data transforms of a batch in each call, which may take
+    # a long time inside; if the DataLoader exits outside, fetch needs to
+    # perceive the exit situation, so we pass done_event here for fetch
+    # to check the exit status
+    # NOTE: if the DataLoader exits by `break`, performing GPU tensor
+    # operations, e.g. to_tensor, may cause SIGSEGV in the thread, so we
+    # pass the done_event argument to check the DataLoader exit status
+    # between each sample's processing in the batch
+    def fetch(self, batch_indices, done_event=None):
         raise NotImplementedError("'fetch' not implement for class {}".format(
             self.__class__.__name__))
 
@@ -69,15 +78,18 @@ def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last):
             dataset, auto_collate_batch, collate_fn, drop_last)
         self.dataset_iter = iter(dataset)
 
-    def fetch(self, batch_indices):
+    def fetch(self, batch_indices, done_event=None):
         if self.auto_collate_batch:
             data = []
             for _ in batch_indices:
-                try:
-                    data.append(next(self.dataset_iter))
-                except StopIteration:
-                    break
+                if done_event is None or not done_event.is_set():
+                    try:
+                        data.append(next(self.dataset_iter))
+                    except StopIteration:
+                        break
+                else:
+                    return None
 
             if len(data) == 0 or (self.drop_last and
                                   len(data) < len(batch_indices)):
@@ -101,9 +113,14 @@ def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last):
         super(_MapDatasetFetcher, self).__init__(dataset, auto_collate_batch,
                                                  collate_fn, drop_last)
 
-    def fetch(self, batch_indices):
+    def fetch(self, batch_indices, done_event=None):
         if self.auto_collate_batch:
-            data = [self.dataset[idx] for idx in batch_indices]
+            data = []
+            for idx in batch_indices:
+                if done_event is None or not done_event.is_set():
+                    data.append(self.dataset[idx])
+                else:
+                    return None
 
             global _WARNING_TO_LOG
             if not isinstance(data[0], (Sequence, Mapping)) \
diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py
index d2f4eadc9c564..c54a1406e39bf 100644
--- a/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py
@@ -43,14 +43,18 @@ def test_main(self):
 
 class TestDatasetWithDiffOutputPlace(unittest.TestCase):
     def get_dataloader(self, num_workers):
         dataset = paddle.vision.datasets.MNIST(
-            mode='test', transform=transforms.ToTensor())
+            mode='test',
+            transform=transforms.Compose([
+                transforms.CenterCrop(20), transforms.RandomResizedCrop(14),
+                transforms.Normalize(), transforms.ToTensor()
+            ]))
         loader = paddle.io.DataLoader(
             dataset, batch_size=32, num_workers=num_workers, shuffle=True)
         return loader
 
     def run_check_on_cpu(self):
         paddle.set_device('cpu')
-        loader = self.get_dataloader(0)
+        loader = self.get_dataloader(1)
         for image, label in loader:
             self.assertTrue(image.place.is_cpu_place())
             self.assertTrue(label.place.is_cpu_place())
@@ -66,12 +70,7 @@ def test_single_process(self):
             for image, label in loader:
                 self.assertTrue(image.place.is_gpu_place())
                 self.assertTrue(label.place.is_cuda_pinned_place())
-                # FIXME(dkp): when input tensor is in GPU place and
-                # iteration break in the median, it seems the GPU
-                # tensor put into blocking_queue cannot be safely
-                # released and may cause ABRT/SEGV, this should
-                # be fixed
-                # break
+                break
 
     def test_multi_process(self):
         # DataLoader with multi-process mode is not supported on MacOs and Windows currently
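
The sketch below is not part of the patch and does not use Paddle; it is a minimal, standard-library illustration of the shutdown pattern the diff implements, under the assumption that a toy loader with a single reader thread is enough to show the idea (names such as ToyLoader, fetch, _push and _clear_loader are invented for the example, and atexit stands in for CleanupFuncRegistrar). It mirrors the three pieces of the fix: the reader thread re-checks a shared done event between steps, the fetch helper receives the same event so it can abort in the middle of a batch, and a cleanup hook registered for interpreter exit releases the last live loader.

# Illustrative only -- standard library, no Paddle; all names are hypothetical.
import atexit
import queue
import threading

_loader = None  # link to the last created loader so the exit hook can clean it up


def _clear_loader():
    # analogue of _clear_loader() in the patch, registered for interpreter exit
    global _loader
    if _loader is not None:
        _loader.shutdown()
        _loader = None


atexit.register(_clear_loader)


def fetch(indices, dataset, done_event):
    # analogue of fetch(batch_indices, done_event): abort between samples
    # once the loader starts shutting down
    batch = []
    for idx in indices:
        if done_event.is_set():
            return None
        batch.append(dataset[idx])
    return batch


class ToyLoader:
    def __init__(self, dataset, batch_indices_list):
        global _loader
        self._dataset = dataset
        self._batches = iter(batch_indices_list)
        self._done = threading.Event()
        self._queue = queue.Queue(maxsize=2)
        self._thread = threading.Thread(target=self._thread_loop, daemon=True)
        self._thread.start()
        _loader = self  # remember the last loader globally, as the patch does

    def _thread_loop(self):
        # reader thread: check the done event between every step
        while not self._done.is_set():
            try:
                indices = next(self._batches)
            except StopIteration:
                break
            batch = fetch(indices, self._dataset, self._done)
            if batch is None or self._done.is_set():
                break
            if not self._push(batch):
                break
        self._done.set()

    def _push(self, item):
        # never block forever on a full queue: give up once shutdown is requested
        while not self._done.is_set():
            try:
                self._queue.put(item, timeout=0.1)
                return True
            except queue.Full:
                continue
        return False

    def __iter__(self):
        while not self._done.is_set() or not self._queue.empty():
            try:
                yield self._queue.get(timeout=0.1)
            except queue.Empty:
                continue

    def shutdown(self):
        # set the event first so a blocked fetch/push can observe it, then
        # give the thread a bounded time to exit (cf. _shutdown_thread above)
        self._done.set()
        if self._thread is not threading.current_thread():
            self._thread.join(timeout=3)


if __name__ == "__main__":
    loader = ToyLoader(list(range(100)), [[0, 1, 2], [3, 4, 5], [6, 7, 8]])
    for batch in loader:
        print(batch)
        break  # early exit no longer leaves the reader thread hanging
    loader.shutdown()

Setting the event before joining, and letting both the fetch helper and the queue push give up as soon as it is set, is what allows an early `break` (or a distributed-launch terminate) to exit cleanly instead of aborting inside the reader thread.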