[Dataset] add COCOsuperpixel dataset (#6407)
paoxiaode committed Oct 12, 2023
1 parent fc06d7f commit e594b4a
Showing 3 changed files with 301 additions and 0 deletions.
1 change: 1 addition & 0 deletions python/dgl/data/__init__.py
@@ -76,6 +76,7 @@
# datasets.
try:
from .lrgb import (
COCOSuperpixelsDataset,
PeptidesFunctionalDataset,
PeptidesStructuralDataset,
VOCSuperpixelsDataset,
283 changes: 283 additions & 0 deletions python/dgl/data/lrgb.py
@@ -797,3 +797,286 @@ def __getitem__(self, idx):
return self.graphs[idx]

return self._transform(self.graphs[idx])


class COCOSuperpixelsDataset(DGLDataset):
r"""COCO superpixel dataset for the node classification task.
DGL dataset of COCO-SP in the LRGB benchmark, which contains image
superpixels and a semantic segmentation label for each superpixel node.
Based on the COCO 2017 dataset. Original source `<https://cocodataset.org>`_
Reference `<https://arxiv.org/abs/2206.08164.pdf>`_
Statistics:
- Train examples: 113,286
- Valid examples: 5,000
- Test examples: 5,000
- Average number of nodes: 476.88
- Average number of edges: 2,710.48
- Number of node classes: 81
Parameters
----------
raw_dir : str
Directory to store all the downloaded raw datasets.
Default: "~/.dgl/".
split : str
Should be chosen from ["train", "val", "test"]
Default: "train".
construct_format : str, optional
Option to select the graph construction format.
Should be chosen from the following formats:
- "edge_wt_only_coord": the graphs are 8-nn graphs with the edge weights
computed based on only spatial coordinates of superpixel nodes.
- "edge_wt_coord_feat": the graphs are 8-nn graphs with the edge weights
computed based on combination of spatial coordinates and feature
values of superpixel nodes.
- "edge_wt_region_boundary": the graphs region boundary graphs where two
regions (i.e. superpixel nodes) have an edge between them if they
share a boundary in the original image.
Default: "edge_wt_region_boundary".
slic_compactness : int, optional
Option to select the compactness of SLIC that was used for superpixels.
Should be chosen from [10, 30].
Default: 30.
force_reload : bool
Whether to reload the dataset.
Default: False.
verbose : bool
Whether to print out progress information.
Default: False.
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
Examples
--------
>>> from dgl.data import COCOSuperpixelsDataset
>>> train_dataset = COCOSuperpixelsDataset(split="train")
>>> len(train_dataset)
113286
>>> train_dataset.num_classes
81
>>> graph = train_dataset[0]
>>> graph
Graph(num_nodes=488, num_edges=2766,
ndata_schemes={'feat': Scheme(shape=(14,), dtype=torch.float32),
'label': Scheme(shape=(), dtype=torch.uint8)}
edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float32)})
>>> # a tensor of indices is supported when transform is None
>>> # see details in the __getitem__ function
>>> import torch
>>> idx = torch.tensor([0, 1, 2])
>>> train_dataset_subset = train_dataset[idx]
>>> train_dataset_subset[0]
Graph(num_nodes=488, num_edges=2766,
ndata_schemes={'feat': Scheme(shape=(14,), dtype=torch.float32),
'label': Scheme(shape=(), dtype=torch.uint8)}
edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float32)})
"""

urls = {
10: {
"edge_wt_only_coord": """
https://www.dropbox.com/s/prqizdep8gk0ndk/coco_superpixels_edge_wt_only_coord.zip?dl=1
""",
"edge_wt_coord_feat": """
https://www.dropbox.com/s/zftoyln1pkcshcg/coco_superpixels_edge_wt_coord_feat.zip?dl=1
""",
"edge_wt_region_boundary": """
https://www.dropbox.com/s/fhihfcyx2y978u8/coco_superpixels_edge_wt_region_boundary.zip?dl=1
""",
},
30: {
"edge_wt_only_coord": """
https://www.dropbox.com/s/hrbfkxmc5z9lsaz/coco_superpixels_edge_wt_only_coord.zip?dl=1
""",
"edge_wt_coord_feat": """
https://www.dropbox.com/s/4rfa2d5ij1gfu9b/coco_superpixels_edge_wt_coord_feat.zip?dl=1
""",
"edge_wt_region_boundary": """
https://www.dropbox.com/s/r6ihg1f4pmyjjy0/coco_superpixels_edge_wt_region_boundary.zip?dl=1
""",
},
}

def __init__(
self,
raw_dir=None,
split="train",
construct_format="edge_wt_region_boundary",
slic_compactness=30,
force_reload=None,
verbose=None,
transform=None,
):
assert split in ["train", "val", "test"], "split not valid."
assert construct_format in [
"edge_wt_only_coord",
"edge_wt_coord_feat",
"edge_wt_region_boundary",
], "construct_format not valid."
assert slic_compactness in [10, 30], "slic_compactness not valid."

self.construct_format = construct_format
self.slic_compactness = slic_compactness
self.split = split
self.graphs = []

super().__init__(
name="COCO-SP",
raw_dir=raw_dir,
url=self.urls[self.slic_compactness][self.construct_format],
force_reload=force_reload,
verbose=verbose,
transform=transform,
)

@property
def save_path(self):
r"""Directory to save the processed dataset."""
return os.path.join(
self.raw_path,
"slic_compactness_" + str(self.slic_compactness),
self.construct_format,
)

@property
def raw_data_path(self):
r"""Path to save the raw dataset file."""
return os.path.join(self.save_path, f"{self.split}.pickle")

@property
def graph_path(self):
r"""Path to save the processed dataset file."""
return os.path.join(self.save_path, f"processed_{self.split}.pkl")

@property
def num_classes(self):
r"""Number of classes for each node."""
return 81

def __len__(self):
r"""The number of examples in the dataset."""
return len(self.graphs)

def download(self):
zip_file_path = os.path.join(
self.raw_path, "coco_superpixels_" + self.construct_format + ".zip"
)
path = download(self.url, path=zip_file_path, overwrite=True)
extract_archive(path, self.raw_path, overwrite=True)
makedirs(self.save_path)
os.rename(
os.path.join(
self.raw_path, "coco_superpixels_" + self.construct_format
),
self.save_path,
)
os.unlink(path)

def label_remap(self):
# Util function to remap the labels as the original label
# idxs are not contiguous
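# (COCO's 80 thing categories use non-contiguous ids in [1, 90]; together
# with id 0 for unlabeled pixels they are remapped to 81 contiguous classes)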
# fmt: off
original_label_idx = [
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78,
79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90
]
# fmt: on
label_map = {}
for i, key in enumerate(original_label_idx):
label_map[key] = i

return label_map

def process(self):
with open(self.raw_data_path, "rb") as file:
graphs = pickle.load(file)

label_map = self.label_remap()

for idx in tqdm(
range(len(graphs)), desc=f"Processing {self.split} dataset"
):
graph = graphs[idx]

"""
Each `graph` is a tuple (x, edge_attr, edge_index, y)
Shape of x : [num_nodes, 14]
Shape of edge_attr : [num_edges, 1] or [num_edges, 2]
Shape of edge_index : [2, num_edges]
Shape of y : [num_nodes]
"""

DGLgraph = dgl_graph(
(graph[2][0], graph[2][1]),
num_nodes=len(graph[3]),
)
DGLgraph.ndata["feat"] = graph[0].to(F.float32)
DGLgraph.edata["feat"] = graph[1].to(F.float32)

y = F.tensor(graph[3])

# Label remapping. See self.label_remap() func
for i, label in enumerate(y):
y[i] = label_map[label.item()]

DGLgraph.ndata["label"] = y
self.graphs.append(DGLgraph)

def load(self):
with open(self.graph_path, "rb") as file:
graphs = pickle.load(file)
self.graphs = graphs

def save(self):
with open(os.path.join(self.graph_path), "wb") as file:
pickle.dump(self.graphs, file)

def has_cache(self):
return os.path.exists(self.graph_path)

def __getitem__(self, idx):
r"""Get the idx-th sample.
Parameters
----------
idx : int or tensor
The sample index.
1-D tensor as `idx` is allowed when transform is None.
Returns
-------
:class:`dgl.DGLGraph`
graph structure, node features, node labels and edge features.
- ``ndata['feat']``: node features
- ``ndata['label']``: node labels
- ``edata['feat']``: edge features
or
:class:`dgl.data.utils.Subset`
Subset of the dataset at specified indices
"""
if F.is_tensor(idx) and idx.dim() == 1:
if self._transform is None:
return Subset(self, idx.cpu())
raise ValueError(
"Tensor idx not supported when transform is not None."
)

if self._transform is None:
return self.graphs[idx]

return self._transform(self.graphs[idx])
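
For context, a minimal usage sketch of the new class (not part of the commit): it assumes the Dropbox archives download and extract successfully, and the use of dgl.dataloading.GraphDataLoader with a batch size of 32 is an illustrative choice rather than anything prescribed by this change.

from dgl.data import COCOSuperpixelsDataset
from dgl.dataloading import GraphDataLoader

# Region-boundary graphs built from SLIC superpixels with compactness 10.
val_dataset = COCOSuperpixelsDataset(
    split="val",
    construct_format="edge_wt_region_boundary",
    slic_compactness=10,
)

# Batch graphs for node classification; each batched graph carries the
# per-node features and labels in ndata.
loader = GraphDataLoader(val_dataset, batch_size=32, shuffle=False)
for batched_graph in loader:
    feats = batched_graph.ndata["feat"]    # shape [total_nodes, 14]
    labels = batched_graph.ndata["label"]  # shape [total_nodes]
    break
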
17 changes: 17 additions & 0 deletions tests/integration/test_data.py
@@ -107,6 +107,23 @@ def test_VOC_superpixels():
assert g2.num_edges() - g1.num_edges() == g1.num_nodes()


@unittest.skipIf(
F._default_context_str == "gpu",
reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(
dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
)
def test_COCO_superpixels():
transform = dgl.AddSelfLoop(allow_duplicate=True)
dataset1 = data.COCOSuperpixelsDataset()
g1 = dataset1[0]
dataset2 = data.COCOSuperpixelsDataset(transform=transform)
g2 = dataset2[0]
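# AddSelfLoop(allow_duplicate=True) appends exactly one self-loop per node,
# so g2 should have exactly g1.num_nodes() more edges than g1.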

assert g2.num_edges() - g1.num_edges() == g1.num_nodes()


@unittest.skipIf(
F._default_context_str == "gpu",
reason="Datasets don't need to be tested on GPU.",
