Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Dataset] Add CIFAR10 MNIST dataset in benchmark-gnn #6543

Merged
merged 4 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/dgl/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
except ImportError:
pass
from .pattern import PATTERNDataset
from .superpixel import SuperPixelDataset
paoxiaode marked this conversation as resolved.
Show resolved Hide resolved
from .wikics import WikiCSDataset
from .yelp import YelpDataset
from .zinc import ZINCDataset
Expand Down
362 changes: 362 additions & 0 deletions python/dgl/data/superpixel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,362 @@
import os
import pickle

import numpy as np
from scipy.spatial.distance import cdist
from tqdm import tqdm

from .. import backend as F
from ..convert import graph as dgl_graph

from .dgl_dataset import DGLDataset
from .utils import download, extract_archive, load_graphs, save_graphs, Subset


def sigma(dists, kth=8):
    """Per-node scale factor: mean distance to the ``kth`` nearest neighbors.

    Parameters
    ----------
    dists : numpy.ndarray
        Square pairwise-distance matrix of shape ``(num_nodes, num_nodes)``.
    kth : int
        Number of nearest neighbors to average over. Default: 8.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(num_nodes, 1)`` holding each node's scale,
        with a small epsilon added so it is never exactly zero.
    """
    num_nodes = dists.shape[0]
    try:
        # Partition each row so its (kth + 1) smallest distances occupy the
        # leading positions, then slice them out. The self-distance (0) is
        # among them, so summing and dividing by ``kth`` yields the mean
        # distance to the k nearest *other* nodes.
        nearest = np.partition(dists, kth, axis=-1)[:, kth::-1]
        scale = nearest.sum(axis=1).reshape((num_nodes, 1)) / kth
    except ValueError:
        # Fewer than ``kth`` + 1 entries per row: fall back to a unit scale.
        scale = np.ones((num_nodes, 1))

    # Epsilon keeps downstream divisions by sigma well-defined.
    return scale + 1e-8


def compute_adjacency_matrix_images(coord, feat, use_feat=True):
    """Build a dense Gaussian-kernel adjacency matrix for one superpixel image.

    Parameters
    ----------
    coord : numpy.ndarray
        Superpixel coordinates; reshaped to ``(num_nodes, 2)``.
    feat : numpy.ndarray
        Superpixel features (one row per node).
    use_feat : bool
        If True, the kernel combines coordinate and feature distances;
        otherwise only coordinate distances are used. Default: True.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(num_nodes, num_nodes)`` adjacency with zero diagonal.
    """
    xy = coord.reshape(-1, 2)
    spatial = cdist(xy, xy)

    # Squared distances, each normalized by its own per-node sigma.
    exponent = (spatial / sigma(spatial)) ** 2
    if use_feat:
        visual = cdist(feat, feat)
        exponent = exponent + (visual / sigma(visual)) ** 2
    A = np.exp(-exponent)

    # Force exact symmetry and remove self-similarity.
    A = 0.5 * (A + A.T)
    A[np.diag_indices_from(A)] = 0
    return A


def compute_edges_list(A, kth=8 + 1):
    """Select, for every node, its most similar neighbors from ``A``.

    Parameters
    ----------
    A : numpy.ndarray
        Symmetric ``(num_nodes, num_nodes)`` similarity matrix with zero
        diagonal (see :func:`compute_adjacency_matrix_images`).
    kth : int
        Size of the top partition taken per row. Default: 9.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Per-row neighbor indices and the matching similarity values.
    """
    num_nodes = A.shape[0]
    cutoff = num_nodes - kth

    if num_nodes > 9:
        # Partition each row so its ``kth`` largest similarities sit at the
        # tail; slicing ``cutoff:-1`` keeps kth - 1 = 8 of them per node.
        # The zero diagonal is never in the top partition, so no self loops.
        neighbor_idx = np.argpartition(A, cutoff - 1, axis=-1)[:, cutoff:-1]
        neighbor_sim = np.partition(A, cutoff - 1, axis=-1)[:, cutoff:-1]
    else:
        # Too few nodes for a real k-NN selection; the resulting graph is
        # fully connected instead.
        neighbor_idx = np.tile(np.arange(num_nodes), num_nodes).reshape(
            num_nodes, num_nodes
        )
        neighbor_sim = A
        if num_nodes != 1:
            # Strip the self loop from every row (a 1-node graph keeps it,
            # since it would otherwise have no edges at all).
            not_self = neighbor_idx != np.arange(num_nodes)[:, None]
            neighbor_sim = A[not_self].reshape(num_nodes, -1)
            neighbor_idx = neighbor_idx[not_self].reshape(num_nodes, -1)
    return neighbor_idx, neighbor_sim


class SuperPixelDataset(DGLDataset):
    r"""MNIST and CIFAR10 superpixel dataset for the graph classification task.

    DGL dataset of MNIST and CIFAR10 in the benchmark-gnn which contains graphs
    converted from the original MNIST and CIFAR10 images.

    Reference `<http://arxiv.org/abs/2003.00982>`_

    Statistics:

    MNIST:

    - Train examples: 60,000
    - Test examples: 10,000
    - Size of dataset images: 28

    CIFAR10:

    - Train examples: 45,000
    - Test examples: 10,000
    - Size of dataset images: 32

    Parameters
    ----------
    raw_dir : str
        Directory to store all the downloaded raw datasets.
        Default: "~/.dgl/".
    name : str
        Should be chosen from ["MNIST", "CIFAR10"].
        Default: "MNIST".
    split : str
        Should be chosen from ["train", "test"].
        Default: "train".
    use_mean_px : bool
        Whether superpixel features are used in addition to locations when
        building the adjacency matrix.

        - True: Adj matrix defined from super-pixel locations + features
        - False: Adj matrix defined from super-pixel locations (only)

        Default: False.
    force_reload : bool
        Whether to reload the dataset.
        Default: False.
    verbose : bool
        Whether to print out progress information.
        Default: False.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Examples
    --------
    >>> from dgl.data import SuperPixelDataset

    >>> # MNIST dataset
    >>> train_dataset = SuperPixelDataset(split="train")
    >>> len(train_dataset)
    60000
    >>> train_dataset.img_size
    28
    >>> graph, label = train_dataset[0]
    >>> graph
    Graph(num_nodes=71, num_edges=568,
          ndata_schemes={'feat': Scheme(shape=(3,), dtype=torch.float16)}
          edata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float16)})

    >>> # CIFAR10 dataset
    >>> train_dataset = SuperPixelDataset(name="CIFAR10", split="train")
    >>> len(train_dataset)
    45000
    >>> train_dataset.img_size
    32
    >>> graph, label = train_dataset[0]

    >>> # support tensor to be index when transform is None
    >>> # see details in __getitem__ function
    >>> import torch
    >>> idx = torch.tensor([0, 1, 2])
    >>> train_dataset_subset = train_dataset[idx]
    >>> train_dataset_subset[0]
    Graph(num_nodes=71, num_edges=568,
          ndata_schemes={'feat': Scheme(shape=(3,), dtype=torch.float16)}
          edata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float16)})
    """

    def __init__(
        self,
        raw_dir=None,
        name="MNIST",
        split="train",
        use_mean_px=False,
        force_reload=False,
        verbose=False,
        transform=None,
    ):
        assert split in ["train", "test"], "split not valid."
        assert name in ["MNIST", "CIFAR10"], "name not valid."

        self.use_mean_px = use_mean_px
        self.split = split
        # User-facing dataset name ("MNIST"/"CIFAR10"); kept separate from the
        # DGLDataset ``name`` below, which selects the on-disk directory.
        self._dataset_name = name
        self.graphs = []
        self.labels = []

        super().__init__(
            name="Superpixel",
            raw_dir=raw_dir,
            # NOTE(review): triple-quoted URL carries leading/trailing
            # whitespace and newlines — presumably stripped inside
            # ``download()``; confirm it tolerates this form.
            url="""
            https://www.dropbox.com/s/y2qwa77a0fxem47/superpixels.zip?dl=1
            """,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def img_size(self):
        r"""Size of dataset image (side length in pixels: 28 or 32)."""
        if self._dataset_name == "MNIST":
            return 28
        return 32

    @property
    def save_path(self):
        r"""Directory to save the processed dataset."""
        return os.path.join(self.raw_path, "processed")

    @property
    def raw_data_path(self):
        r"""Path to save the raw dataset file."""
        return os.path.join(self.raw_path, "superpixels.zip")

    @property
    def graph_path(self):
        r"""Path to save the processed dataset file.

        The filename encodes both the dataset/split and whether superpixel
        features were used, so the two variants are cached separately.
        """
        if self.use_mean_px:
            return os.path.join(
                self.save_path,
                f"use_feat_{self._dataset_name}_{self.split}.pkl",
            )
        return os.path.join(
            self.save_path, f"{self._dataset_name}_{self.split}.pkl"
        )

    def download(self):
        # Fetch the superpixels archive and unpack it under ``raw_path``.
        path = download(self.url, path=self.raw_data_path)
        extract_archive(path, target_dir=self.raw_path, overwrite=True)

    def process(self):
        # Map the public dataset name to the pickle file prefix shipped in
        # the downloaded archive (75 superpixels for MNIST, 150 for CIFAR10).
        if self._dataset_name == "MNIST":
            plk_file = "mnist_75sp"
        elif self._dataset_name == "CIFAR10":
            plk_file = "cifar10_150sp"

        with open(
            os.path.join(
                self.raw_path, "superpixels", f"{plk_file}_{self.split}.pkl"
            ),
            "rb",
        ) as f:
            # The pickle holds (labels, per-image superpixel data).
            self.labels, self.sp_data = pickle.load(f)
            self.labels = F.tensor(self.labels)

        (
            self.Adj_matrices,
            self.node_features,
            self.edges_lists,
            self.edge_features,
        ) = (
            [],
            [],
            [],
            [],
        )
        # First pass: turn each image's superpixels into an adjacency matrix
        # plus k-NN edge lists and node/edge feature arrays.
        for index, sample in enumerate(
            tqdm(self.sp_data, desc=f"Processing {self.split} dataset")
        ):
            # Each sample starts with (mean_px, coord); mean_px is the mean
            # pixel intensity per superpixel, coord presumably its centroid
            # — confirm against the archive's documentation.
            mean_px, coord = sample[:2]
            # Normalize coordinates to [0, 1] by the image side length.
            coord = coord / self.img_size

            if self.use_mean_px:
                A = compute_adjacency_matrix_images(
                    coord, mean_px
                )  # using super-pixel locations + features
            else:
                A = compute_adjacency_matrix_images(
                    coord, mean_px, False
                )  # using only super-pixel locations
            edges_list, edge_values_list = compute_edges_list(A)

            N_nodes = A.shape[0]

            # Node feature = per-superpixel intensity concatenated with its
            # normalized 2-D coordinate.
            mean_px = mean_px.reshape(N_nodes, -1)
            coord = coord.reshape(N_nodes, 2)
            x = np.concatenate((mean_px, coord), axis=1)

            edge_values_list = edge_values_list.reshape(-1)  # TO DOUBLE-CHECK !

            self.node_features.append(x)
            self.edge_features.append(edge_values_list)
            self.Adj_matrices.append(A)
            self.edges_lists.append(edges_list)

        # Second pass: materialize a DGLGraph per image with half-precision
        # node/edge features.
        for index in tqdm(
            range(len(self.sp_data)), desc=f"Dump {self.split} dataset"
        ):
            N = self.node_features[index].shape[0]

            src_nodes = []
            dst_nodes = []
            for src, dsts in enumerate(self.edges_lists[index]):
                # handling for 1 node where the self loop would be the only edge
                if N == 1:
                    src_nodes.append(src)
                    dst_nodes.append(dsts)
                else:
                    # Drop any residual self loop before fanning out edges.
                    dsts = dsts[dsts != src]
                    srcs = [src] * len(dsts)
                    src_nodes.extend(srcs)
                    dst_nodes.extend(dsts)

            src_nodes = F.tensor(src_nodes)
            dst_nodes = F.tensor(dst_nodes)

            g = dgl_graph((src_nodes, dst_nodes), num_nodes=N)
            # Features are stored as float16 to keep the cached dataset small.
            g.ndata["feat"] = (
                F.zerocopy_from_numpy(self.node_features[index])
                .to(F.float32)
                .half()
            )
            g.edata["feat"] = (
                F.zerocopy_from_numpy(self.edge_features[index])
                .to(F.float32)
                .unsqueeze(1)
                .half()
            )

            self.graphs.append(g)

    def load(self):
        # Restore graphs and labels from the processed cache file.
        self.graphs, label_dict = load_graphs(self.graph_path)
        self.labels = label_dict["labels"]

    def save(self):
        # Persist graphs together with their labels to the cache file.
        save_graphs(
            self.graph_path, self.graphs, labels={"labels": self.labels}
        )

    def has_cache(self):
        # A processed cache exists iff the graph file is on disk.
        return os.path.exists(self.graph_path)

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        """Get the idx-th sample.

        Parameters
        ----------
        idx : int or tensor
            The sample index.
            1-D tensor as `idx` is allowed when transform is None.

        Returns
        -------
        (:class:`dgl.DGLGraph`, Tensor)
            Graph with node feature stored in ``feat`` field and its label.
        or
        :class:`dgl.data.utils.Subset`
            Subset of the dataset at specified indices
        """
        if F.is_tensor(idx) and idx.dim() == 1:
            if self._transform is None:
                return Subset(self, idx.cpu())

            raise ValueError(
                "Tensor idx not supported when transform is not None."
            )

        if self._transform is None:
            return self.graphs[idx], self.labels[idx]

        return self._transform(self.graphs[idx]), self.labels[idx]
17 changes: 17 additions & 0 deletions tests/integration/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,23 @@ def test_COCO_superpixels():
assert g2.num_edges() - g1.num_edges() == g1.num_nodes()


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(
    dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
)
def test_super_pixel():
    # Load the same sample with and without the self-loop transform; the
    # transformed graph is expected to gain exactly one edge per node.
    add_loops = dgl.AddSelfLoop(allow_duplicate=True)
    plain_graph, _ = data.SuperPixelDataset()[0]
    looped_graph, _ = data.SuperPixelDataset(transform=add_loops)[0]

    assert looped_graph.num_edges() - plain_graph.num_edges() == plain_graph.num_nodes()


@unittest.skipIf(
F._default_context_str == "gpu",
reason="Datasets don't need to be tested on GPU.",
Expand Down