[Dataset] add COCOsuperpixel dataset #6407

Merged · 7 commits · Oct 12, 2023
1 change: 1 addition & 0 deletions python/dgl/data/__init__.py
@@ -76,6 +76,7 @@
# datasets.
try:
from .lrgb import (
COCOSuperpixelsDataset,
PeptidesFunctionalDataset,
PeptidesStructuralDataset,
VOCSuperpixelsDataset,
283 changes: 283 additions & 0 deletions python/dgl/data/lrgb.py
@@ -797,3 +797,286 @@ def __getitem__(self, idx):
return self.graphs[idx]

return self._transform(self.graphs[idx])


class COCOSuperpixelsDataset(DGLDataset):
r"""COCO superpixel dataset for the node classification task.
DGL dataset of COCO-SP in the LRGB benchmark, which contains image
superpixels and a semantic segmentation label for each superpixel node.
Based on the COCO 2017 dataset. Original source `<https://cocodataset.org>`_
Reference `<https://arxiv.org/abs/2206.08164.pdf>`_
Statistics:
- Train examples: 113,286
- Valid examples: 5,000
- Test examples: 5,000
- Average number of nodes: 476.88
- Average number of edges: 2,710.48
- Number of node classes: 81
Parameters
----------
raw_dir : str
Directory to store all the downloaded raw datasets.
Default: "~/.dgl/".
split : str
Should be chosen from ["train", "val", "test"].
Default: "train".
construct_format : str, optional
Option to select the graph construction format.
Should be chosen from the following formats:
- "edge_wt_only_coord": the graphs are 8-nn graphs with the edge weights
computed based on only spatial coordinates of superpixel nodes.
- "edge_wt_coord_feat": the graphs are 8-nn graphs with the edge weights
computed based on combination of spatial coordinates and feature
values of superpixel nodes.
- "edge_wt_region_boundary": the graphs region boundary graphs where two
regions (i.e. superpixel nodes) have an edge between them if they
share a boundary in the original image.
Default: "edge_wt_region_boundary".
slic_compactness : int, optional
Option to select the compactness of SLIC used to generate the superpixels.
Should be chosen from [10, 30].
Default: 30.
force_reload : bool
Whether to reload the dataset.
Default: False.
verbose : bool
Whether to print out progress information.
Default: False.
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
Examples
--------
>>> from dgl.data import COCOSuperpixelsDataset
>>> train_dataset = COCOSuperpixelsDataset(split="train")
>>> len(train_dataset)
113286
>>> train_dataset.num_classes
81
>>> graph = train_dataset[0]
>>> graph
Graph(num_nodes=488, num_edges=2766,
ndata_schemes={'feat': Scheme(shape=(14,), dtype=torch.float32),
'label': Scheme(shape=(), dtype=torch.uint8)}
edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float32)})
>>> # a tensor can be used as the index when transform is None,
>>> # see details in the __getitem__ function
>>> import torch
>>> idx = torch.tensor([0, 1, 2])
>>> train_dataset_subset = train_dataset[idx]
>>> train_dataset_subset[0]
Graph(num_nodes=488, num_edges=2766,
ndata_schemes={'feat': Scheme(shape=(14,), dtype=torch.float32),
'label': Scheme(shape=(), dtype=torch.uint8)}
edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float32)})
"""

urls = {
10: {
"edge_wt_only_coord": """
https://www.dropbox.com/s/prqizdep8gk0ndk/coco_superpixels_edge_wt_only_coord.zip?dl=1
""",
"edge_wt_coord_feat": """
https://www.dropbox.com/s/zftoyln1pkcshcg/coco_superpixels_edge_wt_coord_feat.zip?dl=1
""",
"edge_wt_region_boundary": """
https://www.dropbox.com/s/fhihfcyx2y978u8/coco_superpixels_edge_wt_region_boundary.zip?dl=1
""",
},
30: {
"edge_wt_only_coord": """
https://www.dropbox.com/s/hrbfkxmc5z9lsaz/coco_superpixels_edge_wt_only_coord.zip?dl=1
""",
"edge_wt_coord_feat": """
https://www.dropbox.com/s/4rfa2d5ij1gfu9b/coco_superpixels_edge_wt_coord_feat.zip?dl=1
""",
"edge_wt_region_boundary": """
https://www.dropbox.com/s/r6ihg1f4pmyjjy0/coco_superpixels_edge_wt_region_boundary.zip?dl=1
""",
},
}

def __init__(
self,
raw_dir=None,
split="train",
construct_format="edge_wt_region_boundary",
slic_compactness=30,
force_reload=None,
verbose=None,
transform=None,
):
assert split in ["train", "val", "test"], "split not valid."
assert construct_format in [
"edge_wt_only_coord",
"edge_wt_coord_feat",
"edge_wt_region_boundary",
], "construct_format not valid."
assert slic_compactness in [10, 30], "slic_compactness not valid."

self.construct_format = construct_format
self.slic_compactness = slic_compactness
self.split = split
self.graphs = []

super().__init__(
name="COCO-SP",
raw_dir=raw_dir,
url=self.urls[self.slic_compactness][self.construct_format],
force_reload=force_reload,
verbose=verbose,
transform=transform,
)

@property
def save_path(self):
r"""Directory to save the processed dataset."""
return os.path.join(
self.raw_path,
"slic_compactness_" + str(self.slic_compactness),
self.construct_format,
)

@property
def raw_data_path(self):
r"""Path to save the raw dataset file."""
return os.path.join(self.save_path, f"{self.split}.pickle")

@property
def graph_path(self):
r"""Path to save the processed dataset file."""
return os.path.join(self.save_path, f"processed_{self.split}.pkl")

@property
def num_classes(self):
r"""Number of classes for each node."""
return 81

def __len__(self):
r"""The number of examples in the dataset."""
return len(self.graphs)

def download(self):
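# Fetch the zip archive into raw_path, extract it there, move the
# extracted folder to save_path, and finally delete the zip file.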
zip_file_path = os.path.join(
self.raw_path, "coco_superpixels_" + self.construct_format + ".zip"
)
path = download(self.url, path=zip_file_path, overwrite=True)
extract_archive(path, self.raw_path, overwrite=True)
makedirs(self.save_path)
os.rename(
os.path.join(
self.raw_path, "coco_superpixels_" + self.construct_format
),
self.save_path,
)
os.unlink(path)

def label_remap(self):
# Utility function to remap the labels, since the original
# label indices are not contiguous.
# fmt: off
original_label_idx = [
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78,
79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90
]
# fmt: on
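# Map each original (non-contiguous) label id to a contiguous index,
# e.g. original label 13 (12 is absent above) becomes index 12.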
label_map = {}
for i, key in enumerate(original_label_idx):
label_map[key] = i

return label_map

def process(self):
with open(self.raw_data_path, "rb") as file:
graphs = pickle.load(file)

label_map = self.label_remap()

for idx in tqdm(
range(len(graphs)), desc=f"Processing {self.split} dataset"
):
graph = graphs[idx]

"""
Each `graph` is a tuple (x, edge_attr, edge_index, y)
Shape of x : [num_nodes, 14]
Shape of edge_attr : [num_edges, 1] or [num_edges, 2]
Shape of edge_index : [2, num_edges]
Shape of y : [num_nodes]
"""

DGLgraph = dgl_graph(
(graph[2][0], graph[2][1]),
num_nodes=len(graph[3]),
)
DGLgraph.ndata["feat"] = graph[0].to(F.float32)
DGLgraph.edata["feat"] = graph[1].to(F.float32)

y = F.tensor(graph[3])

# Label remapping. See self.label_remap() func
for i, label in enumerate(y):
y[i] = label_map[label.item()]

DGLgraph.ndata["label"] = y
self.graphs.append(DGLgraph)

def load(self):
with open(self.graph_path, "rb") as file:
graphs = pickle.load(file)
self.graphs = graphs

def save(self):
with open(self.graph_path, "wb") as file:
pickle.dump(self.graphs, file)

def has_cache(self):
return os.path.exists(self.graph_path)

def __getitem__(self, idx):
r"""Get the idx-th sample.
Parameters
----------
idx : int or tensor
The sample index.
1-D tensor as `idx` is allowed when transform is None.
Returns
-------
:class:`dgl.DGLGraph`
graph structure, node features, node labels and edge features.
- ``ndata['feat']``: node features
- ``ndata['label']``: node labels
- ``edata['feat']``: edge features
or
:class:`dgl.data.utils.Subset`
Subset of the dataset at specified indices
"""
if F.is_tensor(idx) and idx.dim() == 1:
if self._transform is None:
return Subset(self, idx.cpu())
raise ValueError(
"Tensor idx not supported when transform is not None."
)

if self._transform is None:
return self.graphs[idx]

return self._transform(self.graphs[idx])
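A minimal usage sketch of the new dataset class, based only on the API shown above; the batch size and the use of torch.utils.data.DataLoader with dgl.batch as collate_fn are illustrative choices, not part of this PR:

import dgl
from dgl.data import COCOSuperpixelsDataset
from torch.utils.data import DataLoader

# Validation split with the documented defaults
# (region-boundary graphs, SLIC compactness 30).
val_dataset = COCOSuperpixelsDataset(
    split="val",
    construct_format="edge_wt_region_boundary",
    slic_compactness=30,
)

# dgl.batch merges a list of DGLGraphs into one batched graph;
# batch_size=32 is an arbitrary choice for illustration.
loader = DataLoader(val_dataset, batch_size=32, collate_fn=dgl.batch)

for batched_graph in loader:
    feat = batched_graph.ndata["feat"]     # [total_nodes, 14]
    label = batched_graph.ndata["label"]   # [total_nodes]
    efeat = batched_graph.edata["feat"]    # [total_edges, 2]
    break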
17 changes: 17 additions & 0 deletions tests/integration/test_data.py
@@ -107,6 +107,23 @@ def test_VOC_superpixels():
assert g2.num_edges() - g1.num_edges() == g1.num_nodes()


@unittest.skipIf(
F._default_context_str == "gpu",
reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(
dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
)
def test_COCO_superpixels():
transform = dgl.AddSelfLoop(allow_duplicate=True)
dataset1 = data.COCOSuperpixelsDataset()
g1 = dataset1[0]
dataset2 = data.COCOSuperpixelsDataset(transform=transform)
g2 = dataset2[0]
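# AddSelfLoop(allow_duplicate=True) appends one self-loop per node,
# so the transformed graph has exactly num_nodes more edges.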

assert g2.num_edges() - g1.num_edges() == g1.num_nodes()


@unittest.skipIf(
F._default_context_str == "gpu",
reason="Datasets don't need to be tested on GPU.",