Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Dataset] Add CIFAR10 MNIST dataset in benchmark-gnn #6543

Merged
merged 4 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/dgl/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
except ImportError:
pass
from .pattern import PATTERNDataset
from .superpixel import SuperPixelDataset
paoxiaode marked this conversation as resolved.
Show resolved Hide resolved
from .wikics import WikiCSDataset
from .yelp import YelpDataset
from .zinc import ZINCDataset
Expand Down
362 changes: 362 additions & 0 deletions python/dgl/data/superpixel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,362 @@
import os
import pickle

import numpy as np
from scipy.spatial.distance import cdist
from tqdm import tqdm

from .. import backend as F
from ..convert import graph as dgl_graph

from .dgl_dataset import DGLDataset
from .utils import download, extract_archive, load_graphs, save_graphs, Subset


def sigma(dists, kth=8):
    """Per-node scale factor: mean distance to the ``kth`` nearest neighbors.

    Parameters
    ----------
    dists : numpy.ndarray
        Square pairwise-distance matrix of shape ``(num_nodes, num_nodes)``.
    kth : int
        Number of nearest neighbors to average over. Default: 8.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(num_nodes, 1)`` holding each node's scale,
        with a small epsilon added so it is never exactly zero.
    """
    num_nodes = dists.shape[0]
    try:
        # Partition each row so its (kth + 1) smallest distances occupy the
        # leading positions, then slice them out. The self-distance (0) is
        # among them, so summing and dividing by ``kth`` yields the mean
        # distance to the k nearest *other* nodes.
        nearest = np.partition(dists, kth, axis=-1)[:, kth::-1]
        scale = nearest.sum(axis=1).reshape((num_nodes, 1)) / kth
    except ValueError:
        # Fewer than ``kth`` + 1 entries per row: fall back to a unit scale.
        scale = np.ones((num_nodes, 1))

    # Epsilon keeps downstream divisions by sigma well-defined.
    return scale + 1e-8


def compute_adjacency_matrix_images(coord, feat, use_feat=True):
    """Build a dense Gaussian-kernel adjacency matrix for one superpixel image.

    Parameters
    ----------
    coord : numpy.ndarray
        Superpixel coordinates; reshaped to ``(num_nodes, 2)``.
    feat : numpy.ndarray
        Superpixel features (one row per node).
    use_feat : bool
        If True, the kernel combines coordinate and feature distances;
        otherwise only coordinate distances are used. Default: True.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(num_nodes, num_nodes)`` adjacency with zero diagonal.
    """
    xy = coord.reshape(-1, 2)
    spatial = cdist(xy, xy)

    # Squared distances, each normalized by its own per-node sigma.
    exponent = (spatial / sigma(spatial)) ** 2
    if use_feat:
        visual = cdist(feat, feat)
        exponent = exponent + (visual / sigma(visual)) ** 2
    A = np.exp(-exponent)

    # Force exact symmetry and remove self-similarity.
    A = 0.5 * (A + A.T)
    A[np.diag_indices_from(A)] = 0
    return A


def compute_edges_list(A, kth=8 + 1):
    """Select, for every node, its most similar neighbors from ``A``.

    Parameters
    ----------
    A : numpy.ndarray
        Symmetric ``(num_nodes, num_nodes)`` similarity matrix with zero
        diagonal (see :func:`compute_adjacency_matrix_images`).
    kth : int
        Size of the top partition taken per row. Default: 9.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Per-row neighbor indices and the matching similarity values.
    """
    num_nodes = A.shape[0]
    cutoff = num_nodes - kth

    if num_nodes > 9:
        # Partition each row so its ``kth`` largest similarities sit at the
        # tail; slicing ``cutoff:-1`` keeps kth - 1 = 8 of them per node.
        # The zero diagonal is never in the top partition, so no self loops.
        neighbor_idx = np.argpartition(A, cutoff - 1, axis=-1)[:, cutoff:-1]
        neighbor_sim = np.partition(A, cutoff - 1, axis=-1)[:, cutoff:-1]
    else:
        # Too few nodes for a real k-NN selection; the resulting graph is
        # fully connected instead.
        neighbor_idx = np.tile(np.arange(num_nodes), num_nodes).reshape(
            num_nodes, num_nodes
        )
        neighbor_sim = A
        if num_nodes != 1:
            # Strip the self loop from every row (a 1-node graph keeps it,
            # since it would otherwise have no edges at all).
            not_self = neighbor_idx != np.arange(num_nodes)[:, None]
            neighbor_sim = A[not_self].reshape(num_nodes, -1)
            neighbor_idx = neighbor_idx[not_self].reshape(num_nodes, -1)
    return neighbor_idx, neighbor_sim


class SuperPixelDataset(DGLDataset):
    r"""MNIST and CIFAR10 superpixel dataset for the graph classification task.

    DGL dataset of MNIST and CIFAR10 in the benchmark-gnn which contains graphs
    converted from the original MNIST and CIFAR10 images.

    Reference `<http://arxiv.org/abs/2003.00982>`_

    Statistics:

    MNIST:

    - Train examples: 60,000
    - Test examples: 10,000
    - Size of dataset images: 28

    CIFAR10:

    - Train examples: 45,000
    - Test examples: 10,000
    - Size of dataset images: 32

    Parameters
    ----------
    raw_dir : str
        Directory to store all the downloaded raw datasets.
        Default: "~/.dgl/".
    name : str
        Should be chosen from ["MNIST", "CIFAR10"].
        Default: "MNIST".
    split : str
        Should be chosen from ["train", "test"].
        Default: "train".
    use_mean_px : bool
        Whether superpixel features are used in addition to locations when
        building the adjacency matrix.

        - True: Adj matrix defined from super-pixel locations + features
        - False: Adj matrix defined from super-pixel locations (only)

        Default: False.
    force_reload : bool
        Whether to reload the dataset.
        Default: False.
    verbose : bool
        Whether to print out progress information.
        Default: False.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Examples
    --------
    >>> from dgl.data import SuperPixelDataset

    >>> # MNIST dataset
    >>> train_dataset = SuperPixelDataset(split="train")
    >>> len(train_dataset)
    60000
    >>> train_dataset.img_size
    28
    >>> graph, label = train_dataset[0]
    >>> graph
    Graph(num_nodes=71, num_edges=568,
          ndata_schemes={'feat': Scheme(shape=(3,), dtype=torch.float16)}
          edata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float16)})

    >>> # CIFAR10 dataset
    >>> train_dataset = SuperPixelDataset(name="CIFAR10", split="train")
    >>> len(train_dataset)
    45000
    >>> train_dataset.img_size
    32
    >>> graph, label = train_dataset[0]

    >>> # support tensor to be index when transform is None
    >>> # see details in __getitem__ function
    >>> import torch
    >>> idx = torch.tensor([0, 1, 2])
    >>> train_dataset_subset = train_dataset[idx]
    >>> train_dataset_subset[0]
    Graph(num_nodes=71, num_edges=568,
          ndata_schemes={'feat': Scheme(shape=(3,), dtype=torch.float16)}
          edata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float16)})
    """

    def __init__(
        self,
        raw_dir=None,
        name="MNIST",
        split="train",
        use_mean_px=False,
        force_reload=False,
        verbose=False,
        transform=None,
    ):
        assert split in ["train", "test"], "split not valid."
        assert name in ["MNIST", "CIFAR10"], "name not valid."

        self.use_mean_px = use_mean_px
        self.split = split
        # User-facing dataset name ("MNIST"/"CIFAR10"); kept separate from the
        # DGLDataset ``name`` below, which selects the on-disk directory.
        self._dataset_name = name
        self.graphs = []
        self.labels = []

        super().__init__(
            name="Superpixel",
            raw_dir=raw_dir,
            # NOTE(review): triple-quoted URL carries leading/trailing
            # whitespace and newlines — presumably stripped inside
            # ``download()``; confirm it tolerates this form.
            url="""
            https://www.dropbox.com/s/y2qwa77a0fxem47/superpixels.zip?dl=1
            """,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def img_size(self):
        r"""Size of dataset image (side length in pixels: 28 or 32)."""
        if self._dataset_name == "MNIST":
            return 28
        return 32

    @property
    def save_path(self):
        r"""Directory to save the processed dataset."""
        return os.path.join(self.raw_path, "processed")

    @property
    def raw_data_path(self):
        r"""Path to save the raw dataset file."""
        return os.path.join(self.raw_path, "superpixels.zip")

    @property
    def graph_path(self):
        r"""Path to save the processed dataset file.

        The filename encodes both the dataset/split and whether superpixel
        features were used, so the two variants are cached separately.
        """
        if self.use_mean_px:
            return os.path.join(
                self.save_path,
                f"use_feat_{self._dataset_name}_{self.split}.pkl",
            )
        return os.path.join(
            self.save_path, f"{self._dataset_name}_{self.split}.pkl"
        )

    def download(self):
        # Fetch the superpixels archive and unpack it under ``raw_path``.
        path = download(self.url, path=self.raw_data_path)
        extract_archive(path, target_dir=self.raw_path, overwrite=True)

    def process(self):
        # Map the public dataset name to the pickle file prefix shipped in
        # the downloaded archive (75 superpixels for MNIST, 150 for CIFAR10).
        if self._dataset_name == "MNIST":
            plk_file = "mnist_75sp"
        elif self._dataset_name == "CIFAR10":
            plk_file = "cifar10_150sp"

        with open(
            os.path.join(
                self.raw_path, "superpixels", f"{plk_file}_{self.split}.pkl"
            ),
            "rb",
        ) as f:
            # The pickle holds (labels, per-image superpixel data).
            self.labels, self.sp_data = pickle.load(f)
            self.labels = F.tensor(self.labels)

        (
            self.Adj_matrices,
            self.node_features,
            self.edges_lists,
            self.edge_features,
        ) = (
            [],
            [],
            [],
            [],
        )
        # First pass: turn each image's superpixels into an adjacency matrix
        # plus k-NN edge lists and node/edge feature arrays.
        for index, sample in enumerate(
            tqdm(self.sp_data, desc=f"Processing {self.split} dataset")
        ):
            # Each sample starts with (mean_px, coord); mean_px is the mean
            # pixel intensity per superpixel, coord presumably its centroid
            # — confirm against the archive's documentation.
            mean_px, coord = sample[:2]
            # Normalize coordinates to [0, 1] by the image side length.
            coord = coord / self.img_size

            if self.use_mean_px:
                A = compute_adjacency_matrix_images(
                    coord, mean_px
                )  # using super-pixel locations + features
            else:
                A = compute_adjacency_matrix_images(
                    coord, mean_px, False
                )  # using only super-pixel locations
            edges_list, edge_values_list = compute_edges_list(A)

            N_nodes = A.shape[0]

            # Node feature = per-superpixel intensity concatenated with its
            # normalized 2-D coordinate.
            mean_px = mean_px.reshape(N_nodes, -1)
            coord = coord.reshape(N_nodes, 2)
            x = np.concatenate((mean_px, coord), axis=1)

            edge_values_list = edge_values_list.reshape(-1)  # TO DOUBLE-CHECK !

            self.node_features.append(x)
            self.edge_features.append(edge_values_list)
            self.Adj_matrices.append(A)
            self.edges_lists.append(edges_list)

        # Second pass: materialize a DGLGraph per image with half-precision
        # node/edge features.
        for index in tqdm(
            range(len(self.sp_data)), desc=f"Dump {self.split} dataset"
        ):
            N = self.node_features[index].shape[0]

            src_nodes = []
            dst_nodes = []
            for src, dsts in enumerate(self.edges_lists[index]):
                # handling for 1 node where the self loop would be the only edge
                if N == 1:
                    src_nodes.append(src)
                    dst_nodes.append(dsts)
                else:
                    # Drop any residual self loop before fanning out edges.
                    dsts = dsts[dsts != src]
                    srcs = [src] * len(dsts)
                    src_nodes.extend(srcs)
                    dst_nodes.extend(dsts)

            src_nodes = F.tensor(src_nodes)
            dst_nodes = F.tensor(dst_nodes)

            g = dgl_graph((src_nodes, dst_nodes), num_nodes=N)
            # Features are stored as float16 to keep the cached dataset small.
            g.ndata["feat"] = (
                F.zerocopy_from_numpy(self.node_features[index])
                .to(F.float32)
                .half()
            )
            g.edata["feat"] = (
                F.zerocopy_from_numpy(self.edge_features[index])
                .to(F.float32)
                .unsqueeze(1)
                .half()
            )

            self.graphs.append(g)

    def load(self):
        # Restore graphs and labels from the processed cache file.
        self.graphs, label_dict = load_graphs(self.graph_path)
        self.labels = label_dict["labels"]

    def save(self):
        # Persist graphs together with their labels to the cache file.
        save_graphs(
            self.graph_path, self.graphs, labels={"labels": self.labels}
        )

    def has_cache(self):
        # A processed cache exists iff the graph file is on disk.
        return os.path.exists(self.graph_path)

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        """Get the idx-th sample.

        Parameters
        ----------
        idx : int or tensor
            The sample index.
            1-D tensor as `idx` is allowed when transform is None.

        Returns
        -------
        (:class:`dgl.DGLGraph`, Tensor)
            Graph with node feature stored in ``feat`` field and its label.
        or
        :class:`dgl.data.utils.Subset`
            Subset of the dataset at specified indices
        """
        if F.is_tensor(idx) and idx.dim() == 1:
            if self._transform is None:
                return Subset(self, idx.cpu())

            raise ValueError(
                "Tensor idx not supported when transform is not None."
            )

        if self._transform is None:
            return self.graphs[idx], self.labels[idx]

        return self._transform(self.graphs[idx]), self.labels[idx]
17 changes: 17 additions & 0 deletions tests/integration/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,23 @@ def test_COCO_superpixels():
assert g2.num_edges() - g1.num_edges() == g1.num_nodes()


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(
    dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
)
def test_super_pixel():
    # Load the same sample with and without the self-loop transform; the
    # transformed graph is expected to gain exactly one edge per node.
    add_loops = dgl.AddSelfLoop(allow_duplicate=True)
    plain_graph, _ = data.SuperPixelDataset()[0]
    looped_graph, _ = data.SuperPixelDataset(transform=add_loops)[0]

    assert looped_graph.num_edges() - plain_graph.num_edges() == plain_graph.num_nodes()


@unittest.skipIf(
F._default_context_str == "gpu",
reason="Datasets don't need to be tested on GPU.",
Expand Down