[Data] new functional for creating data splits in graph #5418

Merged
merged 21 commits into from
Mar 9, 2023
Commits
cbd793b
new functional for creating data splits in graph
gvbazhenov Mar 2, 2023
f8e3460
Merge branch 'dmlc:master' into structural-shifts
gvbazhenov Mar 3, 2023
cdd4a17
minor fix in data split implementation
gvbazhenov Mar 3, 2023
a1e8c55
Merge branch 'structural-shifts' of https://github.com/gvbazhenov/dgl…
gvbazhenov Mar 3, 2023
845274d
Merge branch 'dmlc:master' into structural-shifts
gvbazhenov Mar 6, 2023
2aab8e5
apply suggestions from code review
gvbazhenov Mar 7, 2023
09af927
Merge branch 'dmlc:master' into structural-shifts
gvbazhenov Mar 7, 2023
910d757
refactoring + unit tests
gvbazhenov Mar 7, 2023
37e933f
Merge branch 'master' into structural-shifts
mufeili Mar 8, 2023
126323e
fix test file name
gvbazhenov Mar 8, 2023
7044854
Merge branch 'structural-shifts' of https://github.com/gvbazhenov/dgl…
gvbazhenov Mar 8, 2023
6dfe6de
Merge branch 'dmlc:master' into structural-shifts
gvbazhenov Mar 8, 2023
932cca0
move imports to the top
gvbazhenov Mar 8, 2023
2fa397e
Revert "fix test file name"
gvbazhenov Mar 8, 2023
c7ddd2b
Merge branch 'master' into structural-shifts
gvbazhenov Mar 8, 2023
00c842b
remove nccl submodule
gvbazhenov Mar 8, 2023
2f9677c
Merge branch 'master' into structural-shifts
frozenbugs Mar 9, 2023
d60cf9a
address linter issues
gvbazhenov Mar 9, 2023
0a622a9
Merge branch 'structural-shifts' of https://github.com/gvbazhenov/dgl…
gvbazhenov Mar 9, 2023
6d98537
Merge branch 'dmlc:master' into structural-shifts
gvbazhenov Mar 9, 2023
57fd348
Merge branch 'structural-shifts' of https://github.com/gvbazhenov/dgl…
gvbazhenov Mar 9, 2023
2 changes: 2 additions & 0 deletions docs/source/api/python/dgl.data.rst
@@ -123,4 +123,6 @@ Utilities
    utils.save_info
    utils.load_info
    utils.add_nodepred_split
    utils.mask_nodes_by_property
    utils.add_node_property_split
    utils.Subset
192 changes: 192 additions & 0 deletions python/dgl/data/utils.py
@@ -8,6 +8,8 @@
import sys
import warnings

import networkx.algorithms as A

import numpy as np
import requests

@@ -29,6 +31,8 @@
    "save_tensors",
    "load_tensors",
    "add_nodepred_split",
    "add_node_property_split",
    "mask_nodes_by_property",
]


@@ -482,3 +486,191 @@ def add_nodepred_split(dataset, ratio, ntype=None):
    g.nodes[ntype].data["train_mask"] = train_mask
    g.nodes[ntype].data["val_mask"] = val_mask
    g.nodes[ntype].data["test_mask"] = test_mask


def mask_nodes_by_property(property_values, part_ratios, random_seed=None):
    """Provide the split masks for a node split with distributional shift based on a given
    node property, as proposed in `Evaluating Robustness and Uncertainty of Graph Models
    Under Structural Distributional Shifts <https://arxiv.org/abs/2302.13875v1>`__

    It considers the in-distribution (ID) and out-of-distribution (OOD) subsets of nodes.
    The ID subset includes training, validation and testing parts, while the OOD subset
    includes validation and testing parts. It sorts the nodes in ascending order of
    their property values, splits them into 5 non-intersecting parts, and creates 5
    associated node mask arrays:

    - 3 for the ID nodes: ``'in_train_mask'``, ``'in_valid_mask'``, ``'in_test_mask'``,
    - 2 for the OOD nodes: ``'out_valid_mask'``, ``'out_test_mask'``.

    Parameters
    ----------
    property_values : numpy.ndarray
        The node property (float) values by which the dataset will be split.
        The length of the array must be equal to the number of nodes in the graph.
    part_ratios : list
        A list of 5 ratios for the training, ID validation, ID test,
        OOD validation, and OOD test parts. The values in the list must sum to one.
    random_seed : int, optional
        Random seed to fix for the initial permutation of nodes. It is
        used to create a random order for the nodes that have the same
        property values or belong to the ID subset. (default: None)

    Returns
    -------
    split_masks : dict
        A Python dict storing the mask names as keys and the corresponding
        node mask arrays as values.

    Examples
    --------
    >>> num_nodes = 1000
    >>> property_values = np.random.uniform(size=num_nodes)
    >>> part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    >>> split_masks = dgl.data.utils.mask_nodes_by_property(property_values, part_ratios)
    >>> print('in_valid_mask' in split_masks)
    True
    """

    # Convert the ratios into integer part sizes, then correct the last part
    # so that the sizes sum exactly to the number of nodes despite rounding.
    num_nodes = len(property_values)
    part_sizes = np.round(num_nodes * np.array(part_ratios)).astype(int)
    part_sizes[-1] -= np.sum(part_sizes) - num_nodes

    generator = np.random.RandomState(random_seed)
    permutation = generator.permutation(num_nodes)

    # Shuffle the nodes first so that ties in the property values are broken
    # at random rather than by node ID.
    node_indices = np.arange(num_nodes)[permutation]
    property_values = property_values[permutation]
    in_distribution_size = np.sum(part_sizes[:3])

    # Order the nodes by property value, then re-shuffle the ID block so that
    # its train/valid/test parts are assigned at random.
    node_indices_ordered = node_indices[np.argsort(property_values)]
    node_indices_ordered[:in_distribution_size] = generator.permutation(
        node_indices_ordered[:in_distribution_size]
    )

    # Cut the ordered node list into the 5 consecutive parts and build a
    # boolean mask tensor for each part.
    sections = np.cumsum(part_sizes)
    node_split = np.split(node_indices_ordered, sections)[:-1]
    mask_names = [
        "in_train_mask",
        "in_valid_mask",
        "in_test_mask",
        "out_valid_mask",
        "out_test_mask",
    ]
    split_masks = {}

    for mask_name, node_indices in zip(mask_names, node_split):
        split_mask = idx2mask(node_indices, num_nodes)
        split_masks[mask_name] = generate_mask_tensor(split_mask)

    return split_masks
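
# Sanity check of the size arithmetic above (hypothetical numbers, not part of
# the test suite): with num_nodes = 10 and part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2],
# part_sizes is [3, 1, 1, 3, 2], sections is [3, 4, 5, 8, 10], and np.split
# yields five index arrays whose masks together cover all 10 nodes exactly once.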


def add_node_property_split(
    dataset, part_ratios, property_name, ascending=True, random_seed=None
):
    """Create a node split with distributional shift based on a given node property,
    as proposed in `Evaluating Robustness and Uncertainty of Graph Models Under
    Structural Distributional Shifts <https://arxiv.org/abs/2302.13875v1>`__

    It splits the nodes of each graph in the given dataset into 5 non-intersecting
    parts based on their structural properties. This can be used for transductive
    node prediction tasks with distributional shifts.

    It considers the in-distribution (ID) and out-of-distribution (OOD) subsets of nodes.
    The ID subset includes training, validation and testing parts, while the OOD subset
    includes validation and testing parts. As a result, it creates 5 associated node mask
    arrays for each graph:

    - 3 for the ID nodes: ``'in_train_mask'``, ``'in_valid_mask'``, ``'in_test_mask'``,
    - 2 for the OOD nodes: ``'out_valid_mask'``, ``'out_test_mask'``.

    This function implements 3 particular strategies for inducing distributional shifts
    in a graph, based on **popularity**, **locality** or **density**.

    Parameters
    ----------
    dataset : :class:`~DGLDataset` or list of :class:`~dgl.DGLGraph`
        The dataset to induce structural distributional shift on.
    part_ratios : list
        A list of 5 ratio values for the training, ID validation, ID test,
        OOD validation, and OOD test parts. The values must sum to 1.0.
    property_name : str
        The name of the node property to be used, which must be
        ``'popularity'``, ``'locality'`` or ``'density'``.
    ascending : bool, optional
        Whether to sort nodes in ascending order of the node property,
        so that nodes with greater values of the property are considered
        to be OOD (default: True)
    random_seed : int, optional
        Random seed to fix for the initial permutation of nodes. It is
        used to create a random order for the nodes that have the same
        property values or belong to the ID subset. (default: None)

    Examples
    --------
    >>> dataset = dgl.data.AmazonCoBuyComputerDataset()
    >>> print('in_valid_mask' in dataset[0].ndata)
    False
    >>> part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    >>> property_name = 'popularity'
    >>> dgl.data.utils.add_node_property_split(dataset, part_ratios, property_name)
    >>> print('in_valid_mask' in dataset[0].ndata)
    True
    """

    assert property_name in [
        "popularity",
        "locality",
        "density",
    ], "The name of the property has to be 'popularity', 'locality', or 'density'"

    assert len(part_ratios) == 5, "part_ratios must contain 5 values"

    import networkx as nx

    for idx in range(len(dataset)):
        graph_dgl = dataset[idx]
        # Convert to an undirected NetworkX graph for the property computation.
        graph_nx = nx.Graph(graph_dgl.to_networkx())

        compute_property_fn = _property_name_to_compute_fn[property_name]
        property_values = compute_property_fn(graph_nx, ascending)

        node_masks = mask_nodes_by_property(
            property_values, part_ratios, random_seed
        )

        # Attach the 5 mask arrays to the graph as node features.
        for mask_name, node_mask in node_masks.items():
            graph_dgl.ndata[mask_name] = node_mask


def _compute_popularity_property(graph_nx, ascending=True):
    # Popularity is measured by PageRank; the sign flip via `direction`
    # controls which end of the sorted property range becomes OOD.
    direction = -1 if ascending else 1
    property_values = direction * np.array(list(A.pagerank(graph_nx).values()))
    return property_values


def _compute_locality_property(graph_nx, ascending=True):
    # Locality is measured by personalized PageRank, with all restart
    # probability concentrated on the node with the highest PageRank.
    num_nodes = graph_nx.number_of_nodes()
    pagerank_values = np.array(list(A.pagerank(graph_nx).values()))

    personalization = dict(zip(range(num_nodes), [0.0] * num_nodes))
    personalization[np.argmax(pagerank_values)] = 1.0

    direction = -1 if ascending else 1
    property_values = direction * np.array(
        list(A.pagerank(graph_nx, personalization=personalization).values())
    )
    return property_values


def _compute_density_property(graph_nx, ascending=True):
    # Density is measured by the local clustering coefficient.
    direction = -1 if ascending else 1
    property_values = direction * np.array(
        list(A.clustering(graph_nx).values())
    )
    return property_values


_property_name_to_compute_fn = {
    "popularity": _compute_popularity_property,
    "locality": _compute_locality_property,
    "density": _compute_density_property,
}
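
A minimal end-to-end sketch of the new utility on a synthetic graph (illustrative only: `dgl.rand_graph` and in-degrees as a stand-in node property are not part of this PR):

>>> import dgl
>>> from dgl.data.utils import mask_nodes_by_property
>>> g = dgl.rand_graph(100, 500)
>>> property_values = g.in_degrees().numpy().astype(float)
>>> masks = mask_nodes_by_property(property_values, [0.3, 0.1, 0.1, 0.3, 0.2], random_seed=0)
>>> sum(int(m.sum()) for m in masks.values()) == g.num_nodes()
True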
53 changes: 0 additions & 53 deletions tests/python/common/data/test_data.py
@@ -408,38 +408,6 @@ def test_cluster():
    assert ds.num_classes == 6


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_extract_archive():
    # gzip
    with tempfile.TemporaryDirectory() as src_dir:
        gz_file = "gz_archive"
        gz_path = os.path.join(src_dir, gz_file + ".gz")
        content = b"test extract archive gzip"
        with gzip.open(gz_path, "wb") as f:
            f.write(content)
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(gz_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, gz_file))

    # tar
    with tempfile.TemporaryDirectory() as src_dir:
        tar_file = "tar_archive"
        tar_path = os.path.join(src_dir, tar_file + ".tar")
        # default encode to utf8
        content = "test extract archive tar\n".encode()
        info = tarfile.TarInfo(name="tar_archive")
        info.size = len(content)
        with tarfile.open(tar_path, "w") as f:
            f.addfile(info, io.BytesIO(content))
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(tar_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, tar_file))


def _test_construct_graphs_node_ids():
    from dgl.data.csv_dataset_base import (
        DGLGraphConstructor,
@@ -1659,25 +1627,6 @@ def test_csvdataset():
    _test_CSVDataset_customized_data_parser()


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_add_nodepred_split():
    dataset = data.AmazonCoBuyComputerDataset()
    print("train_mask" in dataset[0].ndata)
    data.utils.add_nodepred_split(dataset, [0.8, 0.1, 0.1])
    assert "train_mask" in dataset[0].ndata

    dataset = data.AIFBDataset()
    print("train_mask" in dataset[0].nodes["Publikationen"].data)
    data.utils.add_nodepred_split(
        dataset, [0.8, 0.1, 0.1], ntype="Publikationen"
    )
    assert "train_mask" in dataset[0].nodes["Publikationen"].data


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
@@ -2094,9 +2043,7 @@ def test_as_graphpred_ogb():
    test_tudataset_regression()
    test_fraud()
    test_fakenews()
    test_extract_archive()
    test_csvdataset()
    test_add_nodepred_split()
    test_as_nodepred1()
    test_as_nodepred2()
    test_as_nodepred_csvdataset()
102 changes: 102 additions & 0 deletions tests/python/common/data/test_utils.py
@@ -0,0 +1,102 @@
import gzip
import io
import os
import tarfile
import tempfile
import unittest

import backend as F

import dgl
import dgl.data as data
import numpy as np
import pandas as pd
import pytest
import yaml
from dgl import DGLError


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_add_nodepred_split():
    dataset = data.AmazonCoBuyComputerDataset()
    print("train_mask" in dataset[0].ndata)
    data.utils.add_nodepred_split(dataset, [0.8, 0.1, 0.1])
    assert "train_mask" in dataset[0].ndata

    dataset = data.AIFBDataset()
    print("train_mask" in dataset[0].nodes["Publikationen"].data)
    data.utils.add_nodepred_split(
        dataset, [0.8, 0.1, 0.1], ntype="Publikationen"
    )
    assert "train_mask" in dataset[0].nodes["Publikationen"].data


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_extract_archive():
    # gzip
    with tempfile.TemporaryDirectory() as src_dir:
        gz_file = "gz_archive"
        gz_path = os.path.join(src_dir, gz_file + ".gz")
        content = b"test extract archive gzip"
        with gzip.open(gz_path, "wb") as f:
            f.write(content)
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(gz_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, gz_file))

    # tar
    with tempfile.TemporaryDirectory() as src_dir:
        tar_file = "tar_archive"
        tar_path = os.path.join(src_dir, tar_file + ".tar")
        # default encode to utf8
        content = "test extract archive tar\n".encode()
        info = tarfile.TarInfo(name="tar_archive")
        info.size = len(content)
        with tarfile.open(tar_path, "w") as f:
            f.addfile(info, io.BytesIO(content))
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(tar_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, tar_file))


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_mask_nodes_by_property():
    num_nodes = 1000
    property_values = np.random.uniform(size=num_nodes)
    part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    split_masks = data.utils.mask_nodes_by_property(
        property_values, part_ratios
    )
    assert "in_valid_mask" in split_masks
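    # A stricter invariant one could also check here (sketch, not part of the
    # PR): the five masks should cover every node exactly once.
    # assert sum(int(m.sum()) for m in split_masks.values()) == num_nodes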


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_add_node_property_split():
    dataset = data.AmazonCoBuyComputerDataset()
    part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    for property_name in ["popularity", "locality", "density"]:
        data.utils.add_node_property_split(dataset, part_ratios, property_name)
        assert "in_valid_mask" in dataset[0].ndata


if __name__ == "__main__":
    test_extract_archive()
    test_add_nodepred_split()
    test_mask_nodes_by_property()
    test_add_node_property_split()