[Data] new functional for creating data splits in graph #5418

Merged
merged 21 commits into from
Mar 9, 2023
Commits
cbd793b
new functional for creating data splits in graph
gvbazhenov Mar 2, 2023
f8e3460
Merge branch 'dmlc:master' into structural-shifts
gvbazhenov Mar 3, 2023
cdd4a17
minor fix in data split implementation
gvbazhenov Mar 3, 2023
a1e8c55
Merge branch 'structural-shifts' of https://github.com/gvbazhenov/dgl…
gvbazhenov Mar 3, 2023
845274d
Merge branch 'dmlc:master' into structural-shifts
gvbazhenov Mar 6, 2023
2aab8e5
apply suggestions from code review
gvbazhenov Mar 7, 2023
09af927
Merge branch 'dmlc:master' into structural-shifts
gvbazhenov Mar 7, 2023
910d757
refactoring + unit tests
gvbazhenov Mar 7, 2023
37e933f
Merge branch 'master' into structural-shifts
mufeili Mar 8, 2023
126323e
fix test file name
gvbazhenov Mar 8, 2023
7044854
Merge branch 'structural-shifts' of https://github.com/gvbazhenov/dgl…
gvbazhenov Mar 8, 2023
6dfe6de
Merge branch 'dmlc:master' into structural-shifts
gvbazhenov Mar 8, 2023
932cca0
move imports to the top
gvbazhenov Mar 8, 2023
2fa397e
Revert "fix test file name"
gvbazhenov Mar 8, 2023
c7ddd2b
Merge branch 'master' into structural-shifts
gvbazhenov Mar 8, 2023
00c842b
remove nccl submodule
gvbazhenov Mar 8, 2023
2f9677c
Merge branch 'master' into structural-shifts
frozenbugs Mar 9, 2023
d60cf9a
address linter issues
gvbazhenov Mar 9, 2023
0a622a9
Merge branch 'structural-shifts' of https://github.com/gvbazhenov/dgl…
gvbazhenov Mar 9, 2023
6d98537
Merge branch 'dmlc:master' into structural-shifts
gvbazhenov Mar 9, 2023
57fd348
Merge branch 'structural-shifts' of https://github.com/gvbazhenov/dgl…
gvbazhenov Mar 9, 2023
2 changes: 2 additions & 0 deletions docs/source/api/python/dgl.data.rst
@@ -123,4 +123,6 @@ Utilities
    utils.save_info
    utils.load_info
    utils.add_nodepred_split
    utils.mask_nodes_by_property
    utils.add_node_property_split
    utils.Subset
192 changes: 192 additions & 0 deletions python/dgl/data/utils.py
@@ -8,6 +8,8 @@
import sys
import warnings

import networkx.algorithms as A

import numpy as np
import requests

@@ -29,6 +31,8 @@
    "save_tensors",
    "load_tensors",
    "add_nodepred_split",
    "add_node_property_split",
    "mask_nodes_by_property",
]


@@ -482,3 +486,191 @@ def add_nodepred_split(dataset, ratio, ntype=None):
    g.nodes[ntype].data["train_mask"] = train_mask
    g.nodes[ntype].data["val_mask"] = val_mask
    g.nodes[ntype].data["test_mask"] = test_mask


def mask_nodes_by_property(property_values, part_ratios, random_seed=None):
    """Provide the split masks for a node split with distributional shift based on a given
    node property, as proposed in `Evaluating Robustness and Uncertainty of Graph Models
    Under Structural Distributional Shifts <https://arxiv.org/abs/2302.13875v1>`__

    It considers the in-distribution (ID) and out-of-distribution (OOD) subsets of nodes.
    The ID subset includes training, validation and testing parts, while the OOD subset
    includes validation and testing parts. It sorts the nodes in ascending order of
    their property values, splits them into 5 non-intersecting parts, and creates 5
    associated node mask arrays:

    - 3 for the ID nodes: ``'in_train_mask'``, ``'in_valid_mask'``, ``'in_test_mask'``,
    - 2 for the OOD nodes: ``'out_valid_mask'``, ``'out_test_mask'``.

    Parameters
    ----------
    property_values : numpy.ndarray
        The node property (float) values by which the dataset will be split.
        The length of the array must be equal to the number of nodes in the graph.
    part_ratios : list
        A list of 5 ratios for the training, ID validation, ID test,
        OOD validation, and OOD test parts. The values in the list must sum to one.
    random_seed : int, optional
        Random seed to fix for the initial permutation of nodes. It is
        used to create a random order for the nodes that have the same
        property values or belong to the ID subset. (default: None)

    Returns
    -------
    split_masks : dict
        A Python dict storing the mask names as keys and the corresponding
        node mask arrays as values.

    Examples
    --------
    >>> num_nodes = 1000
    >>> property_values = np.random.uniform(size=num_nodes)
    >>> part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    >>> split_masks = dgl.data.utils.mask_nodes_by_property(property_values, part_ratios)
    >>> print('in_valid_mask' in split_masks)
    True
    """

    # Convert the ratios into integer part sizes, then correct the last part
    # so that the sizes sum exactly to the number of nodes despite rounding.
    num_nodes = len(property_values)
    part_sizes = np.round(num_nodes * np.array(part_ratios)).astype(int)
    part_sizes[-1] -= np.sum(part_sizes) - num_nodes

    generator = np.random.RandomState(random_seed)
    permutation = generator.permutation(num_nodes)

    # Shuffle the nodes first so that ties in the property values are broken
    # at random rather than by node ID.
    node_indices = np.arange(num_nodes)[permutation]
    property_values = property_values[permutation]
    in_distribution_size = np.sum(part_sizes[:3])

    # Order the nodes by property value, then re-shuffle the ID block so that
    # its train/valid/test parts are assigned at random.
    node_indices_ordered = node_indices[np.argsort(property_values)]
    node_indices_ordered[:in_distribution_size] = generator.permutation(
        node_indices_ordered[:in_distribution_size]
    )

    # Cut the ordered node list into the 5 consecutive parts and build a
    # boolean mask tensor for each part.
    sections = np.cumsum(part_sizes)
    node_split = np.split(node_indices_ordered, sections)[:-1]
    mask_names = [
        "in_train_mask",
        "in_valid_mask",
        "in_test_mask",
        "out_valid_mask",
        "out_test_mask",
    ]
    split_masks = {}

    for mask_name, node_indices in zip(mask_names, node_split):
        split_mask = idx2mask(node_indices, num_nodes)
        split_masks[mask_name] = generate_mask_tensor(split_mask)

    return split_masks
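
# Sanity check of the size arithmetic above (hypothetical numbers, not part of
# the test suite): with num_nodes = 10 and part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2],
# part_sizes is [3, 1, 1, 3, 2], sections is [3, 4, 5, 8, 10], and np.split
# yields five index arrays whose masks together cover all 10 nodes exactly once.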


def add_node_property_split(
    dataset, part_ratios, property_name, ascending=True, random_seed=None
):
    """Create a node split with distributional shift based on a given node property,
    as proposed in `Evaluating Robustness and Uncertainty of Graph Models Under
    Structural Distributional Shifts <https://arxiv.org/abs/2302.13875v1>`__

    It splits the nodes of each graph in the given dataset into 5 non-intersecting
    parts based on their structural properties. This can be used for transductive
    node prediction tasks with distributional shifts.

    It considers the in-distribution (ID) and out-of-distribution (OOD) subsets of nodes.
    The ID subset includes training, validation and testing parts, while the OOD subset
    includes validation and testing parts. As a result, it creates 5 associated node mask
    arrays for each graph:

    - 3 for the ID nodes: ``'in_train_mask'``, ``'in_valid_mask'``, ``'in_test_mask'``,
    - 2 for the OOD nodes: ``'out_valid_mask'``, ``'out_test_mask'``.

    This function implements 3 particular strategies for inducing distributional shifts
    in a graph, based on **popularity**, **locality** or **density**.

    Parameters
    ----------
    dataset : :class:`~DGLDataset` or list of :class:`~dgl.DGLGraph`
        The dataset to induce structural distributional shift on.
    part_ratios : list
        A list of 5 ratio values for the training, ID validation, ID test,
        OOD validation, and OOD test parts. The values must sum to 1.0.
    property_name : str
        The name of the node property to be used, which must be
        ``'popularity'``, ``'locality'`` or ``'density'``.
    ascending : bool, optional
        Whether to sort nodes in ascending order of the node property,
        so that nodes with greater values of the property are considered
        to be OOD (default: True)
    random_seed : int, optional
        Random seed to fix for the initial permutation of nodes. It is
        used to create a random order for the nodes that have the same
        property values or belong to the ID subset. (default: None)

    Examples
    --------
    >>> dataset = dgl.data.AmazonCoBuyComputerDataset()
    >>> print('in_valid_mask' in dataset[0].ndata)
    False
    >>> part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    >>> property_name = 'popularity'
    >>> dgl.data.utils.add_node_property_split(dataset, part_ratios, property_name)
    >>> print('in_valid_mask' in dataset[0].ndata)
    True
    """

    assert property_name in [
        "popularity",
        "locality",
        "density",
    ], "The name of the property has to be 'popularity', 'locality', or 'density'"

    assert len(part_ratios) == 5, "part_ratios must contain 5 values"

    import networkx as nx

    for idx in range(len(dataset)):
        graph_dgl = dataset[idx]
        # Convert to an undirected NetworkX graph for the property computation.
        graph_nx = nx.Graph(graph_dgl.to_networkx())

        compute_property_fn = _property_name_to_compute_fn[property_name]
        property_values = compute_property_fn(graph_nx, ascending)

        node_masks = mask_nodes_by_property(
            property_values, part_ratios, random_seed
        )

        # Attach the 5 mask arrays to the graph as node features.
        for mask_name, node_mask in node_masks.items():
            graph_dgl.ndata[mask_name] = node_mask


def _compute_popularity_property(graph_nx, ascending=True):
    # Popularity is measured by PageRank; the sign flip via `direction`
    # controls which end of the sorted property range becomes OOD.
    direction = -1 if ascending else 1
    property_values = direction * np.array(list(A.pagerank(graph_nx).values()))
    return property_values


def _compute_locality_property(graph_nx, ascending=True):
    # Locality is measured by personalized PageRank, with all restart
    # probability concentrated on the node with the highest PageRank.
    num_nodes = graph_nx.number_of_nodes()
    pagerank_values = np.array(list(A.pagerank(graph_nx).values()))

    personalization = dict(zip(range(num_nodes), [0.0] * num_nodes))
    personalization[np.argmax(pagerank_values)] = 1.0

    direction = -1 if ascending else 1
    property_values = direction * np.array(
        list(A.pagerank(graph_nx, personalization=personalization).values())
    )
    return property_values


def _compute_density_property(graph_nx, ascending=True):
    # Density is measured by the local clustering coefficient.
    direction = -1 if ascending else 1
    property_values = direction * np.array(
        list(A.clustering(graph_nx).values())
    )
    return property_values


_property_name_to_compute_fn = {
    "popularity": _compute_popularity_property,
    "locality": _compute_locality_property,
    "density": _compute_density_property,
}
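
A minimal end-to-end sketch of the new utility on a synthetic graph (illustrative only: `dgl.rand_graph` and in-degrees as a stand-in node property are not part of this PR):

>>> import dgl
>>> from dgl.data.utils import mask_nodes_by_property
>>> g = dgl.rand_graph(100, 500)
>>> property_values = g.in_degrees().numpy().astype(float)
>>> masks = mask_nodes_by_property(property_values, [0.3, 0.1, 0.1, 0.3, 0.2], random_seed=0)
>>> sum(int(m.sum()) for m in masks.values()) == g.num_nodes()
True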
53 changes: 0 additions & 53 deletions tests/python/common/data/test_data.py
@@ -408,38 +408,6 @@ def test_cluster():
    assert ds.num_classes == 6


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_extract_archive():
    # gzip
    with tempfile.TemporaryDirectory() as src_dir:
        gz_file = "gz_archive"
        gz_path = os.path.join(src_dir, gz_file + ".gz")
        content = b"test extract archive gzip"
        with gzip.open(gz_path, "wb") as f:
            f.write(content)
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(gz_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, gz_file))

    # tar
    with tempfile.TemporaryDirectory() as src_dir:
        tar_file = "tar_archive"
        tar_path = os.path.join(src_dir, tar_file + ".tar")
        # default encode to utf8
        content = "test extract archive tar\n".encode()
        info = tarfile.TarInfo(name="tar_archive")
        info.size = len(content)
        with tarfile.open(tar_path, "w") as f:
            f.addfile(info, io.BytesIO(content))
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(tar_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, tar_file))


def _test_construct_graphs_node_ids():
    from dgl.data.csv_dataset_base import (
        DGLGraphConstructor,
@@ -1659,25 +1627,6 @@ def test_csvdataset():
    _test_CSVDataset_customized_data_parser()


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_add_nodepred_split():
    dataset = data.AmazonCoBuyComputerDataset()
    print("train_mask" in dataset[0].ndata)
    data.utils.add_nodepred_split(dataset, [0.8, 0.1, 0.1])
    assert "train_mask" in dataset[0].ndata

    dataset = data.AIFBDataset()
    print("train_mask" in dataset[0].nodes["Publikationen"].data)
    data.utils.add_nodepred_split(
        dataset, [0.8, 0.1, 0.1], ntype="Publikationen"
    )
    assert "train_mask" in dataset[0].nodes["Publikationen"].data


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
@@ -2094,9 +2043,7 @@ def test_as_graphpred_ogb():
    test_tudataset_regression()
    test_fraud()
    test_fakenews()
    test_extract_archive()
    test_csvdataset()
    test_add_nodepred_split()
    test_as_nodepred1()
    test_as_nodepred2()
    test_as_nodepred_csvdataset()
102 changes: 102 additions & 0 deletions tests/python/common/data/test_utils.py
@@ -0,0 +1,102 @@
import gzip
import io
import os
import tarfile
import tempfile
import unittest

import backend as F

import dgl
import dgl.data as data
import numpy as np
import pandas as pd
import pytest
import yaml
from dgl import DGLError


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_add_nodepred_split():
    dataset = data.AmazonCoBuyComputerDataset()
    print("train_mask" in dataset[0].ndata)
    data.utils.add_nodepred_split(dataset, [0.8, 0.1, 0.1])
    assert "train_mask" in dataset[0].ndata

    dataset = data.AIFBDataset()
    print("train_mask" in dataset[0].nodes["Publikationen"].data)
    data.utils.add_nodepred_split(
        dataset, [0.8, 0.1, 0.1], ntype="Publikationen"
    )
    assert "train_mask" in dataset[0].nodes["Publikationen"].data


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_extract_archive():
    # gzip
    with tempfile.TemporaryDirectory() as src_dir:
        gz_file = "gz_archive"
        gz_path = os.path.join(src_dir, gz_file + ".gz")
        content = b"test extract archive gzip"
        with gzip.open(gz_path, "wb") as f:
            f.write(content)
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(gz_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, gz_file))

    # tar
    with tempfile.TemporaryDirectory() as src_dir:
        tar_file = "tar_archive"
        tar_path = os.path.join(src_dir, tar_file + ".tar")
        # default encode to utf8
        content = "test extract archive tar\n".encode()
        info = tarfile.TarInfo(name="tar_archive")
        info.size = len(content)
        with tarfile.open(tar_path, "w") as f:
            f.addfile(info, io.BytesIO(content))
        with tempfile.TemporaryDirectory() as dst_dir:
            data.utils.extract_archive(tar_path, dst_dir, overwrite=True)
            assert os.path.exists(os.path.join(dst_dir, tar_file))


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_mask_nodes_by_property():
    num_nodes = 1000
    property_values = np.random.uniform(size=num_nodes)
    part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    split_masks = data.utils.mask_nodes_by_property(
        property_values, part_ratios
    )
    assert "in_valid_mask" in split_masks
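    # A stricter invariant one could also check here (sketch, not part of the
    # PR): the five masks should cover every node exactly once.
    # assert sum(int(m.sum()) for m in split_masks.values()) == num_nodes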


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(dgl.backend.backend_name == "mxnet", reason="Skip MXNet")
def test_add_node_property_split():
    dataset = data.AmazonCoBuyComputerDataset()
    part_ratios = [0.3, 0.1, 0.1, 0.3, 0.2]
    for property_name in ["popularity", "locality", "density"]:
        data.utils.add_node_property_split(dataset, part_ratios, property_name)
        assert "in_valid_mask" in dataset[0].ndata


if __name__ == "__main__":
    test_extract_archive()
    test_add_nodepred_split()
    test_mask_nodes_by_property()
    test_add_node_property_split()