dmlc · mufeili · Mar 9, 2023 · Mar 2, 2023 · Mar 3, 2023 · Mar 3, 2023
diff --git a/docs/source/api/python/dgl.data.rst b/docs/source/api/python/dgl.data.rst
@@ -122,4 +122,6 @@ Utilities
     utils.save_info
     utils.load_info
     utils.add_nodepred_split
+    utils.mask_nodes_by_property
+    utils.add_node_property_split
     utils.Subset
diff --git a/python/dgl/data/utils.py b/python/dgl/data/utils.py
@@ -29,6 +29,8 @@
     "save_tensors",
     "load_tensors",
     "add_nodepred_split",
+    "add_node_property_split",
+    "mask_nodes_by_property",
 ]
 
 
@@ -482,3 +484,198 @@ def add_nodepred_split(dataset, ratio, ntype=None):
         g.nodes[ntype].data["train_mask"] = train_mask
         g.nodes[ntype].data["val_mask"] = val_mask
         g.nodes[ntype].data["test_mask"] = test_mask
+
+
+def mask_nodes_by_property(property_values, part_ratios, random_seed=None):
+    """Provide the split masks for training, ID and OOD validation, ID and OOD
+    testing according to the node property values.
+
+    It sorts the nodes in the ascending order of their property values, splits
+    them into 5 non-intersecting parts, and creates 5 associated node mask arrays:
+        - 3 for the ID nodes: ``'in_train_mask'``, ``'in_valid_mask'``, ``'in_test_mask'``,
+        - and 2 for the OOD nodes: ``'out_valid_mask'``, ``'out_test_mask'``.
+
+    As described in `Evaluating Robustness and Uncertainty of Graph Models Under
+    Structural Distributional Shifts <https://arxiv.org/abs/2302.13875v1>`__,
+    this approach allows to create data splits with distributional shifts.
+
+    Parameters
+    ----------
+    property_values : numpy ndarray
+        The node property (float) values to split the dataset by.
+        The length of array must be equal to the number of nodes in graph.
+    part_ratios : list
+        A list of 5 ratios for training, ID validation, ID test,
+        OOD validation, OOD testing parts. The values in list must sum to one.
+    random_seed : int, optional
+        Random seed to fix for the initial permutation of nodes. It is
+        used to create a random order for the nodes that have the same
+        property values. (default: None)
+
+    Returns
+    ----------
+    split_masks : dict
+        A python dict storing the mask names as keys and the corresponding
+        node mask arrays as values.
+    """
+
+    num_nodes = len(property_values)
+    part_sizes = np.round(num_nodes * np.array(part_ratios)).astype(int)
+    part_sizes[-1] -= np.sum(part_sizes) - num_nodes
+
+    generator = np.random.RandomState(random_seed)
+    permutation = generator.permutation(num_nodes)
+
+    node_indices = np.arange(num_nodes)[permutation]
+    property_values = property_values[permutation]
+    in_distribution_size = np.sum(part_sizes[:3])
+
+    node_indices_ordered = node_indices[np.argsort(property_values)]
+    node_indices_ordered[: in_distribution_size] = generator.permutation(
+        node_indices_ordered[: in_distribution_size]
+    )
+
+    sections = np.cumsum(part_sizes)
+    node_split = np.split(node_indices_ordered, sections)[:-1]
+    mask_names = [
+        "in_train_mask",
+        "in_valid_mask",
+        "in_test_mask",
+        "out_valid_mask",
+        "out_test_mask",
+    ]
+    split_masks = {}
+
+    for mask_name, node_indices in zip(mask_names, node_split):
+        split_mask = idx2mask(node_indices, num_nodes)
+        split_masks[mask_name] = generate_mask_tensor(split_mask)
+
+    return split_masks
+
+
+def add_node_property_split(
+    dataset, part_ratios, property_name, ascending=True, random_seed=None
+):
+    """Create a data split with a distributional shift based on some node property.
+
+    It splits each graph in the given dataset into training, ID and OOD validation, ID and OOD
+    testing parts for transductive node prediction task with structural distributional shifts.
+    As a result, it creates 5 associated node mask arrays for each graph:
+        - 3 for the ID nodes: ``'in_train_mask'``, ``'in_valid_mask'``, ``'in_test_mask'``,
+        - and 2 for the OOD nodes: ``'out_valid_mask'``, ``'out_test_mask'``.
+
+    Following `Evaluating Robustness and Uncertainty of Graph Models Under Structural
+    Distributional Shifts <https://arxiv.org/abs/2302.13875v1>`__, this function implements
+    3 particular strategies for inducing distributional shifts in graph — based on
+    **popularity**, **locality** or **density**.
+
+    Parameters
+    ----------
+    dataset : DGLDataset or list of :class:`DGLGraph`
+        The dataset to induce structural distributional shift in.
+    part_ratios : list
+        A list of 5 ratio values for training, ID validation, ID test,
+        OOD validation and OOD test parts. The values must sum to 1.0.
+    property_name : str
+        The node property name to split the dataset by. Must be
+        ``'popularity'``, ``'locality'`` or ``'density'``.
+    ascending : bool, optional
+        Whether to sort nodes in the ascending order of a particular
+        node property, so that more shifted OOD nodes have greater values
+        of the computed property (default: True)
+    random_seed : int, optional
+        Random seed to fix for the initial permutation of nodes. It is
+        used to create a random order for the nodes that have the same
+        property values. (default: None)
+
+    Examples
+    --------
+    >>> dataset = dgl.data.AmazonCoBuyComputerDataset()
+    >>> print('in_valid_mask' in dataset[0].ndata)
+    False
+    >>> part_sizes = [0.3, 0.1, 0.1, 0.3, 0.2]
+    >>> property_name = 'popularity'
+    >>> dgl.data.utils.add_node_property_split(dataset, part_sizes, property_name)
+    >>> print('in_valid_mask' in dataset[0].ndata)
+    True
+    """
+
+    assert property_name in [
+        "popularity",
+        "locality",
+        "density",
+    ], "The name of property has to be 'popularity', 'locality', or 'density'"
+
+    assert (
+        len(part_ratios) == 5
+    ), "The list of part ratios must contain 5 values"
+
+    try:
+        import graph_tool as gt
+        from graph_tool import centrality, clustering, generation
+    except ImportError:
+        warnings.warn(
+            "graph-tool is required to compute the node property values: "
+            "https://graph-tool.skewed.de/"
+        )
+
+    user_direction = 1 if ascending else -1
+
+    for idx in range(len(dataset)):
+        graph_dgl = dataset[idx]
+        num_nodes = graph_dgl.num_nodes()
+
+        edge_list = F.stack(graph_dgl.edges(), dim=0)
+        if F.backend_name == "mxnet":
+            edge_list = edge_list.asnumpy().T
+        else:
+            edge_list = edge_list.numpy().T
+
+        graph_gt = gt.Graph()
+        graph_gt.add_vertex(num_nodes)
+        graph_gt.add_edge_list(edge_list)
+        graph_gt.set_directed(False)
+        generation.remove_parallel_edges(graph_gt)
+
+        if property_name == "popularity":
+            default_direction = -1
+            property_values = (
+                user_direction
+                * default_direction
+                * np.array(centrality.pagerank(graph_gt).get_array())
+            )
+
+        if property_name == "locality":
+            default_direction = -1
+            pagerank_values = np.array(
+                centrality.pagerank(graph_gt).get_array()
+            )
+
+            ohe_mask = np.zeros_like(pagerank_values)
+            ohe_mask[np.argmax(pagerank_values)] = 1.0
+
+            property_gt = graph_gt.new_vertex_property("double")
+            property_gt.a = ohe_mask
+
+            property_values = (
+                user_direction
+                * default_direction
+                * np.array(
+                    centrality.pagerank(graph_gt, pers=property_gt).get_array()
+                )
+            )
+
+        if property_name == "density":
+            default_direction = -1
+            property_values = (
+                user_direction
+                * default_direction
+                * np.array(clustering.local_clustering(graph_gt).get_array())
+            )
+
+        node_masks = mask_nodes_by_property(
+            property_values, part_ratios, random_seed
+        )
+
+        for mask_name, node_mask in node_masks.items():
+            graph_dgl.ndata[mask_name] = node_mask