-
Notifications
You must be signed in to change notification settings - Fork 3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Utils] Edge and LINKX homophily measure #5382
Changes from 24 commits
3e66773
d1785ef
e613ddf
4aac2dd
d7d2144
618c963
cfdd975
8396fc5
50ec561
20ac1a9
7bd5e44
2e8104b
331ad0b
35e4d87
13cc169
a4a2f46
1cdf545
d359ba5
c022083
a97f245
58a7153
2058918
9f179e5
83b6677
abe7381
c1e54e3
e35d875
558a07e
8714d4a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,24 @@ | ||
"""Utils for tacking graph homophily and heterophily""" | ||
from . import backend as F, function as fn | ||
from . import function as fn | ||
from .convert import graph as create_graph | ||
|
||
__all__ = ["node_homophily"] | ||
try: | ||
import torch | ||
except ImportError: | ||
pass | ||
|
||
__all__ = ["node_homophily", "edge_homophily", "linkx_homophily"] | ||
|
||
|
||
def get_long_edges(graph): | ||
"""Internal function for getting the edges of a graph as long tensors.""" | ||
src, dst = graph.edges() | ||
return src.long(), dst.long() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If there are only two lines, consider just embedding them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm fine either way. Maybe you two can start a fight. :) @frozenbugs |
||
|
||
|
||
def node_homophily(graph, y): | ||
r"""Homophily measure from `Geom-GCN: Geometric Graph Convolutional Networks | ||
<https://arxiv.org/abs/2002.05287>`__ | ||
r"""Homophily measure from `Geom-GCN: Geometric Graph Convolutional | ||
Networks <https://arxiv.org/abs/2002.05287>`__ | ||
|
||
We follow the practice of a later paper `Large Scale Learning on | ||
Non-Homophilous Graphs: New Benchmarks and Strong Simple Methods | ||
|
@@ -15,8 +27,8 @@ def node_homophily(graph, y): | |
Mathematically it is defined as follows: | ||
|
||
.. math:: | ||
\frac{1}{|\mathcal{V}|} \sum_{v \in \mathcal{V}} \frac{ | \{ (u,v) : u | ||
\in \mathcal{N}(v) \wedge y_v = y_u \} | } { |\mathcal{N}(v)| } | ||
\frac{1}{|\mathcal{V}|} \sum_{v \in \mathcal{V}} \frac{ | \{u | ||
\in \mathcal{N}(v): y_v = y_u \} | } { |\mathcal{N}(v)| }, | ||
|
||
where :math:`\mathcal{V}` is the set of nodes, :math:`\mathcal{N}(v)` is | ||
the predecessors of node :math:`v`, and :math:`y_v` is the class of node | ||
|
@@ -25,14 +37,14 @@ def node_homophily(graph, y): | |
Parameters | ||
---------- | ||
graph : DGLGraph | ||
The graph | ||
The graph. | ||
y : Tensor | ||
The node labels, which is a tensor of shape (|V|) | ||
The node labels, which is a tensor of shape (|V|). | ||
|
||
Returns | ||
------- | ||
float | ||
The node homophily value | ||
The node homophily value. | ||
|
||
Examples | ||
-------- | ||
|
@@ -45,13 +57,121 @@ def node_homophily(graph, y): | |
0.6000000238418579 | ||
""" | ||
with graph.local_scope(): | ||
src, dst = graph.edges() | ||
# Handle the case where graph is of dtype int32. | ||
src = F.astype(src, F.int64) | ||
dst = F.astype(dst, F.int64) | ||
src, dst = get_long_edges(graph) | ||
# Compute y_v = y_u for all edges. | ||
graph.edata["same_class"] = F.astype(y[src] == y[dst], F.float32) | ||
graph.edata["same_class"] = (y[src] == y[dst]).float() | ||
graph.update_all( | ||
fn.copy_e("same_class", "m"), fn.mean("m", "node_value") | ||
fn.copy_e("same_class", "m"), fn.mean("m", "same_class_deg") | ||
) | ||
return graph.ndata["node_value"].mean().item() | ||
return graph.ndata["same_class_deg"].mean(dim=0).item() | ||
|
||
|
||
def edge_homophily(graph, y): | ||
r"""Homophily measure from `Beyond Homophily in Graph Neural Networks: | ||
Current Limitations and Effective Designs | ||
<https://arxiv.org/abs/2006.11468>`__ | ||
|
||
Mathematically it is defined as follows: | ||
|
||
.. math:: | ||
\frac{| \{ (u,v) : (u,v) \in \mathcal{E} \wedge y_u = y_v \} | } | ||
{|\mathcal{E}|}, | ||
|
||
where :math:`\mathcal{E}` is the set of edges, and :math:`y_u` is the class | ||
of node :math:`u`. | ||
|
||
Parameters | ||
---------- | ||
graph : DGLGraph | ||
The graph. | ||
y : Tensor | ||
The node labels, which is a tensor of shape (|V|). | ||
|
||
Returns | ||
------- | ||
float | ||
The edge homophily ratio value. | ||
|
||
Examples | ||
-------- | ||
>>> import dgl | ||
>>> import torch | ||
|
||
>>> graph = dgl.graph(([1, 2, 0, 4], [0, 1, 2, 3])) | ||
>>> y = torch.tensor([0, 0, 0, 0, 1]) | ||
>>> dgl.edge_homophily(graph, y) | ||
0.75 | ||
""" | ||
with graph.local_scope(): | ||
# Handle the case where graph is of dtype int32. | ||
src, dst = get_long_edges(graph) | ||
# Compute y_v = y_u for all edges. | ||
edge_indicator = (y[src] == y[dst]).float() | ||
return edge_indicator.mean(dim=0).item() | ||
|
||
|
||
def linkx_homophily(graph, y): | ||
r"""Homophily measure from `Large Scale Learning on Non-Homophilous Graphs: | ||
New Benchmarks and Strong Simple Methods | ||
<https://arxiv.org/abs/2110.14446>`__ | ||
|
||
Mathematically it is defined as follows: | ||
|
||
.. math:: | ||
\frac{1}{C-1} \sum_{k=1}^{C} \max \left(0, \frac{\sum_{v\in C_k}|\{u\in | ||
\mathcal{N}(v): y_v = y_u \}|}{\sum_{v\in C_k}|\mathcal{N}(v)|} - | ||
\frac{|\mathcal{C}_k|}{|\mathcal{V}|} \right), | ||
|
||
where :math:`C` is the number of node classes, :math:`C_k` is the set of | ||
nodes that belong to class k, :math:`\mathcal{N}(v)` are the predecessors | ||
of node :math:`v`, :math:`y_v` is the class of node :math:`v`, and | ||
:math:`\mathcal{V}` is the set of nodes. | ||
|
||
Parameters | ||
---------- | ||
graph : DGLGraph | ||
The graph. | ||
y : Tensor | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. torch.Tensor and others. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
The node labels, which is a tensor of shape (|V|). | ||
|
||
Returns | ||
------- | ||
float | ||
The homophily value. | ||
|
||
Examples | ||
-------- | ||
>>> import dgl | ||
>>> import torch | ||
|
||
>>> graph = dgl.graph(([0, 1, 2, 3], [1, 2, 0, 4])) | ||
>>> y = torch.tensor([0, 0, 0, 0, 1]) | ||
>>> dgl.linkx_homophily(graph, y) | ||
0.19999998807907104 | ||
""" | ||
with graph.local_scope(): | ||
# Compute |{u\in N(v): y_v = y_u}| for each node v. | ||
# Handle the case where graph is of dtype int32. | ||
src, dst = get_long_edges(graph) | ||
# Compute y_v = y_u for all edges. | ||
graph.edata["same_class"] = (y[src] == y[dst]).float() | ||
graph.update_all( | ||
fn.copy_e("same_class", "m"), fn.sum("m", "same_class_deg") | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, now I'm pushing this further. Will using the sparse API make the code more readable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How so? You convert the graph to a sparse matrix and call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. with graph.local_scope():
# Handle the case where graph is of dtype int32.
src, dst = get_long_edges(graph)
# Compute y_v = y_u for all edges.
graph.edata["same_class"] = (y[src] == y[dst]).float()
graph.update_all(
fn.copy_e("same_class", "m"), fn.mean("m", "same_class_deg")
)
return graph.ndata["same_class_deg"].mean(dim=0).item() v.s. A = graph.adj
same_class = (y[A.row] == y[A.col]).float()
same_class_avg = dglsp.val_like(A, same_class).smean(dim=1)
return same_class_avg.mean(dim=0).item() There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. v.s. in the new message passing API style src, dst = get_long_edges(graph)
same_class = (y[src] == y[dst]).float()
same_class_avg = dgl.mpops.copy_e_mean(g, same_class)
return same_class_avg.mean(dim=0).item() There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Still, it's quite subtle. I'm fine either way. The question is more about when we encourage the use of message passing APIs versus sparse APIs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My opinion is to go with the math formulation: If the model is described in node-wise/edge-wise computation then message passing is the way to go; otherwise, use sparse. In this case, the definition is in node/edge so message passing is more suitable. You can see that although the sparse APIs are shorter, they don't align well with the definition, e.g., the use of |
||
|
||
deg = graph.in_degrees().float() | ||
num_nodes = graph.num_nodes() | ||
value = 0 | ||
mufeili marked this conversation as resolved.
Show resolved
Hide resolved
|
||
num_classes = y.max(dim=0).values.item() + 1 | ||
|
||
for k in range(num_classes): | ||
# Get the nodes that belong to class k. | ||
class_mask = y == k | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: class_mask = (y == k) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I initially did what you suggested, and then the lint check failed. |
||
same_class_deg_k = graph.ndata["same_class_deg"][class_mask].sum() | ||
deg_k = deg[class_mask].sum() | ||
# Value for a null model. | ||
null_value = class_mask.sum() / num_nodes | ||
value += max(0, same_class_deg_k / deg_k - null_value) | ||
mufeili marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
return value.item() / (num_classes - 1) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
import math | ||
import unittest | ||
|
||
import backend as F | ||
|
@@ -6,7 +7,9 @@ | |
from test_utils import parametrize_idtype | ||
|
||
|
||
@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="Skip TF") | ||
@unittest.skipIf( | ||
dgl.backend.backend_name != "pytorch", reason="Only support PyTorch for now" | ||
) | ||
@parametrize_idtype | ||
def test_node_homophily(idtype): | ||
# IfChangeThenChange: python/dgl/homophily.py | ||
|
@@ -16,4 +19,32 @@ def test_node_homophily(idtype): | |
([1, 2, 0, 4], [0, 1, 2, 3]), idtype=idtype, device=device | ||
) | ||
y = F.tensor([0, 0, 0, 0, 1]) | ||
assert dgl.node_homophily(graph, y) == 0.6000000238418579 | ||
assert math.isclose(dgl.node_homophily(graph, y), 0.6000000238418579) | ||
mufeili marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
@unittest.skipIf( | ||
dgl.backend.backend_name != "pytorch", reason="Only support PyTorch for now" | ||
) | ||
@parametrize_idtype | ||
def test_edge_homophily(idtype): | ||
# IfChangeThenChange: python/dgl/homophily.py | ||
# Update the docstring example. | ||
device = F.ctx() | ||
graph = dgl.graph( | ||
([1, 2, 0, 4], [0, 1, 2, 3]), idtype=idtype, device=device | ||
) | ||
y = F.tensor([0, 0, 0, 0, 1]) | ||
assert math.isclose(dgl.edge_homophily(graph, y), 0.75) | ||
|
||
|
||
@unittest.skipIf( | ||
dgl.backend.backend_name != "pytorch", reason="Only support PyTorch for now" | ||
) | ||
@parametrize_idtype | ||
def test_linkx_homophily(idtype): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there any corner case you need to handle? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the current cases are sufficient. |
||
# IfChangeThenChange: python/dgl/homophily.py | ||
# Update the docstring example. | ||
device = F.ctx() | ||
graph = dgl.graph(([0, 1, 2, 3], [1, 2, 0, 4]), device=device) | ||
y = F.tensor([0, 0, 0, 0, 1]) | ||
assert math.isclose(dgl.linkx_homophily(graph, y), 0.19999998807907104) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should internal helper function start with _?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm fine either way. For this file, it should be clear that only the functions included in
__all__
are external.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good.
nit: Maybe rename to get_edges_long, more natural.