Skip to content

Commit

Permalink
Merge branch 'master' into fix_example
Browse files Browse the repository at this point in the history
  • Loading branch information
Rhett-Ying committed Dec 22, 2023
2 parents 08bf92d + 7094ff4 commit 3be763e
Show file tree
Hide file tree
Showing 9 changed files with 157 additions and 649 deletions.
629 changes: 9 additions & 620 deletions docs/source/stochastic_training/ondisk-dataset-specification.rst

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions graphbolt/include/graphbolt/shared_memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,21 @@
#define GRAPHBOLT_SHARED_MEMORY_H_

#ifdef _WIN32
// Add the macro to avoid MIN/MAX conflict.
// If NOMINMAX is not already defined, define it here and also define
// GRAPHBOLT_WINDOWS_NOMINMAX_ as a marker that this header introduced it.
#ifndef NOMINMAX
#define NOMINMAX
#define GRAPHBOLT_WINDOWS_NOMINMAX_
#endif // NOMINMAX
#include <windows.h>
#endif // _WIN32

#include <memory>
#include <string>

// Undefine NOMINMAX again only when the marker shows it was defined above,
// so a NOMINMAX that was already defined before this header is left alone.
// NOTE(review): the marker macro itself is not undefined in the lines shown
// here -- confirm whether that is intended.
#ifdef GRAPHBOLT_WINDOWS_NOMINMAX_
#undef NOMINMAX
#endif // GRAPHBOLT_WINDOWS_NOMINMAX_

namespace graphbolt {
namespace sampling {

Expand Down
27 changes: 26 additions & 1 deletion python/dgl/graphbolt/impl/ondisk_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ...data.utils import download, extract_archive
from ..base import etype_str_to_tuple
from ..dataset import Dataset, Task
from ..internal import copy_or_convert_data, read_data
from ..internal import copy_or_convert_data, get_attributes, read_data
from ..itemset import ItemSet, ItemSetDict
from ..sampling_graph import SamplingGraph
from .fused_csc_sampling_graph import from_dglgraph, FusedCSCSamplingGraph
Expand Down Expand Up @@ -270,6 +270,9 @@ def test_set(self) -> Union[ItemSet, ItemSetDict]:
"""Return the test set."""
return self._test_set

def __repr__(self) -> str:
    """Return the string produced by ``_ondisk_task_str`` for this object."""
    return _ondisk_task_str(self)


class OnDiskDataset(Dataset):
"""An on-disk dataset which reads graph topology, feature data and
Expand Down Expand Up @@ -609,3 +612,25 @@ def __init__(self, name: str, root: str = "datasets") -> OnDiskDataset:
extract_archive(zip_file_path, root, overwrite=True)
os.remove(zip_file_path)
super().__init__(dataset_dir)


def _ondisk_task_str(task: OnDiskTask) -> str:
    """Build the multi-line string representation of an ``OnDiskTask``.

    Every name returned by ``get_attributes(task)`` that does not start with
    an underscore is rendered as ``name=value,`` on its own line. Names are
    visited in the reverse of the order in which ``get_attributes`` returns
    them. When ``str(value)`` spans several lines, its continuation lines are
    padded so that they start in the same column as the first character of
    the value.

    Parameters
    ----------
    task : OnDiskTask
        The task to describe.

    Returns
    -------
    str
        The formatted representation. ``"OnDiskTask()"`` is returned when no
        attribute is rendered.
    """
    prefix = "OnDiskTask("
    indent_len = len(prefix)

    def _add_indent(_str, indent):
        # The first line continues the current output line, so only the
        # remaining lines receive the padding.
        first, *rest = _str.split("\n")
        return "\n".join([first] + [" " * indent + line for line in rest])

    entries = []
    # ``reversed`` leaves the list returned by ``get_attributes`` untouched.
    for name in reversed(get_attributes(task)):
        if name.startswith("_"):
            continue
        # The value starts right after ``prefix`` (or the equivalent padding),
        # the attribute name and the ``=`` sign.
        value = _add_indent(
            str(getattr(task, name)), indent_len + len(name) + 1
        )
        entries.append(f"{name}={value},\n")

    # Joining the entries with the padding (instead of appending the padding
    # after every entry and slicing it off at the end) keeps the prefix intact
    # when there are no entries at all.
    return prefix + (" " * indent_len).join(entries) + ")"
19 changes: 16 additions & 3 deletions python/dgl/graphbolt/sampled_subgraph.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Graphbolt sampled subgraph."""

# pylint: disable= invalid-name
from typing import Dict, Tuple, Union

Expand Down Expand Up @@ -181,6 +182,10 @@ def exclude_edges(
index = {}
is_cscformat = 0
for etype, pair in self.node_pairs.items():
if etype not in edges:
# No edges need to be excluded.
index[etype] = None
continue
src_type, _, dst_type = etype_str_to_tuple(etype)
original_row_node_ids = (
None
Expand All @@ -207,7 +212,7 @@ def exclude_edges(
)
index[etype] = _exclude_homo_edges(
reverse_edges,
edges.get(etype),
edges[etype],
assume_num_node_within_int32,
)
if is_cscformat:
Expand Down Expand Up @@ -266,8 +271,12 @@ def _relabel_two_arrays(lhs_array, rhs_array):
return mapping[: lhs_array.numel()], mapping[lhs_array.numel() :]


def _exclude_homo_edges(edges, edges_to_exclude, assume_num_node_within_int32):
"""Return the indices of edges that are not in edges_to_exclude."""
def _exclude_homo_edges(
edges: Tuple[torch.Tensor, torch.Tensor],
edges_to_exclude: Tuple[torch.Tensor, torch.Tensor],
assume_num_node_within_int32: bool,
):
"""Return the indices of edges to be included."""
if assume_num_node_within_int32:
val = edges[0] << 32 | edges[1]
val_to_exclude = edges_to_exclude[0] << 32 | edges_to_exclude[1]
Expand All @@ -286,6 +295,8 @@ def _slice_subgraph_node_pairs(subgraph: SampledSubgraph, index: torch.Tensor):
def _index_select(obj, index):
if obj is None:
return None
if index is None:
return obj
if isinstance(obj, torch.Tensor):
return obj[index]
if isinstance(obj, tuple):
Expand All @@ -312,6 +323,8 @@ def _slice_subgraph(subgraph: SampledSubgraph, index: torch.Tensor):
def _index_select(obj, index):
if obj is None:
return None
if index is None:
return obj
if isinstance(obj, CSCFormatBase):
new_indices = obj.indices[index]
new_indptr = torch.searchsorted(index, obj.indptr)
Expand Down
22 changes: 14 additions & 8 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,19 @@ def get_lib_file_path(lib_name, backend=""):
data_files.append(get_lib_file_path("graphbolt"))
setup_kwargs = {"include_package_data": True, "data_files": data_files}

# Configure dependencies.
# "torchdata>=0.5.0" is dropped from the list when the DGLBACKEND environment
# variable is set to anything other than "pytorch"; when DGLBACKEND is unset
# the list is left unchanged.
install_requires = [
    "numpy>=1.14.0",
    "scipy>=1.1.0",
    "networkx>=2.1",
    "requests>=2.19.0",
    "tqdm",
    "psutil>=5.8.0",
    "torchdata>=0.5.0",
]
if os.environ.get("DGLBACKEND", "pytorch") != "pytorch":
    install_requires.remove("torchdata>=0.5.0")

setup(
name="dgl" + os.getenv("DGL_PACKAGE_SUFFIX", ""),
version=VERSION,
Expand All @@ -219,14 +232,7 @@ def get_lib_file_path(lib_name, backend=""):
maintainer="DGL Team",
maintainer_email="wmjlyjemaine@gmail.com",
packages=find_packages(),
install_requires=[
"numpy>=1.14.0",
"scipy>=1.1.0",
"networkx>=2.1",
"requests>=2.19.0",
"tqdm",
"psutil>=5.8.0",
],
install_requires=install_requires,
url="https://github.com/dmlc/dgl",
distclass=BinaryDistribution,
ext_modules=config_cython(),
Expand Down
2 changes: 1 addition & 1 deletion script/dgl_dev.yml.template
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ dependencies:
- scikit-learn
- scipy
- torch==__TORCH_VERSION__
- torchdata
- torchdata>=0.5.0
- torcheval
- torchmetrics
- tqdm
Expand Down
60 changes: 60 additions & 0 deletions tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2163,3 +2163,63 @@ def test_OnDiskDataset_heterogeneous(include_original_edge_id):
graph = None
tasks = None
dataset = None


def test_OnDiskTask_repr_homogeneous():
    """Check the string form of an ``OnDiskTask`` built from one ``ItemSet``."""
    # An item set made of a pair of tensors named "seed_nodes" and "labels".
    item_set = gb.ItemSet(
        (torch.arange(0, 5), torch.arange(5, 10)),
        names=("seed_nodes", "labels"),
    )
    metadata = {"name": "node_classification"}
    # The same item set is passed for all three set arguments.
    task = gb.OnDiskTask(metadata, item_set, item_set, item_set)
    # The comparison below is exact, so whitespace inside this literal is
    # significant.
    expected_str = str(
        """OnDiskTask(validation_set=ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),
names=('seed_nodes', 'labels'),
),
train_set=ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),
names=('seed_nodes', 'labels'),
),
test_set=ItemSet(items=(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9])),
names=('seed_nodes', 'labels'),
),
metadata={'name': 'node_classification'},
)"""
    )
    # NOTE(review): ``print(task)`` is evaluated only when the assertion
    # fails; it prints the task and returns None, so the assertion message
    # itself is None.
    assert str(task) == expected_str, print(task)


def test_OnDiskTask_repr_heterogeneous():
    """Check the string form of an ``OnDiskTask`` built from an ``ItemSetDict``."""
    # One "seed_nodes" item set per key, for the keys "user" and "item".
    item_set = gb.ItemSetDict(
        {
            "user": gb.ItemSet(torch.arange(0, 5), names="seed_nodes"),
            "item": gb.ItemSet(torch.arange(5, 10), names="seed_nodes"),
        }
    )
    metadata = {"name": "node_classification"}
    # The same item set dict is passed for all three set arguments.
    task = gb.OnDiskTask(metadata, item_set, item_set, item_set)
    # The comparison below is exact, so whitespace inside this literal is
    # significant.
    expected_str = str(
        """OnDiskTask(validation_set=ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]),),
names=('seed_nodes',),
), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]),),
names=('seed_nodes',),
)},
names=('seed_nodes',),
),
train_set=ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]),),
names=('seed_nodes',),
), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]),),
names=('seed_nodes',),
)},
names=('seed_nodes',),
),
test_set=ItemSetDict(items={'user': ItemSet(items=(tensor([0, 1, 2, 3, 4]),),
names=('seed_nodes',),
), 'item': ItemSet(items=(tensor([5, 6, 7, 8, 9]),),
names=('seed_nodes',),
)},
names=('seed_nodes',),
),
metadata={'name': 'node_classification'},
)"""
    )
    # NOTE(review): ``print(task)`` is evaluated only when the assertion
    # fails; it prints the task and returns None, so the assertion message
    # itself is None.
    assert str(task) == expected_str, print(task)
34 changes: 20 additions & 14 deletions tests/python/pytorch/graphbolt/test_subgraph_sampler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from functools import partial

import dgl
import dgl.graphbolt as gb
import pytest
Expand Down Expand Up @@ -88,25 +90,27 @@ def to_link_batch(data):
def test_SubgraphSampler_Link(labor):
graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True)
itemset = gb.ItemSet(torch.arange(0, 20).reshape(-1, 2), names="node_pairs")
item_sampler = gb.ItemSampler(itemset, batch_size=2)
datapipe = gb.ItemSampler(itemset, batch_size=2)
num_layer = 2
fanouts = [torch.LongTensor([2]) for _ in range(num_layer)]
Sampler = gb.LayerNeighborSampler if labor else gb.NeighborSampler
neighbor_dp = Sampler(item_sampler, graph, fanouts)
assert len(list(neighbor_dp)) == 5
datapipe = Sampler(datapipe, graph, fanouts)
datapipe = datapipe.transform(partial(gb.exclude_seed_edges))
assert len(list(datapipe)) == 5


@pytest.mark.parametrize("labor", [False, True])
def test_SubgraphSampler_Link_With_Negative(labor):
graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True)
itemset = gb.ItemSet(torch.arange(0, 20).reshape(-1, 2), names="node_pairs")
item_sampler = gb.ItemSampler(itemset, batch_size=2)
datapipe = gb.ItemSampler(itemset, batch_size=2)
num_layer = 2
fanouts = [torch.LongTensor([2]) for _ in range(num_layer)]
negative_dp = gb.UniformNegativeSampler(item_sampler, graph, 1)
datapipe = gb.UniformNegativeSampler(datapipe, graph, 1)
Sampler = gb.LayerNeighborSampler if labor else gb.NeighborSampler
neighbor_dp = Sampler(negative_dp, graph, fanouts)
assert len(list(neighbor_dp)) == 5
datapipe = Sampler(datapipe, graph, fanouts)
datapipe = datapipe.transform(partial(gb.exclude_seed_edges))
assert len(list(datapipe)) == 5


def get_hetero_graph():
Expand Down Expand Up @@ -163,12 +167,13 @@ def test_SubgraphSampler_Link_Hetero(labor):
}
)

item_sampler = gb.ItemSampler(itemset, batch_size=2)
datapipe = gb.ItemSampler(itemset, batch_size=2)
num_layer = 2
fanouts = [torch.LongTensor([2]) for _ in range(num_layer)]
Sampler = gb.LayerNeighborSampler if labor else gb.NeighborSampler
neighbor_dp = Sampler(item_sampler, graph, fanouts)
assert len(list(neighbor_dp)) == 5
datapipe = Sampler(datapipe, graph, fanouts)
datapipe = datapipe.transform(partial(gb.exclude_seed_edges))
assert len(list(datapipe)) == 5


@pytest.mark.parametrize("labor", [False, True])
Expand All @@ -187,13 +192,14 @@ def test_SubgraphSampler_Link_Hetero_With_Negative(labor):
}
)

item_sampler = gb.ItemSampler(itemset, batch_size=2)
datapipe = gb.ItemSampler(itemset, batch_size=2)
num_layer = 2
fanouts = [torch.LongTensor([2]) for _ in range(num_layer)]
negative_dp = gb.UniformNegativeSampler(item_sampler, graph, 1)
datapipe = gb.UniformNegativeSampler(datapipe, graph, 1)
Sampler = gb.LayerNeighborSampler if labor else gb.NeighborSampler
neighbor_dp = Sampler(negative_dp, graph, fanouts)
assert len(list(neighbor_dp)) == 5
datapipe = Sampler(datapipe, graph, fanouts)
datapipe = datapipe.transform(partial(gb.exclude_seed_edges))
assert len(list(datapipe)) == 5


@pytest.mark.parametrize("labor", [False, True])
Expand Down
4 changes: 2 additions & 2 deletions tests/scripts/build_dgl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ else
rm -rf build *.egg-info dist
pip uninstall -y dgl
# test install
python3 setup.py install
DGLBACKEND=${backend} python3 setup.py install
# test inplace build (for cython)
python3 setup.py build_ext --inplace
DGLBACKEND=${backend} python3 setup.py build_ext --inplace
done
fi
popd

0 comments on commit 3be763e

Please sign in to comment.