From 6219fe600523f17313e8842fc38ffd72a7d2adc8 Mon Sep 17 00:00:00 2001 From: yzh119 Date: Tue, 6 Aug 2019 16:13:02 +0800 Subject: [PATCH 01/32] upd --- docs/source/api/python/nn.mxnet.rst | 6 +-- docs/source/api/python/nn.pytorch.rst | 6 +-- python/dgl/nn/mxnet/softmax.py | 76 +++++++++++++++++++++------ python/dgl/nn/pytorch/softmax.py | 76 +++++++++++++++++++++------ 4 files changed, 124 insertions(+), 40 deletions(-) diff --git a/docs/source/api/python/nn.mxnet.rst b/docs/source/api/python/nn.mxnet.rst index 07de782c114a..162be4f61ba8 100644 --- a/docs/source/api/python/nn.mxnet.rst +++ b/docs/source/api/python/nn.mxnet.rst @@ -17,12 +17,10 @@ dgl.nn.mxnet.glob .. automodule:: dgl.nn.mxnet.glob :members: + :show-inheritance: dgl.nn.mxnet.softmax -------------------- .. automodule:: dgl.nn.mxnet.softmax - -.. autoclass:: dgl.nn.mxnet.softmax.EdgeSoftmax - :members: forward - :show-inheritance: + :members: edge_softmax diff --git a/docs/source/api/python/nn.pytorch.rst b/docs/source/api/python/nn.pytorch.rst index a6d4ba18ea09..24ba02d41530 100644 --- a/docs/source/api/python/nn.pytorch.rst +++ b/docs/source/api/python/nn.pytorch.rst @@ -14,7 +14,6 @@ dgl.nn.pytorch.conv dgl.nn.pytorch.glob ------------------- - .. automodule:: dgl.nn.pytorch.glob .. autoclass:: dgl.nn.pytorch.glob.SumPooling @@ -53,7 +52,4 @@ dgl.nn.pytorch.softmax ---------------------- .. automodule:: dgl.nn.pytorch.softmax - -.. autoclass:: dgl.nn.pytorch.softmax.EdgeSoftmax - :members: forward - :show-inheritance: + :members: edge_softmax diff --git a/python/dgl/nn/mxnet/softmax.py b/python/dgl/nn/mxnet/softmax.py index d5754b977ae5..aff19df0163d 100644 --- a/python/dgl/nn/mxnet/softmax.py +++ b/python/dgl/nn/mxnet/softmax.py @@ -32,12 +32,15 @@ def forward(self, score): """Forward function. Pseudo-code: - score = dgl.EData(g, score) - score_max = score.dst_max() # of type dgl.NData - score = score - score_max # edge_sub_dst, ret dgl.EData - score_sum = score.dst_sum() # of type dgl.NData - out = score / score_sum # edge_div_dst, ret dgl.EData - return out.data + + .. code:: python + + score = dgl.EData(g, score) + score_max = score.dst_max() # of type dgl.NData + score = score - score_max # edge_sub_dst, ret dgl.EData + score_sum = score.dst_sum() # of type dgl.NData + out = score / score_sum # edge_div_dst, ret dgl.EData + return out.data """ g = self.g.local_var() g.edata['s'] = score @@ -54,12 +57,15 @@ def backward(self, grad_out): """Backward function. Pseudo-code: - g, out = ctx.backward_cache - grad_out = dgl.EData(g, grad_out) - out = dgl.EData(g, out) - sds = out * grad_out # type dgl.EData - sds_sum = sds.dst_sum() # type dgl.NData - grad_score = sds - sds * sds_sum # multiple expressions + + .. code:: python + + g, out = ctx.backward_cache + grad_out = dgl.EData(g, grad_out) + out = dgl.EData(g, out) + sds = out * grad_out # type dgl.EData + sds_sum = sds.dst_sum() # type dgl.NData + grad_score = sds - sds * sds_sum # multiple expressions """ g = self.g.local_var() out, = self.saved_tensors # pylint: disable=access-member-before-definition, unpacking-non-sequence @@ -75,6 +81,19 @@ def backward(self, grad_out): def edge_softmax(graph, logits): r"""Compute edge softmax. + For a node :math:`i`, edge softmax is an operation of computing + + .. math:: + a_{ij} = \frac{\exp(z_{ij})}{\sum_{j\in\mathcal{N}(i)}\exp(z_{ij})} + + where :math:`z_{ij}` is a signal of edge :math:`j\rightarrow i`, also + called logits in the context of softmax. 
:math:`\mathcal{N}(i)` is + the set of nodes that have an edge to :math:`i`. + + An example of using edge softmax is in + `Graph Attention Network `__ where + the attention weights are computed with such an edge softmax operation. + Parameters ---------- graph : DGLGraph @@ -90,13 +109,40 @@ def edge_softmax(graph, logits): Notes ----- * Input shape: :math:`(N, *, 1)` where * means any number of - additional dimensions, :math:`N` is the number of edges. + additional dimensions, :math:`N` is the number of edges. * Return shape: :math:`(N, *, 1)` Examples -------- - >>> import dgl.function as fn - >>> attention = EdgeSoftmax(logits, graph) + >>> from dgl.nn.mxnet.softmax import edge_softmax + >>> import dgl + >>> from mxnet import nd + + Create a :code:`DGLGraph` object and initialize its edge features. + + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.add_edges([0, 0, 0, 1, 1, 2], [0, 1, 2, 1, 2, 2]) + >>> edata = nd.ones((6, 1)) + >>> edata + [[1.] + [1.] + [1.] + [1.] + [1.] + [1.]] + + + Apply edge softmax on g: + + >>> edge_softmax(g, edata) + [[1. ] + [0.5 ] + [0.33333334] + [0.5 ] + [0.33333334] + [0.33333334]] + """ softmax_op = EdgeSoftmax(graph) return softmax_op(logits) diff --git a/python/dgl/nn/pytorch/softmax.py b/python/dgl/nn/pytorch/softmax.py index bb770ff9435b..9eda68ba7577 100644 --- a/python/dgl/nn/pytorch/softmax.py +++ b/python/dgl/nn/pytorch/softmax.py @@ -29,12 +29,15 @@ def forward(ctx, g, score): """Forward function. Pseudo-code: - score = dgl.EData(g, score) - score_max = score.dst_max() # of type dgl.NData - score = score - score_max # edge_sub_dst, ret dgl.EData - score_sum = score.dst_sum() # of type dgl.NData - out = score / score_sum # edge_div_dst, ret dgl.EData - return out.data + + .. code:: python + + score = dgl.EData(g, score) + score_max = score.dst_max() # of type dgl.NData + score = score - score_max # edge_sub_dst, ret dgl.EData + score_sum = score.dst_sum() # of type dgl.NData + out = score / score_sum # edge_div_dst, ret dgl.EData + return out.data """ # remember to save the graph to backward cache before making it # a local variable @@ -55,13 +58,16 @@ def backward(ctx, grad_out): """Backward function. Pseudo-code: - g, out = ctx.backward_cache - grad_out = dgl.EData(g, grad_out) - out = dgl.EData(g, out) - sds = out * grad_out # type dgl.EData - sds_sum = sds.dst_sum() # type dgl.NData - grad_score = sds - sds * sds_sum # multiple expressions - return grad_score.data + + .. code:: python + + g, out = ctx.backward_cache + grad_out = dgl.EData(g, grad_out) + out = dgl.EData(g, out) + sds = out * grad_out # type dgl.EData + sds_sum = sds.dst_sum() # type dgl.NData + grad_score = sds - sds * sds_sum # multiple expressions + return grad_score.data """ g = ctx.backward_cache g = g.local_var() @@ -79,6 +85,19 @@ def backward(ctx, grad_out): def edge_softmax(graph, logits): r"""Compute edge softmax. + For a node :math:`i`, edge softmax is an operation of computing + + .. math:: + a_{ij} = \frac{\exp(z_{ij})}{\sum_{j\in\mathcal{N}(i)}\exp(z_{ij})} + + where :math:`z_{ij}` is a signal of edge :math:`j\rightarrow i`, also + called logits in the context of softmax. :math:`\mathcal{N}(i)` is + the set of nodes that have an edge to :math:`i`. + + An example of using edge softmax is in + `Graph Attention Network `__ where + the attention weights are computed with such an edge softmax operation. 
+ Parameters ---------- graph : DGLGraph @@ -94,12 +113,37 @@ def edge_softmax(graph, logits): Notes ----- * Input shape: :math:`(N, *, 1)` where * means any number of - additional dimensions, :math:`N` is the number of edges. + additional dimensions, :math:`N` is the number of edges. * Return shape: :math:`(N, *, 1)` Examples -------- - >>> import dgl.function as fn - >>> attention = EdgeSoftmax(logits, graph) + >>> from dgl.nn.pytorch.softmax import edge_softmax + >>> import dgl + >>> import torch as th + + Create a :code:`DGLGraph` object and initialize its edge features. + + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.add_edges([0, 0, 0, 1, 1, 2], [0, 1, 2, 1, 2, 2]) + >>> edata = th.ones(6, 1).float() + >>> edata + tensor([[1.], + [1.], + [1.], + [1.], + [1.], + [1.]]) + + Apply edge softmax on g: + + >>> edge_softmax(g, edata) + tensor([[1.0000], + [0.5000], + [0.3333], + [0.5000], + [0.3333], + [0.3333]]) """ return EdgeSoftmax.apply(graph, logits) From 66541cc6034f5e985a7707bcca32d737559e2b6f Mon Sep 17 00:00:00 2001 From: yzh119 Date: Wed, 7 Aug 2019 16:41:52 +0800 Subject: [PATCH 02/32] fig edgebatch edges --- python/dgl/udf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dgl/udf.py b/python/dgl/udf.py index 4a895140f644..e4123a87dba1 100644 --- a/python/dgl/udf.py +++ b/python/dgl/udf.py @@ -76,8 +76,8 @@ def edges(self): in the batch. """ if is_all(self._edges[2]): - self._edges[2] = utils.toindex(F.arange( - 0, self._g.number_of_edges())) + self._edges = self._edges[:2] + (utils.toindex(F.arange( + 0, self._g.number_of_nodes())),) u, v, eid = self._edges return (u.tousertensor(), v.tousertensor(), eid.tousertensor()) From 8258153723e41fd9c2ab72e2a0035882dfe7b48b Mon Sep 17 00:00:00 2001 From: yzh119 Date: Wed, 7 Aug 2019 17:58:08 +0800 Subject: [PATCH 03/32] add test --- python/dgl/graph.py | 2 +- python/dgl/udf.py | 2 +- tests/compute/test_udf.py | 82 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 tests/compute/test_udf.py diff --git a/python/dgl/graph.py b/python/dgl/graph.py index c0f20df11d88..55c75a6922f0 100644 --- a/python/dgl/graph.py +++ b/python/dgl/graph.py @@ -3258,7 +3258,7 @@ def filter_edges(self, predicate, edges=ALL): """ if is_all(edges): eid = ALL - u, v, _ = self._graph.edges() + u, v, _ = self._graph.edges('eid') elif isinstance(edges, tuple): u, v = edges u = utils.toindex(u) diff --git a/python/dgl/udf.py b/python/dgl/udf.py index e4123a87dba1..c121a7c4d321 100644 --- a/python/dgl/udf.py +++ b/python/dgl/udf.py @@ -77,7 +77,7 @@ def edges(self): """ if is_all(self._edges[2]): self._edges = self._edges[:2] + (utils.toindex(F.arange( - 0, self._g.number_of_nodes())),) + 0, self._g.number_of_edges())),) u, v, eid = self._edges return (u.tousertensor(), v.tousertensor(), eid.tousertensor()) diff --git a/tests/compute/test_udf.py b/tests/compute/test_udf.py new file mode 100644 index 000000000000..3c0b92484f2f --- /dev/null +++ b/tests/compute/test_udf.py @@ -0,0 +1,82 @@ +import backend as F +import dgl +import networkx as nx +import dgl.utils as utils +from dgl import DGLGraph, ALL +from dgl.udf import NodeBatch, EdgeBatch + +def test_node_batch(): + g = dgl.DGLGraph(nx.path_graph(20)) + feat = F.randn((g.number_of_nodes(), 10)) + g.ndata['x'] = feat + + # test all + v = ALL + n_repr = g.get_n_repr(v) + nbatch = NodeBatch(g, v, n_repr) + assert F.allclose(nbatch.data['x'], feat) + assert nbatch.mailbox is None + assert F.allclose(nbatch.nodes(), g.nodes()) 
+ assert nbatch.batch_size() == g.number_of_nodes() + assert len(nbatch) == g.number_of_nodes() + + # test partial + v = utils.toindex(F.tensor([0, 3, 5, 7, 9])) + n_repr = g.get_n_repr(v) + nbatch = NodeBatch(g, v, n_repr) + assert F.allclose(nbatch.data['x'], F.gather_row(feat, F.tensor([0, 3, 5, 7, 9]))) + assert nbatch.mailbox is None + assert F.allclose(nbatch.nodes(), F.tensor([0, 3, 5, 7, 9])) + assert nbatch.batch_size() == 5 + assert len(nbatch) == 5 + +def test_edge_batch(): + d = 10 + g = dgl.DGLGraph(nx.path_graph(20)) + nfeat = F.randn((g.number_of_nodes(), d)) + efeat = F.randn((g.number_of_edges(), d)) + g.ndata['x'] = nfeat + g.edata['x'] = efeat + + # test all + eid = ALL + u, v, _ = g._graph.edges('eid') + + src_data = g.get_n_repr(u) + edge_data = g.get_e_repr(eid) + dst_data = g.get_n_repr(v) + ebatch = EdgeBatch(g, (u, v, eid), src_data, edge_data, dst_data) + assert F.shape(ebatch.src['x'])[0] == g.number_of_edges() and\ + F.shape(ebatch.src['x'])[1] == d + assert F.shape(ebatch.dst['x'])[0] == g.number_of_edges() and\ + F.shape(ebatch.dst['x'])[1] == d + assert F.shape(ebatch.data['x'])[0] == g.number_of_edges() and\ + F.shape(ebatch.data['x'])[1] == d + assert F.allclose(ebatch.edges()[0], u.tousertensor()) + assert F.allclose(ebatch.edges()[1], v.tousertensor()) + assert F.allclose(ebatch.edges()[2], F.arange(0, g.number_of_edges())) + assert ebatch.batch_size() == g.number_of_edges() + assert len(ebatch) == g.number_of_edges() + + # test partial + eid = utils.toindex(F.tensor([0, 3, 5, 7, 11, 13, 15, 27])) + u, v, _ = g._graph.find_edges(eid) + src_data = g.get_n_repr(u) + edge_data = g.get_e_repr(eid) + dst_data = g.get_n_repr(v) + ebatch = EdgeBatch(g, (u, v, eid), src_data, edge_data, dst_data) + assert F.shape(ebatch.src['x'])[0] == 8 and\ + F.shape(ebatch.src['x'])[1] == d + assert F.shape(ebatch.dst['x'])[0] == 8 and\ + F.shape(ebatch.dst['x'])[1] == d + assert F.shape(ebatch.data['x'])[0] == 8 and\ + F.shape(ebatch.data['x'])[1] == d + assert F.allclose(ebatch.edges()[0], u.tousertensor()) + assert F.allclose(ebatch.edges()[1], v.tousertensor()) + assert F.allclose(ebatch.edges()[2], eid.tousertensor()) + assert ebatch.batch_size() == 8 + assert len(ebatch) == 8 + +if __name__ == '__main__': + test_node_batch() + test_edge_batch() From fb62f10deffe60d9da711442bb583951c931211d Mon Sep 17 00:00:00 2001 From: yzh119 Date: Wed, 7 Aug 2019 19:21:40 +0800 Subject: [PATCH 04/32] trigger From e291d6d1f9dbf90ce6dbba907ba6db486ddfa05c Mon Sep 17 00:00:00 2001 From: Song Date: Fri, 16 Aug 2019 11:41:54 +0800 Subject: [PATCH 05/32] Update README.md for pytorch PinSage example. Add noting that the PinSage model example under example/pytorch/recommendation only work with Python 3.6+ as its dataset loader depends on stanfordnlp package which work only with Python 3.6+. --- examples/pytorch/recommendation/README.md | 2 ++ examples/pytorch/recommendation/main.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/pytorch/recommendation/README.md b/examples/pytorch/recommendation/README.md index a5dda0c7b046..da21331f0cf5 100644 --- a/examples/pytorch/recommendation/README.md +++ b/examples/pytorch/recommendation/README.md @@ -1,6 +1,8 @@ # PinSage model NOTE: this version is not using NodeFlow yet. 
+ +This example only work with Python 3.6+ First, download and extract from https://dgl.ai.s3.us-east-2.amazonaws.com/dataset/ml-1m.tar.gz diff --git a/examples/pytorch/recommendation/main.py b/examples/pytorch/recommendation/main.py index 52beb768ef74..6a29d840b7c7 100644 --- a/examples/pytorch/recommendation/main.py +++ b/examples/pytorch/recommendation/main.py @@ -7,7 +7,6 @@ from rec.model.pinsage import PinSage from rec.datasets.movielens import MovieLens from rec.utils import cuda -from rec.adabound import AdaBound from dgl import DGLGraph import argparse From 5fdc2890749df0d313023e123615bed4927a2626 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 12:53:33 +0800 Subject: [PATCH 06/32] Provid a frame agnostic API to test nn modules on both CPU and CUDA side. 1. make dgl.nn.xxx frame agnostic 2. make test.backend include dgl.nn modules 3. modify test_edge_softmax of test/mxnet/test_nn.py and test/pytorch/test_nn.py work on both CPU and GPU --- python/dgl/__init__.py | 3 +- python/dgl/nn/__init__.py | 54 +++++++++++++++++++++++++++++++ python/dgl/nn/backend.py | 13 ++++++++ tests/backend/__init__.py | 5 +++ tests/backend/backend_unittest.py | 4 +++ tests/backend/mxnet/__init__.py | 3 ++ tests/backend/pytorch/__init__.py | 3 ++ tests/mxnet/test_nn.py | 9 +++--- tests/pytorch/test_nn.py | 33 ++++++++++--------- 9 files changed, 106 insertions(+), 21 deletions(-) create mode 100644 python/dgl/nn/backend.py diff --git a/python/dgl/__init__.py b/python/dgl/__init__.py index f6ec994e8e2c..2ad2c69dfc82 100644 --- a/python/dgl/__init__.py +++ b/python/dgl/__init__.py @@ -4,7 +4,6 @@ import socket from . import function -from . import nn from . import contrib from . import container from . import random @@ -22,3 +21,5 @@ from .transform import * from .propagate import * from .udf import NodeBatch, EdgeBatch + +from . import nn \ No newline at end of file diff --git a/python/dgl/nn/__init__.py b/python/dgl/nn/__init__.py index 242106dd407b..e95e9022631d 100644 --- a/python/dgl/nn/__init__.py +++ b/python/dgl/nn/__init__.py @@ -1 +1,55 @@ """Package for neural network common components.""" +from __future__ import absolute_import + +import sys +import os +import importlib + +from . import backend + +_enabled_nn_modules = set() + +def _gen_missing_nn_module(module, mod_name): + def _missing_nn_module(*args, **kwargs): + raise ImportError('nn.Module "%s" is not supported by backend "%s".' + ' You can switch to other backends by setting' + ' the DGLBACKEND environment.' % (module, mod_name)) + return _missing_nn_module + +def load_backend(mod_name): + """load backend module according to mod_name + Parameters + ---------- + mod_name : str + The DGL Backend name. + """ + mod = importlib.import_module('.%s' % mod_name, __name__) + thismod = sys.modules[__name__] + for nn_module in backend.__dict__.keys(): + if nn_module.startswith('__'): + # ignore python builtin attributes + continue + else: + # load functions and classes + if nn_module in mod.__dict__: + _enabled_nn_modules.add(nn_module) + setattr(thismod, nn_module, mod.__dict__[nn_module]) + else: + setattr(thismod, nn_module, _gen_missing_nn_module(nn_module, mod_name)) + +load_backend(os.environ.get('DGLBACKEND', 'pytorch').lower()) + +def is_enabled(nn_module): + """Return true if the nn.module is enabled by the current backend. + + Parameters + ---------- + nn_module : str + The nn.module name. + + Returns + ------- + bool + True if the nn_module is enabled by the current backend. 
+ """ + return nn_module in _enabled_nn_modules \ No newline at end of file diff --git a/python/dgl/nn/backend.py b/python/dgl/nn/backend.py new file mode 100644 index 000000000000..9f2700924582 --- /dev/null +++ b/python/dgl/nn/backend.py @@ -0,0 +1,13 @@ +"""This file defines high level nn.model interface provided by DGL. + +It is recommended the frameworks implement all the interfaces. However, it is +also OK to skip some. The generated backend module has an ``is_enabled`` function +that returns whether the interface is supported by the framework or not. +""" + +############################################################################### +# Softmax module + +def edge_softmax(graph, logits): + """Compute edge softmax""" + pass \ No newline at end of file diff --git a/tests/backend/__init__.py b/tests/backend/__init__.py index 5d388a4ae400..0825ae6e778c 100644 --- a/tests/backend/__init__.py +++ b/tests/backend/__init__.py @@ -1,4 +1,5 @@ from dgl.backend import * +from dgl.nn import * from . import backend_unittest import os import importlib @@ -22,6 +23,7 @@ _zeros = zeros _ones = ones _randn = randn +_rand = rand _tensor = tensor _arange = arange _full = full @@ -43,6 +45,9 @@ def ones(shape, dtype=float32, ctx=_default_context): def randn(shape): return copy_to(_randn(shape), _default_context) +def rand(shape): + return copy_to(_rand(shape), _default_context) + def tensor(data, dtype=None): if dtype is None: if is_tensor(data): diff --git a/tests/backend/backend_unittest.py b/tests/backend/backend_unittest.py index 3c7e90542421..00a391244d37 100644 --- a/tests/backend/backend_unittest.py +++ b/tests/backend/backend_unittest.py @@ -31,6 +31,10 @@ def randn(shape): """Generate a tensor with elements from standard normal distribution.""" pass +def rand(shape): + """Generate a tensor with elements from a discrete uniform distribution.""" + pass + def attach_grad(x): """Flag the tensor *in-place* to have its gradient computed in backward pass. 
diff --git a/tests/backend/mxnet/__init__.py b/tests/backend/mxnet/__init__.py index cd678699b8ca..5dcba2e5f0e5 100644 --- a/tests/backend/mxnet/__init__.py +++ b/tests/backend/mxnet/__init__.py @@ -25,6 +25,9 @@ def allclose(a, b, rtol=1e-4, atol=1e-4): def randn(shape): return nd.random.randn(*shape) +def rand(shape): + return nd.random.randint(0, 1, *shape) + def attach_grad(x): x.attach_grad() return x diff --git a/tests/backend/pytorch/__init__.py b/tests/backend/pytorch/__init__.py index c7ffa72f655b..690902a2ee71 100644 --- a/tests/backend/pytorch/__init__.py +++ b/tests/backend/pytorch/__init__.py @@ -18,6 +18,9 @@ def allclose(a, b, rtol=1e-4, atol=1e-4): def randn(shape): return th.randn(*shape) +def rand(shape): + return th.rand(*shape) + def attach_grad(x): if x.grad is not None: x.grad.zero_() diff --git a/tests/mxnet/test_nn.py b/tests/mxnet/test_nn.py index 4432ef2b5a65..d9bef57cc53c 100644 --- a/tests/mxnet/test_nn.py +++ b/tests/mxnet/test_nn.py @@ -3,6 +3,7 @@ import numpy as np import dgl import dgl.nn.mxnet as nn +import backend as F from mxnet import autograd, gluon def check_close(a, b): @@ -163,16 +164,16 @@ def uniform_attention(g, shape): def test_edge_softmax(): # Basic g = dgl.DGLGraph(nx.path_graph(3)) - edata = mx.nd.ones((g.number_of_edges(), 1)) - a = nn.edge_softmax(g, edata) + edata = F.ones((g.number_of_edges(), 1)) + a = F.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert np.allclose(a.asnumpy(), uniform_attention(g, a.shape).asnumpy(), 1e-4, 1e-4) # Test higher dimension case - edata = mx.nd.ones((g.number_of_edges(), 3, 1)) - a = nn.edge_softmax(g, edata) + edata = F.ones((g.number_of_edges(), 3, 1)) + a = F.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert np.allclose(a.asnumpy(), uniform_attention(g, a.shape).asnumpy(), diff --git a/tests/pytorch/test_nn.py b/tests/pytorch/test_nn.py index baea5f7e85fe..c485067c86ce 100644 --- a/tests/pytorch/test_nn.py +++ b/tests/pytorch/test_nn.py @@ -2,6 +2,7 @@ import networkx as nx import dgl import dgl.nn.pytorch as nn +import backend as F from copy import deepcopy import numpy as np @@ -187,18 +188,18 @@ def uniform_attention(g, shape): def test_edge_softmax(): # Basic g = dgl.DGLGraph(nx.path_graph(3)) - edata = th.ones(g.number_of_edges(), 1) - a = nn.edge_softmax(g, edata) + edata = F.ones((g.number_of_edges(), 1)) + a = F.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 - assert th.allclose(a, uniform_attention(g, a.shape)) + assert F.allclose(a, uniform_attention(g, a.shape)) # Test higher dimension case - edata = th.ones(g.number_of_edges(), 3, 1) - a = nn.edge_softmax(g, edata) + edata = F.ones((g.number_of_edges(), 3, 1)) + a = F.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 - assert th.allclose(a, uniform_attention(g, a.shape)) + assert F.allclose(a, uniform_attention(g, a.shape)) # Test both forward and backward with PyTorch built-in softmax. 
g = dgl.DGLGraph() @@ -208,21 +209,21 @@ def test_edge_softmax(): for j in range(30): g.add_edge(i, j) - score = th.rand(900, 1) + score = F.rand((900, 1)) score.requires_grad_() - grad = th.rand(900, 1) - y = th.softmax(score.view(30, 30), dim=0).view(-1, 1) + grad = F.rand((900, 1)) + y = F.softmax(score.view(30, 30), dim=0).view(-1, 1) y.backward(grad) grad_score = score.grad score.grad.zero_() - y_dgl = nn.edge_softmax(g, score) + y_dgl = F.edge_softmax(g, score) assert len(g.ndata) == 0 assert len(g.edata) == 0 # check forward - assert th.allclose(y_dgl, y) + assert F.allclose(y_dgl, y) y_dgl.backward(grad) # checkout gradient - assert th.allclose(score.grad, grad_score) + assert F.allclose(score.grad, grad_score) print(score.grad[:10], grad_score[:10]) # Test 2 @@ -231,18 +232,18 @@ def generate_rand_graph(n): return dgl.DGLGraph(arr, readonly=True) g = generate_rand_graph(50) - a1 = th.randn(g.number_of_edges(), 1).requires_grad_() + a1 = F.randn((g.number_of_edges(), 1)).requires_grad_() a2 = a1.clone().detach().requires_grad_() g.edata['s'] = a1 - g.group_apply_edges('dst', lambda edges: {'ss':th.softmax(edges.data['s'], 1)}) + g.group_apply_edges('dst', lambda edges: {'ss':F.softmax(edges.data['s'], 1)}) g.edata['ss'].sum().backward() - builtin_sm = nn.edge_softmax(g, a2) + builtin_sm = F.edge_softmax(g, a2) builtin_sm.sum().backward() print(a1.grad - a2.grad) assert len(g.ndata) == 0 assert len(g.edata) == 2 - assert th.allclose(a1.grad, a2.grad, rtol=1e-4, atol=1e-4) # Follow tolerance in unittest backend + assert F.allclose(a1.grad, a2.grad, rtol=1e-4, atol=1e-4) # Follow tolerance in unittest backend if __name__ == '__main__': From 2e89c6f72e932cf7ca4f2e73d5faeeef7d9682b5 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 13:21:43 +0800 Subject: [PATCH 07/32] Fix style --- python/dgl/__init__.py | 2 +- python/dgl/nn/__init__.py | 6 +++--- python/dgl/nn/backend.py | 16 ++++++++++++++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/python/dgl/__init__.py b/python/dgl/__init__.py index 2ad2c69dfc82..e8b1f0d0e0e5 100644 --- a/python/dgl/__init__.py +++ b/python/dgl/__init__.py @@ -22,4 +22,4 @@ from .propagate import * from .udf import NodeBatch, EdgeBatch -from . import nn \ No newline at end of file +from . import nn diff --git a/python/dgl/nn/__init__.py b/python/dgl/nn/__init__.py index e95e9022631d..aae9c27337d9 100644 --- a/python/dgl/nn/__init__.py +++ b/python/dgl/nn/__init__.py @@ -17,7 +17,7 @@ def _missing_nn_module(*args, **kwargs): return _missing_nn_module def load_backend(mod_name): - """load backend module according to mod_name + """load backend module according to mod_name Parameters ---------- mod_name : str @@ -25,7 +25,7 @@ def load_backend(mod_name): """ mod = importlib.import_module('.%s' % mod_name, __name__) thismod = sys.modules[__name__] - for nn_module in backend.__dict__.keys(): + for nn_module in backend.__dict__: if nn_module.startswith('__'): # ignore python builtin attributes continue @@ -52,4 +52,4 @@ def is_enabled(nn_module): bool True if the nn_module is enabled by the current backend. 
""" - return nn_module in _enabled_nn_modules \ No newline at end of file + return nn_module in _enabled_nn_modules diff --git a/python/dgl/nn/backend.py b/python/dgl/nn/backend.py index 9f2700924582..e7b6176b5e4b 100644 --- a/python/dgl/nn/backend.py +++ b/python/dgl/nn/backend.py @@ -9,5 +9,17 @@ # Softmax module def edge_softmax(graph, logits): - """Compute edge softmax""" - pass \ No newline at end of file + """Compute edge softmax + Parameters + ---------- + graph : DGLGraph + The graph to perform edge softmax + logits : Tensor + The input edge feature + + Returns + ------- + Tensor + Softmax value + """ + pass From b1af382a766adb65ea3508485d9c786f7ff45a34 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 13:39:46 +0800 Subject: [PATCH 08/32] Delete unused code --- python/dgl/nn/__init__.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/python/dgl/nn/__init__.py b/python/dgl/nn/__init__.py index aae9c27337d9..4be0dec86c5c 100644 --- a/python/dgl/nn/__init__.py +++ b/python/dgl/nn/__init__.py @@ -7,8 +7,6 @@ from . import backend -_enabled_nn_modules = set() - def _gen_missing_nn_module(module, mod_name): def _missing_nn_module(*args, **kwargs): raise ImportError('nn.Module "%s" is not supported by backend "%s".' @@ -32,24 +30,8 @@ def load_backend(mod_name): else: # load functions and classes if nn_module in mod.__dict__: - _enabled_nn_modules.add(nn_module) setattr(thismod, nn_module, mod.__dict__[nn_module]) else: setattr(thismod, nn_module, _gen_missing_nn_module(nn_module, mod_name)) load_backend(os.environ.get('DGLBACKEND', 'pytorch').lower()) - -def is_enabled(nn_module): - """Return true if the nn.module is enabled by the current backend. - - Parameters - ---------- - nn_module : str - The nn.module name. - - Returns - ------- - bool - True if the nn_module is enabled by the current backend. - """ - return nn_module in _enabled_nn_modules From 85630e33ccde637fa463de368b12edee63718bf9 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 15:47:12 +0800 Subject: [PATCH 09/32] Make agnostic test only related to tests/backend 1. clear all agnostic related code in dgl.nn 2. make test_graph_conv agnostic to cpu/gpu --- python/dgl/__init__.py | 3 +-- python/dgl/nn/__init__.py | 38 +------------------------------------- python/dgl/nn/backend.py | 25 ------------------------- tests/backend/__init__.py | 6 ++++++ tests/mxnet/test_nn.py | 24 ++++++++++++------------ tests/pytorch/test_nn.py | 35 +++++++++++++++++++++-------------- 6 files changed, 41 insertions(+), 90 deletions(-) delete mode 100644 python/dgl/nn/backend.py diff --git a/python/dgl/__init__.py b/python/dgl/__init__.py index e8b1f0d0e0e5..f6ec994e8e2c 100644 --- a/python/dgl/__init__.py +++ b/python/dgl/__init__.py @@ -4,6 +4,7 @@ import socket from . import function +from . import nn from . import contrib from . import container from . import random @@ -21,5 +22,3 @@ from .transform import * from .propagate import * from .udf import NodeBatch, EdgeBatch - -from . import nn diff --git a/python/dgl/nn/__init__.py b/python/dgl/nn/__init__.py index 4be0dec86c5c..90d4522c3fd4 100644 --- a/python/dgl/nn/__init__.py +++ b/python/dgl/nn/__init__.py @@ -1,37 +1 @@ -"""Package for neural network common components.""" -from __future__ import absolute_import - -import sys -import os -import importlib - -from . import backend - -def _gen_missing_nn_module(module, mod_name): - def _missing_nn_module(*args, **kwargs): - raise ImportError('nn.Module "%s" is not supported by backend "%s".' 
- ' You can switch to other backends by setting' - ' the DGLBACKEND environment.' % (module, mod_name)) - return _missing_nn_module - -def load_backend(mod_name): - """load backend module according to mod_name - Parameters - ---------- - mod_name : str - The DGL Backend name. - """ - mod = importlib.import_module('.%s' % mod_name, __name__) - thismod = sys.modules[__name__] - for nn_module in backend.__dict__: - if nn_module.startswith('__'): - # ignore python builtin attributes - continue - else: - # load functions and classes - if nn_module in mod.__dict__: - setattr(thismod, nn_module, mod.__dict__[nn_module]) - else: - setattr(thismod, nn_module, _gen_missing_nn_module(nn_module, mod_name)) - -load_backend(os.environ.get('DGLBACKEND', 'pytorch').lower()) +"""Package for neural network common components.""" \ No newline at end of file diff --git a/python/dgl/nn/backend.py b/python/dgl/nn/backend.py deleted file mode 100644 index e7b6176b5e4b..000000000000 --- a/python/dgl/nn/backend.py +++ /dev/null @@ -1,25 +0,0 @@ -"""This file defines high level nn.model interface provided by DGL. - -It is recommended the frameworks implement all the interfaces. However, it is -also OK to skip some. The generated backend module has an ``is_enabled`` function -that returns whether the interface is supported by the framework or not. -""" - -############################################################################### -# Softmax module - -def edge_softmax(graph, logits): - """Compute edge softmax - Parameters - ---------- - graph : DGLGraph - The graph to perform edge softmax - logits : Tensor - The input edge feature - - Returns - ------- - Tensor - Softmax value - """ - pass diff --git a/tests/backend/__init__.py b/tests/backend/__init__.py index 0825ae6e778c..dd189bdec706 100644 --- a/tests/backend/__init__.py +++ b/tests/backend/__init__.py @@ -36,6 +36,12 @@ } _default_context = _context_dict[_default_context_str] +def ctx(): + return _default_context + +def gpu_ctx(): + return (_default_context_str == 'gpu') + def zeros(shape, dtype=float32, ctx=_default_context): return _zeros(shape, dtype, ctx) diff --git a/tests/mxnet/test_nn.py b/tests/mxnet/test_nn.py index d9bef57cc53c..ffff72a0ae45 100644 --- a/tests/mxnet/test_nn.py +++ b/tests/mxnet/test_nn.py @@ -16,19 +16,19 @@ def _AXWb(A, X, W, b): def test_graph_conv(): g = dgl.DGLGraph(nx.path_graph(3)) - adj = g.adjacency_matrix() - ctx = mx.cpu(0) + ctx = F.ctx() + adj = g.adjacency_matrix(ctx=ctx) conv = nn.GraphConv(5, 2, norm=False, bias=True) conv.initialize(ctx=ctx) # test#1: basic - h0 = mx.nd.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 check_close(h1, _AXWb(adj, h0, conv.weight, conv.bias)) # test#2: more-dim - h0 = mx.nd.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 @@ -38,12 +38,12 @@ def test_graph_conv(): conv.initialize(ctx=ctx) # test#3: basic - h0 = mx.nd.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 # test#4: basic - h0 = mx.nd.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 @@ -53,23 +53,23 @@ def test_graph_conv(): with autograd.train_mode(): # test#3: basic - h0 = mx.nd.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 # test#4: basic - h0 = mx.nd.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert 
len(g.edata) == 0 # test not override features - g.ndata["h"] = 2 * mx.nd.ones((3, 1)) + g.ndata["h"] = 2 * F.ones((3, 1)) h1 = conv(h0, g) assert len(g.ndata) == 1 assert len(g.edata) == 0 assert "h" in g.ndata - check_close(g.ndata['h'], 2 * mx.nd.ones((3, 1))) + check_close(g.ndata['h'], 2 * F.ones((3, 1))) def test_set2set(): g = dgl.DGLGraph(nx.path_graph(10)) @@ -165,7 +165,7 @@ def test_edge_softmax(): # Basic g = dgl.DGLGraph(nx.path_graph(3)) edata = F.ones((g.number_of_edges(), 1)) - a = F.edge_softmax(g, edata) + a = nn.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert np.allclose(a.asnumpy(), uniform_attention(g, a.shape).asnumpy(), @@ -173,7 +173,7 @@ def test_edge_softmax(): # Test higher dimension case edata = F.ones((g.number_of_edges(), 3, 1)) - a = F.edge_softmax(g, edata) + a = nn.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert np.allclose(a.asnumpy(), uniform_attention(g, a.shape).asnumpy(), diff --git a/tests/pytorch/test_nn.py b/tests/pytorch/test_nn.py index c485067c86ce..7a37e8806456 100644 --- a/tests/pytorch/test_nn.py +++ b/tests/pytorch/test_nn.py @@ -15,43 +15,50 @@ def _AXWb(A, X, W, b): def test_graph_conv(): g = dgl.DGLGraph(nx.path_graph(3)) - adj = g.adjacency_matrix() + ctx = F.ctx() + adj = g.adjacency_matrix(ctx=ctx) conv = nn.GraphConv(5, 2, norm=False, bias=True) + if F.gpu_ctx(): + conv.cuda() print(conv) # test#1: basic - h0 = th.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 - assert th.allclose(h1, _AXWb(adj, h0, conv.weight, conv.bias)) + assert F.allclose(h1, _AXWb(adj, h0, conv.weight, conv.bias)) # test#2: more-dim - h0 = th.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 - assert th.allclose(h1, _AXWb(adj, h0, conv.weight, conv.bias)) + assert F.allclose(h1, _AXWb(adj, h0, conv.weight, conv.bias)) conv = nn.GraphConv(5, 2) + if F.gpu_ctx(): + conv.cuda() # test#3: basic - h0 = th.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 # test#4: basic - h0 = th.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 conv = nn.GraphConv(5, 2) + if F.gpu_ctx(): + conv.cuda() # test#3: basic - h0 = th.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 # test#4: basic - h0 = th.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 @@ -60,7 +67,7 @@ def test_graph_conv(): old_weight = deepcopy(conv.weight.data) conv.reset_parameters() new_weight = conv.weight.data - assert not th.allclose(old_weight, new_weight) + assert not F.allclose(old_weight, new_weight) def test_set2set(): g = dgl.DGLGraph(nx.path_graph(10)) @@ -189,14 +196,14 @@ def test_edge_softmax(): # Basic g = dgl.DGLGraph(nx.path_graph(3)) edata = F.ones((g.number_of_edges(), 1)) - a = F.edge_softmax(g, edata) + a = nn.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert F.allclose(a, uniform_attention(g, a.shape)) # Test higher dimension case edata = F.ones((g.number_of_edges(), 3, 1)) - a = F.edge_softmax(g, edata) + a = nn.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert F.allclose(a, uniform_attention(g, a.shape)) @@ -216,7 +223,7 @@ def test_edge_softmax(): y.backward(grad) grad_score = score.grad score.grad.zero_() - y_dgl = F.edge_softmax(g, score) 
+ y_dgl = nn.edge_softmax(g, score) assert len(g.ndata) == 0 assert len(g.edata) == 0 # check forward @@ -238,7 +245,7 @@ def generate_rand_graph(n): g.group_apply_edges('dst', lambda edges: {'ss':F.softmax(edges.data['s'], 1)}) g.edata['ss'].sum().backward() - builtin_sm = F.edge_softmax(g, a2) + builtin_sm = nn.edge_softmax(g, a2) builtin_sm.sum().backward() print(a1.grad - a2.grad) assert len(g.ndata) == 0 From 874352f4ce00de9e56f5517ca361dedca9e64716 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 15:53:16 +0800 Subject: [PATCH 10/32] Fix code style --- python/dgl/nn/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dgl/nn/__init__.py b/python/dgl/nn/__init__.py index 90d4522c3fd4..242106dd407b 100644 --- a/python/dgl/nn/__init__.py +++ b/python/dgl/nn/__init__.py @@ -1 +1 @@ -"""Package for neural network common components.""" \ No newline at end of file +"""Package for neural network common components.""" From 47e468ac55134a09a71e4a193563ee1dcdcf602f Mon Sep 17 00:00:00 2001 From: yzh119 Date: Mon, 19 Aug 2019 17:21:47 +0800 Subject: [PATCH 11/32] fix --- python/dgl/nn/mxnet/glob.py | 11 ----------- python/dgl/nn/pytorch/glob.py | 24 +++++++++++------------- 2 files changed, 11 insertions(+), 24 deletions(-) diff --git a/python/dgl/nn/mxnet/glob.py b/python/dgl/nn/mxnet/glob.py index 1f0863f669fa..32d33df9402e 100644 --- a/python/dgl/nn/mxnet/glob.py +++ b/python/dgl/nn/mxnet/glob.py @@ -191,13 +191,6 @@ def __init__(self, gate_nn, feat_nn=None): self.gate_nn = gate_nn self.feat_nn = feat_nn - self._reset_parameters() - - def _reset_parameters(self): - self.gate_nn.initialize(mx.init.Xavier()) - if self.feat_nn: - self.feat_nn.initialize(mx.init.Xavier()) - def forward(self, feat, graph): r"""Compute global attention pooling. @@ -265,10 +258,6 @@ def __init__(self, input_dim, n_iters, n_layers): with self.name_scope(): self.lstm = gluon.rnn.LSTM( self.input_dim, num_layers=n_layers, input_size=self.output_dim) - self._reset_parameters() - - def _reset_parameters(self): - self.lstm.initialize(mx.init.Xavier()) def forward(self, feat, graph): r"""Compute set2set pooling. diff --git a/python/dgl/nn/pytorch/glob.py b/python/dgl/nn/pytorch/glob.py index 05254f27d4c6..3994bbc8844e 100644 --- a/python/dgl/nn/pytorch/glob.py +++ b/python/dgl/nn/pytorch/glob.py @@ -178,9 +178,9 @@ def __init__(self, gate_nn, feat_nn=None): super(GlobalAttentionPooling, self).__init__() self.gate_nn = gate_nn self.feat_nn = feat_nn - self._reset_parameters() + self.reset_parameters() - def _reset_parameters(self): + def reset_parameters(self): for p in self.gate_nn.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) @@ -256,12 +256,10 @@ def __init__(self, input_dim, n_iters, n_layers): self.n_iters = n_iters self.n_layers = n_layers self.lstm = th.nn.LSTM(self.output_dim, self.input_dim, n_layers) - self._reset_parameters() + self.reset_parameters() - def _reset_parameters(self): - for p in self.lstm.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) + def reset_parameters(self): + self.lstm.reset_parameters() def forward(self, feat, graph): r"""Compute set2set pooling. 
@@ -342,9 +340,9 @@ def __init__(self, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0.): self.dropa = nn.Dropout(dropouta) self.norm_in = nn.LayerNorm(d_model) self.norm_inter = nn.LayerNorm(d_model) - self._reset_parameters() + self.reset_parameters() - def _reset_parameters(self): + def reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) @@ -441,9 +439,9 @@ def __init__(self, m, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0. self.mha = nn.ModuleList([ MultiHeadAttention(d_model, num_heads, d_head, d_ff, dropouth=dropouth, dropouta=dropouta) for _ in range(2)]) - self._reset_parameters() + self.reset_parameters() - def _reset_parameters(self): + def reset_parameters(self): nn.init.xavier_uniform_(self.inducing_points) def forward(self, feat, lengths): @@ -492,9 +490,9 @@ def __init__(self, k, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0. nn.Dropout(dropouth), nn.Linear(d_ff, d_model) ) - self._reset_parameters() + self.reset_parameters() - def _reset_parameters(self): + def reset_parameters(self): nn.init.xavier_uniform_(self.seed_vectors) def forward(self, feat, lengths): From 10e1d274d4f6fa5c783e2ce5c1aca9bd438169dc Mon Sep 17 00:00:00 2001 From: yzh119 Date: Mon, 19 Aug 2019 17:41:37 +0800 Subject: [PATCH 12/32] doc --- python/dgl/nn/pytorch/glob.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/dgl/nn/pytorch/glob.py b/python/dgl/nn/pytorch/glob.py index 3994bbc8844e..26fd52444515 100644 --- a/python/dgl/nn/pytorch/glob.py +++ b/python/dgl/nn/pytorch/glob.py @@ -181,6 +181,7 @@ def __init__(self, gate_nn, feat_nn=None): self.reset_parameters() def reset_parameters(self): + """Reinitialize learnable parameters.""" for p in self.gate_nn.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) @@ -259,6 +260,7 @@ def __init__(self, input_dim, n_iters, n_layers): self.reset_parameters() def reset_parameters(self): + """Reinitialize learnable parameters.""" self.lstm.reset_parameters() def forward(self, feat, graph): @@ -343,6 +345,7 @@ def __init__(self, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0.): self.reset_parameters() def reset_parameters(self): + """Reinitialize learnable parameters.""" for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) @@ -442,6 +445,7 @@ def __init__(self, m, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0. self.reset_parameters() def reset_parameters(self): + """Reinitialize learnable parameters.""" nn.init.xavier_uniform_(self.inducing_points) def forward(self, feat, lengths): @@ -493,6 +497,7 @@ def __init__(self, k, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0. self.reset_parameters() def reset_parameters(self): + """Reinitialize learnable parameters.""" nn.init.xavier_uniform_(self.seed_vectors) def forward(self, feat, lengths): From 4bfe71cbbf21cb5f7d03e254fb83112a4156bab4 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 21:31:54 +0800 Subject: [PATCH 13/32] Make all test code under tests.mxnet/pytorch.test_nn.py work on both CPU and GPU. 
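
The updated tests take their device from the shared test backend instead of
hard-coding CPU constructors: tensors are created through F.ones/F.randn so
they land on the configured device, PyTorch modules are moved with .cuda()
only when F.gpu_ctx() is true, and the MXNet modules are initialized with
ctx=F.ctx(). A minimal sketch of the pattern (PyTorch flavour; `backend` is
the test-only package under tests/backend, importable as F once the test
suite's path setup is in place):

    import networkx as nx
    import dgl
    import dgl.nn.pytorch as nn
    import backend as F            # tests/backend, device chosen by the suite

    g = dgl.DGLGraph(nx.path_graph(3))
    conv = nn.GraphConv(5, 2)
    if F.gpu_ctx():                # true when the tests are configured for GPU
        conv.cuda()                # move module parameters onto the GPU
    h0 = F.ones((3, 5))            # created on F.ctx(), same device as conv
    h1 = conv(h0, g)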
--- tests/mxnet/test_nn.py | 53 +++++++++++++++++--------------- tests/pytorch/test_nn.py | 66 ++++++++++++++++++++++------------------ 2 files changed, 65 insertions(+), 54 deletions(-) diff --git a/tests/mxnet/test_nn.py b/tests/mxnet/test_nn.py index ffff72a0ae45..e151958f8d31 100644 --- a/tests/mxnet/test_nn.py +++ b/tests/mxnet/test_nn.py @@ -73,34 +73,38 @@ def test_graph_conv(): def test_set2set(): g = dgl.DGLGraph(nx.path_graph(10)) + ctx = F.ctx() s2s = nn.Set2Set(5, 3, 3) # hidden size 5, 3 iters, 3 layers + s2s.initialize(ctx=ctx) print(s2s) # test#1: basic - h0 = mx.nd.random.randn(g.number_of_nodes(), 5) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = s2s(h0, g) assert h1.shape[0] == 10 and h1.ndim == 1 # test#2: batched graph bg = dgl.batch([g, g, g]) - h0 = mx.nd.random.randn(bg.number_of_nodes(), 5) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = s2s(h0, bg) assert h1.shape[0] == 3 and h1.shape[1] == 10 and h1.ndim == 2 def test_glob_att_pool(): g = dgl.DGLGraph(nx.path_graph(10)) + ctx = F.ctx() gap = nn.GlobalAttentionPooling(gluon.nn.Dense(1), gluon.nn.Dense(10)) + gap.initialize(ctx=ctx) print(gap) # test#1: basic - h0 = mx.nd.random.randn(g.number_of_nodes(), 5) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = gap(h0, g) assert h1.shape[0] == 10 and h1.ndim == 1 # test#2: batched graph bg = dgl.batch([g, g, g, g]) - h0 = mx.nd.random.randn(bg.number_of_nodes(), 5) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = gap(h0, bg) assert h1.shape[0] == 4 and h1.shape[1] == 10 and h1.ndim == 2 @@ -114,48 +118,47 @@ def test_simple_pool(): print(sum_pool, avg_pool, max_pool, sort_pool) # test#1: basic - h0 = mx.nd.random.randn(g.number_of_nodes(), 5) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = sum_pool(h0, g) - check_close(h1, mx.nd.sum(h0, 0)) + check_close(h1, F.sum(h0, 0)) h1 = avg_pool(h0, g) - check_close(h1, mx.nd.mean(h0, 0)) + check_close(h1, F.mean(h0, 0)) h1 = max_pool(h0, g) - check_close(h1, mx.nd.max(h0, 0)) + check_close(h1, F.max(h0, 0)) h1 = sort_pool(h0, g) assert h1.shape[0] == 10 * 5 and h1.ndim == 1 # test#2: batched graph g_ = dgl.DGLGraph(nx.path_graph(5)) bg = dgl.batch([g, g_, g, g_, g]) - h0 = mx.nd.random.randn(bg.number_of_nodes(), 5) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = sum_pool(h0, bg) - truth = mx.nd.stack(mx.nd.sum(h0[:15], 0), - mx.nd.sum(h0[15:20], 0), - mx.nd.sum(h0[20:35], 0), - mx.nd.sum(h0[35:40], 0), - mx.nd.sum(h0[40:55], 0), axis=0) + truth = mx.nd.stack(F.sum(h0[:15], 0), + F.sum(h0[15:20], 0), + F.sum(h0[20:35], 0), + F.sum(h0[35:40], 0), + F.sum(h0[40:55], 0), axis=0) check_close(h1, truth) h1 = avg_pool(h0, bg) - truth = mx.nd.stack(mx.nd.mean(h0[:15], 0), - mx.nd.mean(h0[15:20], 0), - mx.nd.mean(h0[20:35], 0), - mx.nd.mean(h0[35:40], 0), - mx.nd.mean(h0[40:55], 0), axis=0) + truth = mx.nd.stack(F.mean(h0[:15], 0), + F.mean(h0[15:20], 0), + F.mean(h0[20:35], 0), + F.mean(h0[35:40], 0), + F.mean(h0[40:55], 0), axis=0) check_close(h1, truth) h1 = max_pool(h0, bg) - truth = mx.nd.stack(mx.nd.max(h0[:15], 0), - mx.nd.max(h0[15:20], 0), - mx.nd.max(h0[20:35], 0), - mx.nd.max(h0[35:40], 0), - mx.nd.max(h0[40:55], 0), axis=0) + truth = mx.nd.stack(F.max(h0[:15], 0), + F.max(h0[15:20], 0), + F.max(h0[20:35], 0), + F.max(h0[35:40], 0), + F.max(h0[40:55], 0), axis=0) check_close(h1, truth) h1 = sort_pool(h0, bg) assert h1.shape[0] == 5 and h1.shape[1] == 10 * 5 and h1.ndim == 2 - def uniform_attention(g, shape): a = mx.nd.ones(shape) target_shape = (g.number_of_edges(),) + (1,) * (len(shape) - 1) diff --git a/tests/pytorch/test_nn.py 
b/tests/pytorch/test_nn.py index 7a37e8806456..b38913b2c9b6 100644 --- a/tests/pytorch/test_nn.py +++ b/tests/pytorch/test_nn.py @@ -73,10 +73,12 @@ def test_set2set(): g = dgl.DGLGraph(nx.path_graph(10)) s2s = nn.Set2Set(5, 3, 3) # hidden size 5, 3 iters, 3 layers + if F.gpu_ctx(): + s2s.cuda() print(s2s) # test#1: basic - h0 = th.rand(g.number_of_nodes(), 5) + h0 = F.rand((g.number_of_nodes(), 5)) h1 = s2s(h0, g) assert h1.shape[0] == 10 and h1.dim() == 1 @@ -84,7 +86,7 @@ def test_set2set(): g1 = dgl.DGLGraph(nx.path_graph(11)) g2 = dgl.DGLGraph(nx.path_graph(5)) bg = dgl.batch([g, g1, g2]) - h0 = th.rand(bg.number_of_nodes(), 5) + h0 = F.rand((bg.number_of_nodes(), 5)) h1 = s2s(h0, bg) assert h1.shape[0] == 3 and h1.shape[1] == 10 and h1.dim() == 2 @@ -92,16 +94,18 @@ def test_glob_att_pool(): g = dgl.DGLGraph(nx.path_graph(10)) gap = nn.GlobalAttentionPooling(th.nn.Linear(5, 1), th.nn.Linear(5, 10)) + if F.gpu_ctx(): + gap.cuda() print(gap) # test#1: basic - h0 = th.rand(g.number_of_nodes(), 5) + h0 = F.rand((g.number_of_nodes(), 5)) h1 = gap(h0, g) assert h1.shape[0] == 10 and h1.dim() == 1 # test#2: batched graph bg = dgl.batch([g, g, g, g]) - h0 = th.rand(bg.number_of_nodes(), 5) + h0 = F.rand((bg.number_of_nodes(), 5)) h1 = gap(h0, bg) assert h1.shape[0] == 4 and h1.shape[1] == 10 and h1.dim() == 2 @@ -115,44 +119,44 @@ def test_simple_pool(): print(sum_pool, avg_pool, max_pool, sort_pool) # test#1: basic - h0 = th.rand(g.number_of_nodes(), 5) + h0 = F.rand((g.number_of_nodes(), 5)) h1 = sum_pool(h0, g) - assert th.allclose(h1, th.sum(h0, 0)) + assert F.allclose(h1, F.sum(h0, 0)) h1 = avg_pool(h0, g) - assert th.allclose(h1, th.mean(h0, 0)) + assert F.allclose(h1, F.mean(h0, 0)) h1 = max_pool(h0, g) - assert th.allclose(h1, th.max(h0, 0)[0]) + assert F.allclose(h1, F.max(h0, 0)) h1 = sort_pool(h0, g) assert h1.shape[0] == 10 * 5 and h1.dim() == 1 # test#2: batched graph g_ = dgl.DGLGraph(nx.path_graph(5)) bg = dgl.batch([g, g_, g, g_, g]) - h0 = th.rand(bg.number_of_nodes(), 5) + h0 = F.rand((bg.number_of_nodes(), 5)) h1 = sum_pool(h0, bg) - truth = th.stack([th.sum(h0[:15], 0), - th.sum(h0[15:20], 0), - th.sum(h0[20:35], 0), - th.sum(h0[35:40], 0), - th.sum(h0[40:55], 0)], 0) - assert th.allclose(h1, truth) + truth = th.stack([F.sum(h0[:15], 0), + F.sum(h0[15:20], 0), + F.sum(h0[20:35], 0), + F.sum(h0[35:40], 0), + F.sum(h0[40:55], 0)], 0) + assert F.allclose(h1, truth) h1 = avg_pool(h0, bg) - truth = th.stack([th.mean(h0[:15], 0), - th.mean(h0[15:20], 0), - th.mean(h0[20:35], 0), - th.mean(h0[35:40], 0), - th.mean(h0[40:55], 0)], 0) - assert th.allclose(h1, truth) + truth = th.stack([F.mean(h0[:15], 0), + F.mean(h0[15:20], 0), + F.mean(h0[20:35], 0), + F.mean(h0[35:40], 0), + F.mean(h0[40:55], 0)], 0) + assert F.allclose(h1, truth) h1 = max_pool(h0, bg) - truth = th.stack([th.max(h0[:15], 0)[0], - th.max(h0[15:20], 0)[0], - th.max(h0[20:35], 0)[0], - th.max(h0[35:40], 0)[0], - th.max(h0[40:55], 0)[0]], 0) - assert th.allclose(h1, truth) + truth = th.stack([F.max(h0[:15], 0), + F.max(h0[15:20], 0), + F.max(h0[20:35], 0), + F.max(h0[35:40], 0), + F.max(h0[40:55], 0)], 0) + assert F.allclose(h1, truth) h1 = sort_pool(h0, bg) assert h1.shape[0] == 5 and h1.shape[1] == 10 * 5 and h1.dim() == 2 @@ -163,10 +167,14 @@ def test_set_trans(): st_enc_0 = nn.SetTransformerEncoder(50, 5, 10, 100, 2, 'sab') st_enc_1 = nn.SetTransformerEncoder(50, 5, 10, 100, 2, 'isab', 3) st_dec = nn.SetTransformerDecoder(50, 5, 10, 100, 2, 4) + if F.gpu_ctx(): + st_enc_0.cuda() + st_enc_1.cuda() + 
st_dec.cuda() print(st_enc_0, st_enc_1, st_dec) # test#1: basic - h0 = th.rand(g.number_of_nodes(), 50) + h0 = F.rand((g.number_of_nodes(), 50)) h1 = st_enc_0(h0, g) assert h1.shape == h0.shape h1 = st_enc_1(h0, g) @@ -178,7 +186,7 @@ def test_set_trans(): g1 = dgl.DGLGraph(nx.path_graph(5)) g2 = dgl.DGLGraph(nx.path_graph(10)) bg = dgl.batch([g, g1, g2]) - h0 = th.rand(bg.number_of_nodes(), 50) + h0 = F.rand((bg.number_of_nodes(), 50)) h1 = st_enc_0(h0, bg) assert h1.shape == h0.shape h1 = st_enc_1(h0, bg) From a91b1bbdf0e7d1626fd182b7b5d048112986afa1 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 21:34:54 +0800 Subject: [PATCH 14/32] Fix syntex --- python/dgl/nn/mxnet/glob.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/dgl/nn/mxnet/glob.py b/python/dgl/nn/mxnet/glob.py index 32d33df9402e..62eef9f40678 100644 --- a/python/dgl/nn/mxnet/glob.py +++ b/python/dgl/nn/mxnet/glob.py @@ -1,6 +1,5 @@ """MXNet modules for graph global pooling.""" # pylint: disable= no-member, arguments-differ, C0103, W0235 -import mxnet as mx from mxnet import gluon, nd from mxnet.gluon import nn From edf6a0ed69cbc6c954dd7eb60bab15d2767faf0b Mon Sep 17 00:00:00 2001 From: Song Date: Wed, 21 Aug 2019 15:17:12 +0800 Subject: [PATCH 15/32] Remove rand --- tests/backend/__init__.py | 4 ---- tests/backend/backend_unittest.py | 4 ---- tests/backend/mxnet/__init__.py | 3 --- tests/backend/pytorch/__init__.py | 3 --- tests/pytorch/test_nn.py | 20 ++++++++++---------- 5 files changed, 10 insertions(+), 24 deletions(-) diff --git a/tests/backend/__init__.py b/tests/backend/__init__.py index dd189bdec706..542f2daa0d84 100644 --- a/tests/backend/__init__.py +++ b/tests/backend/__init__.py @@ -23,7 +23,6 @@ _zeros = zeros _ones = ones _randn = randn -_rand = rand _tensor = tensor _arange = arange _full = full @@ -51,9 +50,6 @@ def ones(shape, dtype=float32, ctx=_default_context): def randn(shape): return copy_to(_randn(shape), _default_context) -def rand(shape): - return copy_to(_rand(shape), _default_context) - def tensor(data, dtype=None): if dtype is None: if is_tensor(data): diff --git a/tests/backend/backend_unittest.py b/tests/backend/backend_unittest.py index 00a391244d37..3c7e90542421 100644 --- a/tests/backend/backend_unittest.py +++ b/tests/backend/backend_unittest.py @@ -31,10 +31,6 @@ def randn(shape): """Generate a tensor with elements from standard normal distribution.""" pass -def rand(shape): - """Generate a tensor with elements from a discrete uniform distribution.""" - pass - def attach_grad(x): """Flag the tensor *in-place* to have its gradient computed in backward pass. 
diff --git a/tests/backend/mxnet/__init__.py b/tests/backend/mxnet/__init__.py index 5dcba2e5f0e5..cd678699b8ca 100644 --- a/tests/backend/mxnet/__init__.py +++ b/tests/backend/mxnet/__init__.py @@ -25,9 +25,6 @@ def allclose(a, b, rtol=1e-4, atol=1e-4): def randn(shape): return nd.random.randn(*shape) -def rand(shape): - return nd.random.randint(0, 1, *shape) - def attach_grad(x): x.attach_grad() return x diff --git a/tests/backend/pytorch/__init__.py b/tests/backend/pytorch/__init__.py index 690902a2ee71..c7ffa72f655b 100644 --- a/tests/backend/pytorch/__init__.py +++ b/tests/backend/pytorch/__init__.py @@ -18,9 +18,6 @@ def allclose(a, b, rtol=1e-4, atol=1e-4): def randn(shape): return th.randn(*shape) -def rand(shape): - return th.rand(*shape) - def attach_grad(x): if x.grad is not None: x.grad.zero_() diff --git a/tests/pytorch/test_nn.py b/tests/pytorch/test_nn.py index b38913b2c9b6..3fbfdbadcf9e 100644 --- a/tests/pytorch/test_nn.py +++ b/tests/pytorch/test_nn.py @@ -78,7 +78,7 @@ def test_set2set(): print(s2s) # test#1: basic - h0 = F.rand((g.number_of_nodes(), 5)) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = s2s(h0, g) assert h1.shape[0] == 10 and h1.dim() == 1 @@ -86,7 +86,7 @@ def test_set2set(): g1 = dgl.DGLGraph(nx.path_graph(11)) g2 = dgl.DGLGraph(nx.path_graph(5)) bg = dgl.batch([g, g1, g2]) - h0 = F.rand((bg.number_of_nodes(), 5)) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = s2s(h0, bg) assert h1.shape[0] == 3 and h1.shape[1] == 10 and h1.dim() == 2 @@ -99,13 +99,13 @@ def test_glob_att_pool(): print(gap) # test#1: basic - h0 = F.rand((g.number_of_nodes(), 5)) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = gap(h0, g) assert h1.shape[0] == 10 and h1.dim() == 1 # test#2: batched graph bg = dgl.batch([g, g, g, g]) - h0 = F.rand((bg.number_of_nodes(), 5)) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = gap(h0, bg) assert h1.shape[0] == 4 and h1.shape[1] == 10 and h1.dim() == 2 @@ -119,7 +119,7 @@ def test_simple_pool(): print(sum_pool, avg_pool, max_pool, sort_pool) # test#1: basic - h0 = F.rand((g.number_of_nodes(), 5)) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = sum_pool(h0, g) assert F.allclose(h1, F.sum(h0, 0)) h1 = avg_pool(h0, g) @@ -132,7 +132,7 @@ def test_simple_pool(): # test#2: batched graph g_ = dgl.DGLGraph(nx.path_graph(5)) bg = dgl.batch([g, g_, g, g_, g]) - h0 = F.rand((bg.number_of_nodes(), 5)) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = sum_pool(h0, bg) truth = th.stack([F.sum(h0[:15], 0), @@ -174,7 +174,7 @@ def test_set_trans(): print(st_enc_0, st_enc_1, st_dec) # test#1: basic - h0 = F.rand((g.number_of_nodes(), 50)) + h0 = F.randn((g.number_of_nodes(), 50)) h1 = st_enc_0(h0, g) assert h1.shape == h0.shape h1 = st_enc_1(h0, g) @@ -186,7 +186,7 @@ def test_set_trans(): g1 = dgl.DGLGraph(nx.path_graph(5)) g2 = dgl.DGLGraph(nx.path_graph(10)) bg = dgl.batch([g, g1, g2]) - h0 = F.rand((bg.number_of_nodes(), 50)) + h0 = F.randn((bg.number_of_nodes(), 50)) h1 = st_enc_0(h0, bg) assert h1.shape == h0.shape h1 = st_enc_1(h0, bg) @@ -224,9 +224,9 @@ def test_edge_softmax(): for j in range(30): g.add_edge(i, j) - score = F.rand((900, 1)) + score = F.randn((900, 1)) score.requires_grad_() - grad = F.rand((900, 1)) + grad = F.randn((900, 1)) y = F.softmax(score.view(30, 30), dim=0).view(-1, 1) y.backward(grad) grad_score = score.grad From b86e8db81e324080c6977508f32f86e180f94d27 Mon Sep 17 00:00:00 2001 From: Song Date: Tue, 3 Sep 2019 16:05:02 +0800 Subject: [PATCH 16/32] Start implementing masked-mm kernel. Add base control flow code. 
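
This commit only wires up the control flow: message.py registers three new
masked-mm builtins (u_dot_v, u_dot_e, v_dot_e), a kDot op and its dispatch
cases are added to the kernel headers, and BinaryDot UDF stubs are added on
the CPU and CUDA sides (their ApplyEdge bodies are not implemented yet).
Assuming the finished builtins behave like the existing element-wise ones
(u_mul_v and friends), the intended Python-level usage is roughly:

    import torch as th
    import dgl
    import dgl.function as fn

    g = dgl.DGLGraph()
    g.add_nodes(3)
    g.add_edges([0, 0, 1], [1, 2, 2])
    g.ndata['h'] = th.randn(3, 4)

    # Per-edge dot product of source and destination features -- a "masked"
    # matrix multiply where only entries for existing edges are computed.
    g.apply_edges(fn.u_dot_v('h', 'h', 'score'))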
--- python/dgl/function/message.py | 11 ++++++ src/kernel/binary_reduce_common.h | 26 +++++++++++++++ src/kernel/binary_reduce_impl.h | 39 ++++++++++++++++++++++ src/kernel/cpu/binary_reduce_impl.h | 45 +++++++++++++++++++++++++ src/kernel/cuda/binary_reduce_impl.cuh | 46 ++++++++++++++++++++++++++ 5 files changed, 167 insertions(+) diff --git a/python/dgl/function/message.py b/python/dgl/function/message.py index 2627aa7fcc2b..91c98485546b 100644 --- a/python/dgl/function/message.py +++ b/python/dgl/function/message.py @@ -147,12 +147,16 @@ def copy_e(e, out): ############################################################################### # Generate all following builtin message functions: +# element-wise message functions: # u_add_v, u_sub_v, u_mul_v, u_div_v # u_add_e, u_sub_e, u_mul_e, u_div_e # v_add_u, v_sub_u, v_mul_u, v_div_u # v_add_e, v_sub_e, v_mul_e, v_div_e # e_add_u, e_sub_u, e_mul_u, e_div_u # e_add_v, e_sub_v, e_mul_v, e_div_v +# +# masked-mm message functions: +# u_dot_v, u_dot_e, v_dot_e _TARGET_MAP = { "u": TargetCode.SRC, @@ -205,6 +209,13 @@ def _register_builtin_message_func(): setattr(sys.modules[__name__], func.__name__, func) __all__.append(func.__name__) + """Register builtin masked-mm functions""" + for lhs, rhs in product(["u", "v"], ["v", "e"]): + if lhs != rhs: + for binary_op in ["dot"]: + func = _gen_message_builtin(lhs, rhs, binary_op) + setattr(sys.modules[__name__], func.__name__, func) + __all__.append(func.__name__) _register_builtin_message_func() diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index f7052d99c3c1..106242bafc6d 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -29,6 +29,7 @@ static const char kAdd[] = "add"; static const char kSub[] = "sub"; static const char kMul[] = "mul"; static const char kDiv[] = "div"; +static const char kDot[] = "dot"; static const char kUseLhs[] = "use_lhs"; /*! @@ -192,6 +193,19 @@ struct BinaryUseLhs { } }; +template +struct BinaryDot { + static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { + return lhs * rhs; + } + static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { + return 1; + } + static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) { + return 1; + } +}; + // Macro for dispatching op enum code and target code into template arguments. 
// The macro dispatches following combinations: // - Add(Src, Dst), Add(Src, Edge), Add(Dst, Edge) @@ -306,6 +320,18 @@ struct BinaryUseLhs { typedef SelectEdge LeftType; \ typedef SelectNone RightType; \ {__VA_ARGS__} \ + } else if (op == kDot && lhs == kSrc && rhs == kDst) { \ + typedef SelectSrc LeftType; \ + typedef SelectDst RightType; \ + {__VA_ARGS__} \ + } else if (op == kDot && lhs == kSrc && rhs == kEdge) { \ + typedef SelectSrc LeftType; \ + typedef SelectEdge RightType; \ + {__VA_ARGS__} \ + } else if (op == kDot && lhs == kDst && rhs == kEdge) { \ + typedef SelectDst LeftType; \ + typedef SelectEdge RightType; \ + {__VA_ARGS__} \ } else { \ LOG(FATAL) << "Unsupported operation: op=" << op \ << " lhs=" << lhs << " rhs=" << rhs; \ diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index f35fe7ef197c..19d132837aa1 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -83,6 +83,27 @@ void BinaryReduceImpl( // instruction level parallelism rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2); #endif + if (op == binary_op::kDot) { + // A dot B impl is different from others + if (reducer != binary_op::kReduceNone) { + // TODO(xiang song): Need Reduce for A Dot B? + LOG(FATAL) << "With Dot operation, Only None reduce is supported."; + } + + // Built in A dot B impl + const DLDataType& dtype = out_data->dtype; + const auto bits = graph.NumBits(); + DGL_DTYPE_SWITCH(dtype, DType, { + DGL_IDX_TYPE_SWITCH(bits, Idx, { + auto gdata = AllocGData>( + rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, + lhs_data, rhs_data, out_mapping, out_data); + OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, { + CallBinaryDot(rtcfg, graph, &gdata); + }); + }); + }); + } if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -182,6 +203,18 @@ void BackwardBinaryReduceImpl( const bool req_lhs = !utils::IsNoneArray(grad_lhs_data); const bool req_rhs = !utils::IsNoneArray(grad_rhs_data); const auto bits = graph.NumBits(); + + if (op == binary_op::kDot) { + // A dot B impl is different from others + if (reducer != binary_op::kReduceNone) { + // TODO(xiang song): Need Reduce for A Dot B? 
+ LOG(FATAL) << "With Dot operation, Only None reduce is supported."; + } + + // Built in A dot B impl + // (TODO: xiang song) To implement it + } + if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -280,6 +313,9 @@ void BinaryReduceBcastImpl( const DLDataType& dtype = out_data->dtype; const int bcast_ndim = info.out_shape.size(); const auto bits = graph.NumBits(); + if (op == binary_op::kDot) { + LOG(FATAL) << "dot operation is not allowed with broadcast"; + } if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -387,6 +423,9 @@ void BackwardBinaryReduceBcastImpl( const bool req_lhs = !utils::IsNoneArray(grad_lhs); const bool req_rhs = !utils::IsNoneArray(grad_rhs); const auto bits = graph.NumBits(); + if (op == binary_op::kDot) { + LOG(FATAL) << "dot operation is not allowed with broadcast"; + } if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index 7d9ba646180b..d8de1ca86992 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -53,6 +53,19 @@ struct BinaryReduce { } }; +// Minigun UDF to compute binary reduce. +template +struct BinaryDot { + static __device__ __forceinline__ bool CondEdge( + Idx src, Idx dst, Idx eid, GData* gdata) { + return true; + } + static __device__ __forceinline__ void ApplyEdge( + Idx src, Idx dst, Idx eid, GData* gdata) { + return true; + } +}; + // Convert flattened index to multi-dimension index (assume row-major). inline void Unravel(int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) { @@ -175,6 +188,38 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } +// Template implementation of BinaryReduce operator. +template +void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, + const CSRWrapper& graph, + GData* gdata) { + //For binary dot, it should be none reducer. + typedef cpu::FunctorsTempl, ReduceNone> + Functors; + typedef cpu::BinaryDot UDF; + // csr + auto outcsr = graph.GetOutCSRMatrix(); + minigun::Csr csr = utils::CreateCsr(outcsr.indptr, outcsr.indices); + // If the user-given mapping is none and the target is edge data, we need to + // replace the mapping by the edge ids in the csr graph so that the edge + // data is correctly read/written. + if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) { + gdata->lhs_mapping = static_cast(outcsr.data->data); + } + if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { + gdata->rhs_mapping = static_cast(outcsr.data->data); + } + if (OutSelector::Type::target == binary_op::kEdge + && gdata->out_mapping == nullptr) { + gdata->out_mapping = static_cast(outcsr.data->data); + } + // TODO(minjie): allocator + minigun::advance::Advance, UDF>( + rtcfg, csr, gdata, minigun::IntArray1D()); +} + // Template implementation of BinaryReduce broadcasting operator. template +struct BinaryDot { + static __device__ __forceinline__ bool CondEdge( + Idx src, Idx dst, Idx eid, GData* gdata) { + return true; + } + static __device__ __forceinline__ void ApplyEdge( + Idx src, Idx dst, Idx eid, GData* gdata) { + return true; + } +}; + // Convert flattened index to multi-dimension index (assume row-major). 
__device__ __forceinline__ void Unravel( int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) { @@ -178,6 +191,39 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } +// Template implementation of BinaryDot operator. +template +void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, + const CSRWrapper& graph, + GData* gdata) { + //For binary dot, it should be none reducer. + typedef cuda::FunctorsTempl, ReduceNone> + Functors; + typedef cuda::BinaryDot UDF; + + // csr + auto outcsr = graph.GetOutCSRMatrix(); + minigun::Csr csr = utils::CreateCsr(outcsr.indptr, outcsr.indices); + // If the user-given mapping is none and the target is edge data, we need to + // replace the mapping by the edge ids in the csr graph so that the edge + // data is correctly read/written. + if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) { + gdata->lhs_mapping = static_cast(outcsr.data->data); + } + if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { + gdata->rhs_mapping = static_cast(outcsr.data->data); + } + if (OutSelector::Type::target == binary_op::kEdge + && gdata->out_mapping == nullptr) { + gdata->out_mapping = static_cast(outcsr.data->data); + } + // TODO(minjie): allocator + minigun::advance::Advance, UDF>( + rtcfg, csr, gdata, minigun::IntArray1D()); +} + // Template implementation of BinaryReduce broadcasting operator. template Date: Tue, 3 Sep 2019 17:46:32 +0800 Subject: [PATCH 17/32] Add masked dot declare --- src/kernel/cpu/binary_masked_dot.cc | 26 ++++++++++++++++++++++++++ src/kernel/cpu/binary_reduce_impl.h | 9 +++++++++ src/kernel/cuda/binary_masked_dot.cu | 20 ++++++++++++++++++++ src/kernel/cuda/binary_reduce_impl.cuh | 9 +++++++++ 4 files changed, 64 insertions(+) create mode 100644 src/kernel/cpu/binary_masked_dot.cc create mode 100644 src/kernel/cuda/binary_masked_dot.cu diff --git a/src/kernel/cpu/binary_masked_dot.cc b/src/kernel/cpu/binary_masked_dot.cc new file mode 100644 index 000000000000..c1f24ca97bbc --- /dev/null +++ b/src/kernel/cpu/binary_masked_dot.cc @@ -0,0 +1,26 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file kernel/cpu/binary_masked_dot.cc + * \brief CPU kernels for binary reduce prod + */ +#include "./binary_reduce_impl.h" +#include "./backward_binary_reduce_impl.h" + +namespace dgl { +namespace kernel { + +#define REDUCER ReduceNone +#define XPU kDLCPU + +#define IDX int32_t +EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_MASKED_DOT_DEFINE); +EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); +#undef IDX + +#define IDX int64_t +EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_MASKED_DOT_DEFINE); +EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); +#undef IDX + +} // namespace kernel +} // namespace dgl diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index d8de1ca86992..3f2f7cf5c81a 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -263,6 +263,15 @@ void CallBinaryReduceBcast( const CSRWrapper& graph, \ GData* gdata); +// Following macro is used to generate explicit-specialization of the template +// operator. 
+#define GEN_MASKED_DOT_DEFINE(dtype, lhs_tgt, rhs_tgt) \ + template void CallBinaryMaskedDot( \ + const minigun::advance::RuntimeConfig& rtcfg, \ + const CSRWrapper& graph, \ + GData* gdata); + #define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \ template void CallBinaryReduceBcast* gdata); +// Following macro is used to generate explicit-specialization of the template +// operator. +#define GEN_MASKED_DOT_DEFINE(dtype, lhs_tgt, rhs_tgt) \ + template void CallBinaryMaskedDot( \ + const minigun::advance::RuntimeConfig& rtcfg, \ + const CSRWrapper& graph, \ + GData* gdata); + #define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \ template void CallBinaryReduceBcast Date: Tue, 3 Sep 2019 09:50:21 +0000 Subject: [PATCH 18/32] Update func/variable name --- src/kernel/binary_reduce_impl.h | 2 +- src/kernel/binary_reduce_impl_decl.h | 28 ++++++++++++++++++++++++++ src/kernel/cpu/binary_reduce_impl.h | 17 +++++++++------- src/kernel/cuda/binary_reduce_impl.cuh | 19 +++++++++-------- 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index 19d132837aa1..9f4e86eed594 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -99,7 +99,7 @@ void BinaryReduceImpl( rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, lhs_data, rhs_data, out_mapping, out_data); OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, { - CallBinaryDot(rtcfg, graph, &gdata); + CallBinaryMaskedDot(rtcfg, graph, &gdata); }); }); }); diff --git a/src/kernel/binary_reduce_impl_decl.h b/src/kernel/binary_reduce_impl_decl.h index f353ba86bc62..4701220b02b0 100644 --- a/src/kernel/binary_reduce_impl_decl.h +++ b/src/kernel/binary_reduce_impl_decl.h @@ -82,6 +82,34 @@ void CallBinaryReduce( const CSRWrapper& graph, GData* gdata); +/*! + * \brief Template declaration for BinaryMaskedDot operator. + * + * LeftSelector and RightSelector must be one of the four operand target + * categories. + * + * The implementation of this template is device-dependent + * (see kernel/xpu/binary_reduce_impl.(cu)h). + * + * See definitions in binary_reduce_common.h + * + * \tparam XPU the device flag + * \tparam Idx type of node/edge index (e.g. int32_t, int64_t) + * \tparam DType type of the feature data (e.g. float32) + * \tparam LeftSelect lhs category type + * \tparam RightSelect rhs category type + * \param rtcfg Runtime configuration used by miningun + * \param graph The graph object. + * \param gdata The feature and mapping data used by the computation. + */ +template +void CallBinaryReduce( + const minigun::advance::RuntimeConfig& rtcfg, + const CSRWrapper& graph, + GData* gdata); + + /*! * \brief Template declaration for common logics shared by different devices. * diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index d8de1ca86992..de9f12e1e0ed 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -55,7 +55,7 @@ struct BinaryReduce { // Minigun UDF to compute binary reduce. template -struct BinaryDot { +struct BinaryMaskedDot { static __device__ __forceinline__ bool CondEdge( Idx src, Idx dst, Idx eid, GData* gdata) { return true; @@ -188,17 +188,17 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } -// Template implementation of BinaryReduce operator. +// Template implementation of BinaryMaksedDot operator. 
template -void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, +void CallBinaryMaskedDot(const minigun::advance::RuntimeConfig& rtcfg, const CSRWrapper& graph, GData* gdata) { //For binary dot, it should be none reducer. typedef cpu::FunctorsTempl, ReduceNone> + RightSelector, BinaryDot, ReduceNone> Functors; - typedef cpu::BinaryDot UDF; + typedef cpu::BinaryMaskedDot UDF; // csr auto outcsr = graph.GetOutCSRMatrix(); minigun::Csr csr = utils::CreateCsr(outcsr.indptr, outcsr.indices); @@ -211,8 +211,11 @@ void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { gdata->rhs_mapping = static_cast(outcsr.data->data); } - if (OutSelector::Type::target == binary_op::kEdge - && gdata->out_mapping == nullptr) { + + // For Masked Matrix Multiply, the output target should be edge. + // If the user-given mapping is none, we need to replace the mapping by the + // edge ids in the csr graph. + if (gdata->out_mapping == nullptr) { gdata->out_mapping = static_cast(outcsr.data->data); } // TODO(minjie): allocator diff --git a/src/kernel/cuda/binary_reduce_impl.cuh b/src/kernel/cuda/binary_reduce_impl.cuh index 3b21156b9c4b..013483f4255b 100644 --- a/src/kernel/cuda/binary_reduce_impl.cuh +++ b/src/kernel/cuda/binary_reduce_impl.cuh @@ -56,14 +56,14 @@ struct BinaryReduce { // Minigun UDF to compute binary reduce. template -struct BinaryDot { +struct BinaryMaskedDot { static __device__ __forceinline__ bool CondEdge( Idx src, Idx dst, Idx eid, GData* gdata) { return true; } static __device__ __forceinline__ void ApplyEdge( Idx src, Idx dst, Idx eid, GData* gdata) { - return true; + return; } }; @@ -191,17 +191,17 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } -// Template implementation of BinaryDot operator. +// Template implementation of BinaryMaskedDot operator. template -void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, +void CallBinaryMaskedDot(const minigun::advance::RuntimeConfig& rtcfg, const CSRWrapper& graph, GData* gdata) { //For binary dot, it should be none reducer. typedef cuda::FunctorsTempl, ReduceNone> + RightSelector, BinaryDot, ReduceNone> Functors; - typedef cuda::BinaryDot UDF; + typedef cuda::BinaryMaskedDot UDF; // csr auto outcsr = graph.GetOutCSRMatrix(); @@ -215,8 +215,11 @@ void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { gdata->rhs_mapping = static_cast(outcsr.data->data); } - if (OutSelector::Type::target == binary_op::kEdge - && gdata->out_mapping == nullptr) { + + // For Masked Matrix Multiply, the output target should be edge. + // If the user-given mapping is none, we need to replace the mapping by the + // edge ids in the csr graph. 
+ if (gdata->out_mapping == nullptr) { gdata->out_mapping = static_cast(outcsr.data->data); } // TODO(minjie): allocator From d8825994501bca299113f0d7791314ecdf85371c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 3 Sep 2019 13:41:56 +0000 Subject: [PATCH 19/32] Skeleton compile OK --- src/kernel/binary_reduce_common.h | 3 +++ src/kernel/binary_reduce_impl.h | 3 +++ src/kernel/binary_reduce_impl_decl.h | 2 +- src/kernel/cpu/binary_reduce_impl.h | 4 ++-- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index 106242bafc6d..779b7eda54e3 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -321,14 +321,17 @@ struct BinaryDot { typedef SelectNone RightType; \ {__VA_ARGS__} \ } else if (op == kDot && lhs == kSrc && rhs == kDst) { \ + typedef BinaryDot OpType; \ typedef SelectSrc LeftType; \ typedef SelectDst RightType; \ {__VA_ARGS__} \ } else if (op == kDot && lhs == kSrc && rhs == kEdge) { \ + typedef BinaryDot OpType; \ typedef SelectSrc LeftType; \ typedef SelectEdge RightType; \ {__VA_ARGS__} \ } else if (op == kDot && lhs == kDst && rhs == kEdge) { \ + typedef BinaryDot OpType; \ typedef SelectDst LeftType; \ typedef SelectEdge RightType; \ {__VA_ARGS__} \ diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index 9f4e86eed594..361fc6387303 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -103,6 +103,8 @@ void BinaryReduceImpl( }); }); }); + + return; } if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide @@ -213,6 +215,7 @@ void BackwardBinaryReduceImpl( // Built in A dot B impl // (TODO: xiang song) To implement it + return; } if (reducer == binary_op::kReduceMean) { diff --git a/src/kernel/binary_reduce_impl_decl.h b/src/kernel/binary_reduce_impl_decl.h index 4701220b02b0..f0d7822adfaa 100644 --- a/src/kernel/binary_reduce_impl_decl.h +++ b/src/kernel/binary_reduce_impl_decl.h @@ -104,7 +104,7 @@ void CallBinaryReduce( */ template -void CallBinaryReduce( +void CallBinaryMaskedDot( const minigun::advance::RuntimeConfig& rtcfg, const CSRWrapper& graph, GData* gdata); diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index 76abbefa6eee..d657d7404137 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -56,11 +56,11 @@ struct BinaryReduce { // Minigun UDF to compute binary reduce. template struct BinaryMaskedDot { - static __device__ __forceinline__ bool CondEdge( + static inline bool CondEdge( Idx src, Idx dst, Idx eid, GData* gdata) { return true; } - static __device__ __forceinline__ void ApplyEdge( + static inline void ApplyEdge( Idx src, Idx dst, Idx eid, GData* gdata) { return true; } From 190102bbcc21a950d4085b3458677e93655fbcb6 Mon Sep 17 00:00:00 2001 From: Song Date: Wed, 4 Sep 2019 15:02:36 +0800 Subject: [PATCH 20/32] Update Implement. 
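
This revision drops the dedicated masked-dot kernel introduced in the previous patches and instead routes the dot operator through the generic binary-reduce templates: a per-position vector length (data_len) is threaded through GData/BcastGData, every binary functor now receives operand pointers plus that length, and the element-wise ops simply read element 0 while BinaryDot contracts over all data_len entries. A rough NumPy reference of the intended per-edge semantics, written only to make the data_len idea concrete (array names and shapes are illustrative and not part of the C++ interface):

    import numpy as np

    def edge_dot_reference(h_src, h_dst, edges):
        # For each edge (u, v), contract the trailing feature axis of length
        # data_len: out[e] = sum_k h_src[u, ..., k] * h_dst[v, ..., k].
        return np.stack([(h_src[u] * h_dst[v]).sum(axis=-1) for u, v in edges])

    def edge_add_reference(h_src, h_dst, edges):
        # Element-wise ops go through the same template with data_len == 1,
        # so the feature shape is left untouched.
        return np.stack([h_src[u] + h_dst[v] for u, v in edges])

    h = np.random.randn(3, 2, 4)        # 3 nodes, 2 heads, data_len = 4
    edges = [(0, 1), (0, 2), (1, 2)]
    print(edge_dot_reference(h, h, edges).shape)   # (3, 2): data_len axis contracted
    print(edge_add_reference(h, h, edges).shape)   # (3, 2, 4): unchanged
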
Unify BinaryDot with BinaryReduce --- python/dgl/backend/mxnet/tensor.py | 2 +- python/dgl/backend/pytorch/tensor.py | 2 +- python/dgl/kernel.py | 4 +- src/kernel/binary_reduce.cc | 26 +++++-- src/kernel/binary_reduce.h | 2 + src/kernel/binary_reduce_common.h | 34 +++++---- src/kernel/binary_reduce_impl.h | 58 ++++------------ src/kernel/binary_reduce_impl_decl.h | 32 ++------- src/kernel/cpu/binary_masked_dot.cc | 4 +- src/kernel/cpu/binary_reduce_impl.h | 88 ++++-------------------- src/kernel/cuda/binary_masked_dot.cu | 2 +- src/kernel/cuda/binary_reduce_impl.cuh | 95 +++++--------------------- 12 files changed, 99 insertions(+), 250 deletions(-) diff --git a/python/dgl/backend/mxnet/tensor.py b/python/dgl/backend/mxnet/tensor.py index 63c5c6df4841..d2d38eab0646 100644 --- a/python/dgl/backend/mxnet/tensor.py +++ b/python/dgl/backend/mxnet/tensor.py @@ -363,7 +363,7 @@ def __init__(self, reducer, binary_op, graph, lhs, rhs, out_size, lhs_map, def forward(self, lhs_data, rhs_data): lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) - feat_shape = K.infer_binary_feature_shape(lhs_data_nd, rhs_data_nd) + feat_shape = K.infer_binary_feature_shape(self.binary_op, lhs_data_nd, rhs_data_nd) out_data = nd.empty((self.out_size,) + feat_shape, ctx=lhs_data.context, dtype=lhs_data.dtype) out_data_nd = zerocopy_to_dgl_ndarray_for_write(out_data) diff --git a/python/dgl/backend/pytorch/tensor.py b/python/dgl/backend/pytorch/tensor.py index 3b2f392cbf55..bc75ed5f0cdb 100644 --- a/python/dgl/backend/pytorch/tensor.py +++ b/python/dgl/backend/pytorch/tensor.py @@ -285,7 +285,7 @@ def forward(ctx, reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, out_size, lhs_map, rhs_map, out_map): lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) - feat_shape = K.infer_binary_feature_shape(lhs_data_nd, rhs_data_nd) + feat_shape = K.infer_binary_feature_shape(binary_op, lhs_data_nd, rhs_data_nd) out_data = lhs_data.new_empty((out_size,) + feat_shape) out_data_nd = zerocopy_to_dgl_ndarray(out_data) K.binary_op_reduce( diff --git a/python/dgl/kernel.py b/python/dgl/kernel.py index b6314e360d39..7abe2d38bf4e 100644 --- a/python/dgl/kernel.py +++ b/python/dgl/kernel.py @@ -4,7 +4,7 @@ from ._ffi.function import _init_api from .ndarray import empty -def infer_binary_feature_shape(lhs, rhs): +def infer_binary_feature_shape(op, lhs, rhs): """Infer the output feature shape after a binary operation between lhs and rhs. Parameter @@ -19,7 +19,7 @@ def infer_binary_feature_shape(lhs, rhs): tuple of int The output feature shape. """ - ret = _CAPI_DGLKernelInferBinaryFeatureShape(lhs, rhs) + ret = _CAPI_DGLKernelInferBinaryFeatureShape(op, lhs, rhs) return tuple(ret.asnumpy()) # pylint: disable=invalid-name diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index 0adf1e7af2b7..265626f63b3e 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -92,9 +92,20 @@ bool HasBcast(NDArray lhs, NDArray rhs) { // e.g. 
(4, 1, 3, 3) and (4, 5, 3, 3) become (4, 1, 9) and (4, 5, 9) // // See also: BcastInfo (kernel/binary_reduce.h) -BcastInfo CalcBcastInfo(NDArray lhs, NDArray rhs) { +BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { BcastInfo ret; const int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1; + // for dot operation: vector [dot] vector + // lhs_shape[ndim-1] == rhs_shape[ndim-1] = sizeof(vector) + // out_shape[ndim-1] = 1 + if (op == binary_op::kDot) { + //get size of vector + ret.data_len = lhs->shape[lhs->ndim - 1]; + --max_ndim; + } else {//op != binary_op::kDot + ret.data_len = 1; + } + int64_t accum = 0; for (int j = 0; j < max_ndim; ++j) { const int dl = (lhs->ndim - 1 - j < 1)? 1 : lhs->shape[lhs->ndim - 1 - j]; @@ -239,9 +250,10 @@ std::vector InferBinaryFeatureShape( DGL_REGISTER_GLOBAL("kernel._CAPI_DGLKernelInferBinaryFeatureShape") .set_body([] (DGLArgs args, DGLRetValue* rv) { - NDArray lhs = args[0]; - NDArray rhs = args[1]; - const auto& shape = InferBinaryFeatureShape(lhs, rhs); + std::string op = args[0]; + NDArray lhs = args[1]; + NDArray rhs = args[2]; + const auto& shape = InferBinaryFeatureShape(op, lhs, rhs); const int64_t len = shape.size(); NDArray ret = NDArray::Empty( {len}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); @@ -274,7 +286,7 @@ void BinaryOpReduce( rhs_mapping, lhs_mapping, out_mapping); } else { if (HasBcast(lhs_data, rhs_data)) { - BcastInfo info = CalcBcastInfo(lhs_data, rhs_data); + BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data); DGL_XPU_SWITCH(ctx.device_type, BinaryReduceBcastImpl, info, reducer, op, graph, lhs, rhs, @@ -348,7 +360,7 @@ void BackwardLhsBinaryOpReduce( grad_out_data, grad_lhs_data); } else { if (HasBcast(lhs_data, rhs_data)) { - BcastInfo info = CalcBcastInfo(lhs_data, rhs_data); + BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data); DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceBcastImpl, info, reducer, op, graph, lhs, rhs, @@ -424,7 +436,7 @@ void BackwardRhsBinaryOpReduce( grad_out_data, grad_rhs_data); } else { if (HasBcast(lhs_data, rhs_data)) { - BcastInfo info = CalcBcastInfo(lhs_data, rhs_data); + BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data); DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceBcastImpl, info, reducer, op, graph, lhs, rhs, diff --git a/src/kernel/binary_reduce.h b/src/kernel/binary_reduce.h index 7f5d359df30d..6190bdd3f826 100644 --- a/src/kernel/binary_reduce.h +++ b/src/kernel/binary_reduce.h @@ -30,6 +30,8 @@ struct BcastInfo { std::vector lhs_shape, lhs_stride; std::vector rhs_shape, rhs_stride; std::vector out_shape, out_stride; + + int64_t data_len; }; /* diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index 779b7eda54e3..a1cd914be03f 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -130,8 +130,8 @@ struct SwitchSrcDst { // common binary functors template struct BinaryAdd { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs + rhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + return lhs[0] + rhs[0]; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { return 1; @@ -143,8 +143,8 @@ struct BinaryAdd { template struct BinaryMul { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs * rhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + return lhs[0] * rhs[0]; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType 
out) { return rhs; @@ -156,8 +156,8 @@ struct BinaryMul { template struct BinarySub { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs - rhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + return lhs[0] - rhs[0]; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { return 1; @@ -169,8 +169,8 @@ struct BinarySub { template struct BinaryDiv { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs / rhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + return lhs[0] / rhs[0]; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { return static_cast(1) / rhs; @@ -182,8 +182,8 @@ struct BinaryDiv { template struct BinaryUseLhs { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + return lhs[0]; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { return 1; @@ -195,8 +195,13 @@ struct BinaryUseLhs { template struct BinaryDot { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs * rhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + Dtype out = 0; + //simple vector dot vector + for (int i = 0; i < len; i ++) + out += lhs[i] * rhs[i]; + + return out; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { return 1; @@ -362,7 +367,10 @@ struct BinaryDot { MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDiv)) \ MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectDst, BinaryDiv)) \ MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectNone, BinaryUseLhs)) \ - MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectNone, BinaryUseLhs)) + MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectNone, BinaryUseLhs)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryDot)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryDot)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDot)) ////////////////////////////////////////////////////////////////////////// // Defines reducer category. Each category is an empty structure. diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index 361fc6387303..5f64f2dc6ea9 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -28,7 +28,7 @@ namespace kernel { /////////////////////////////////////////////////////////////////////////////// template -GData AllocGData( +GData AllocGData(const std::string& op, const DLContext& ctx, int64_t x_len, runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray lhs_data, runtime::NDArray rhs_data, @@ -49,6 +49,15 @@ GData AllocGData( if (!utils::IsNoneArray(out_mapping)) { gdata.out_mapping = static_cast(out_mapping->data); } + + // for dot operation: vector [dot] vector + if (op == binary::kDot) { + //get size of vector + gdata.data_len = lhs_data->shape[lhs_data->ndim - 1]; + } else { + gdata.data_len = 1; + } + // fill out data with zero values utils::Fill(ctx, gdata.out_data, utils::NElements(out_data), Zero::value); return gdata; @@ -83,29 +92,6 @@ void BinaryReduceImpl( // instruction level parallelism rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2); #endif - if (op == binary_op::kDot) { - // A dot B impl is different from others - if (reducer != binary_op::kReduceNone) { - // TODO(xiang song): Need Reduce for A Dot B? 
- LOG(FATAL) << "With Dot operation, Only None reduce is supported."; - } - - // Built in A dot B impl - const DLDataType& dtype = out_data->dtype; - const auto bits = graph.NumBits(); - DGL_DTYPE_SWITCH(dtype, DType, { - DGL_IDX_TYPE_SWITCH(bits, Idx, { - auto gdata = AllocGData>( - rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, - lhs_data, rhs_data, out_mapping, out_data); - OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, { - CallBinaryMaskedDot(rtcfg, graph, &gdata); - }); - }); - }); - - return; - } if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -115,7 +101,7 @@ void BinaryReduceImpl( DGL_DTYPE_SWITCH(dtype, DType, { DGL_IDX_TYPE_SWITCH(bits, Idx, { REDUCER_SWITCH(reducer, XPU, DType, Reducer, { - auto gdata = AllocGData( + auto gdata = AllocGData(op, rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, lhs_data, rhs_data, out_mapping, out_data); OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, { @@ -206,18 +192,6 @@ void BackwardBinaryReduceImpl( const bool req_rhs = !utils::IsNoneArray(grad_rhs_data); const auto bits = graph.NumBits(); - if (op == binary_op::kDot) { - // A dot B impl is different from others - if (reducer != binary_op::kReduceNone) { - // TODO(xiang song): Need Reduce for A Dot B? - LOG(FATAL) << "With Dot operation, Only None reduce is supported."; - } - - // Built in A dot B impl - // (TODO: xiang song) To implement it - return; - } - if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -276,6 +250,8 @@ BcastGData AllocBcastGData( if (!utils::IsNoneArray(out_mapping)) { gdata.out_mapping = static_cast(out_mapping->data); } + + gdata.data_len = info.data_len; // fill out data with zero values utils::Fill(ctx, gdata.out_data, utils::NElements(out_data), Zero::value); return gdata; @@ -316,9 +292,7 @@ void BinaryReduceBcastImpl( const DLDataType& dtype = out_data->dtype; const int bcast_ndim = info.out_shape.size(); const auto bits = graph.NumBits(); - if (op == binary_op::kDot) { - LOG(FATAL) << "dot operation is not allowed with broadcast"; - } + if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -426,9 +400,7 @@ void BackwardBinaryReduceBcastImpl( const bool req_lhs = !utils::IsNoneArray(grad_lhs); const bool req_rhs = !utils::IsNoneArray(grad_rhs); const auto bits = graph.NumBits(); - if (op == binary_op::kDot) { - LOG(FATAL) << "dot operation is not allowed with broadcast"; - } + if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; diff --git a/src/kernel/binary_reduce_impl_decl.h b/src/kernel/binary_reduce_impl_decl.h index f0d7822adfaa..5c799372d7e4 100644 --- a/src/kernel/binary_reduce_impl_decl.h +++ b/src/kernel/binary_reduce_impl_decl.h @@ -36,6 +36,8 @@ template struct GData { // length along x(feature) dimension int64_t x_length{0}; + // size of data, can be single value or a vector + int64_t data_len; // number of rows of the output tensor int64_t out_size{0}; // input data @@ -82,34 +84,6 @@ void CallBinaryReduce( const CSRWrapper& graph, GData* gdata); -/*! - * \brief Template declaration for BinaryMaskedDot operator. - * - * LeftSelector and RightSelector must be one of the four operand target - * categories. - * - * The implementation of this template is device-dependent - * (see kernel/xpu/binary_reduce_impl.(cu)h). 
- * - * See definitions in binary_reduce_common.h - * - * \tparam XPU the device flag - * \tparam Idx type of node/edge index (e.g. int32_t, int64_t) - * \tparam DType type of the feature data (e.g. float32) - * \tparam LeftSelect lhs category type - * \tparam RightSelect rhs category type - * \param rtcfg Runtime configuration used by miningun - * \param graph The graph object. - * \param gdata The feature and mapping data used by the computation. - */ -template -void CallBinaryMaskedDot( - const minigun::advance::RuntimeConfig& rtcfg, - const CSRWrapper& graph, - GData* gdata); - - /*! * \brief Template declaration for common logics shared by different devices. * @@ -250,6 +224,8 @@ struct BcastGData { int64_t lhs_len{0}, rhs_len{0}; int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0}; int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0}; + // size of data, can be single value or a vector + int64_t data_len; // input data DType *lhs_data{nullptr}, *rhs_data{nullptr}; // input id mappings diff --git a/src/kernel/cpu/binary_masked_dot.cc b/src/kernel/cpu/binary_masked_dot.cc index c1f24ca97bbc..fcbd5284211c 100644 --- a/src/kernel/cpu/binary_masked_dot.cc +++ b/src/kernel/cpu/binary_masked_dot.cc @@ -13,12 +13,12 @@ namespace kernel { #define XPU kDLCPU #define IDX int32_t -EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_MASKED_DOT_DEFINE); +EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE); EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); #undef IDX #define IDX int64_t -EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_MASKED_DOT_DEFINE); +EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE); EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); #undef IDX diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index d657d7404137..fe259cf1dfdb 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -29,6 +29,7 @@ struct BinaryReduce { static inline void ApplyEdge( Idx src, Idx dst, Idx eid, GData* gdata) { const int64_t D = gdata->x_length; + const int64_t len = gdata->data_len Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -41,31 +42,16 @@ struct BinaryReduce { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * D; - DType* rhsoff = gdata->rhs_data + rid * D; + DType* lhsoff = gdata->lhs_data + lid * D * len; + DType* rhsoff = gdata->rhs_data + rid * D * len; DType* outoff = gdata->out_data + oid * D; for (int64_t tx = 0; tx < D; ++tx) { - DType lhs = Functors::Read(lhsoff + tx); - DType rhs = Functors::Read(rhsoff + tx); - DType out = Functors::Op(lhs, rhs); + DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); Functors::Write(outoff + tx, out); } } }; -// Minigun UDF to compute binary reduce. -template -struct BinaryMaskedDot { - static inline bool CondEdge( - Idx src, Idx dst, Idx eid, GData* gdata) { - return true; - } - static inline void ApplyEdge( - Idx src, Idx dst, Idx eid, GData* gdata) { - return true; - } -}; - // Convert flattened index to multi-dimension index (assume row-major). 
inline void Unravel(int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) { @@ -93,6 +79,7 @@ struct BinaryReduceBcast { } static inline void ApplyEdge( Idx src, Idx dst, Idx eid, BcastGData* gdata) { + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -105,17 +92,17 @@ struct BinaryReduceBcast { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len; - DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len; + DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; //data with len size + DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; DType* outoff = gdata->out_data + oid * gdata->out_len; int64_t tmp[NDim]; // store unraveled idx. for (int64_t tx = 0; tx < gdata->out_len; ++tx) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); - DType lhs = Functors::Read(lhsoff + - Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride)); - DType rhs = Functors::Read(rhsoff + - Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride)); - DType out = Functors::Op(lhs, rhs); + DType out = Functors::Op( + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + len); + Functors::Write(outoff + tx, out); } } @@ -138,11 +125,8 @@ struct FunctorsTempl { Idx src, Idx edge, Idx dst) { return RightSelector::Call(src, edge, dst); } - static inline DType Op(DType lhs, DType rhs) { - return BinaryOp::Call(lhs, rhs); - } - static inline DType Read(DType* addr) { - return *addr; + static inline Dtype Op(DType *lhs, Dtype *rhs, int64_t len) { + return BinaryOp::Call(lhs, rhs, len); } static inline void Write(DType* addr, DType val) { Reducer::Call(addr, val); @@ -188,41 +172,6 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } -// Template implementation of BinaryMaksedDot operator. -template -void CallBinaryMaskedDot(const minigun::advance::RuntimeConfig& rtcfg, - const CSRWrapper& graph, - GData* gdata) { - //For binary dot, it should be none reducer. - typedef cpu::FunctorsTempl, ReduceNone> - Functors; - typedef cpu::BinaryMaskedDot UDF; - // csr - auto outcsr = graph.GetOutCSRMatrix(); - minigun::Csr csr = utils::CreateCsr(outcsr.indptr, outcsr.indices); - // If the user-given mapping is none and the target is edge data, we need to - // replace the mapping by the edge ids in the csr graph so that the edge - // data is correctly read/written. - if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) { - gdata->lhs_mapping = static_cast(outcsr.data->data); - } - if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { - gdata->rhs_mapping = static_cast(outcsr.data->data); - } - - // For Masked Matrix Multiply, the output target should be edge. - // If the user-given mapping is none, we need to replace the mapping by the - // edge ids in the csr graph. - if (gdata->out_mapping == nullptr) { - gdata->out_mapping = static_cast(outcsr.data->data); - } - // TODO(minjie): allocator - minigun::advance::Advance, UDF>( - rtcfg, csr, gdata, minigun::IntArray1D()); -} - // Template implementation of BinaryReduce broadcasting operator. 
template * gdata); -// Following macro is used to generate explicit-specialization of the template -// operator. -#define GEN_MASKED_DOT_DEFINE(dtype, lhs_tgt, rhs_tgt) \ - template void CallBinaryMaskedDot( \ - const minigun::advance::RuntimeConfig& rtcfg, \ - const CSRWrapper& graph, \ - GData* gdata); - #define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \ template void CallBinaryReduceBcast* gdata) { const int64_t D = gdata->x_length; int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; - int stride_x = blockDim.x * gridDim.x; + const int64_t stride_x = blockDim.x * gridDim.x; + const int64_t len = gdata->data_len Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -41,32 +42,17 @@ struct BinaryReduce { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * D; - DType* rhsoff = gdata->rhs_data + rid * D; + DType* lhsoff = gdata->lhs_data + lid * D * len; + DType* rhsoff = gdata->rhs_data + rid * D * len; DType* outoff = gdata->out_data + oid * D; while (tx < D) { - DType lhs = Functors::Read(lhsoff + tx); - DType rhs = Functors::Read(rhsoff + tx); - DType out = Functors::Op(lhs, rhs); + DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); Functors::Write(outoff + tx, out); tx += stride_x; } } }; -// Minigun UDF to compute binary reduce. -template -struct BinaryMaskedDot { - static __device__ __forceinline__ bool CondEdge( - Idx src, Idx dst, Idx eid, GData* gdata) { - return true; - } - static __device__ __forceinline__ void ApplyEdge( - Idx src, Idx dst, Idx eid, GData* gdata) { - return; - } -}; - // Convert flattened index to multi-dimension index (assume row-major). __device__ __forceinline__ void Unravel( int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) { @@ -95,7 +81,8 @@ struct BinaryReduceBcast { static __device__ __forceinline__ void ApplyEdge( Idx src, Idx dst, Idx eid, BcastGData* gdata) { int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; - int stride_x = blockDim.x * gridDim.x; + constint64_t stride_x = blockDim.x * gridDim.x; + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -108,17 +95,17 @@ struct BinaryReduceBcast { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len; - DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len; - DType* outoff = gdata->out_data + oid * gdata->out_len; + DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; //data with len size + DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; + DType* outoff = gdata->out_data + oid * gdata->out_len * len; int64_t tmp[NDim]; // store unraveled idx. 
while (tx < gdata->out_len) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); - DType lhs = Functors::Read(lhsoff + - Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride)); - DType rhs = Functors::Read(rhsoff + - Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride)); - DType out = Functors::Op(lhs, rhs); + DType out = Functors::Op( + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + len); + Functors::Write(outoff + tx, out); tx += stride_x; } @@ -142,11 +129,8 @@ struct FunctorsTempl { Idx src, Idx edge, Idx dst) { return RightSelector::Call(src, edge, dst); } - static __device__ __forceinline__ DType Op(DType lhs, DType rhs) { - return BinaryOp::Call(lhs, rhs); - } - static __device__ __forceinline__ DType Read(DType* addr) { - return LDGReader::Call(addr); + static __device__ __forceinline__ DType Op(DType *lhs, DType *rhs, int64_t len) { + return BinaryOp::Call(lhs, rhs, len); } static __device__ __forceinline__ void Write(DType* addr, DType val) { Reducer::Call(addr, val); @@ -191,42 +175,6 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } -// Template implementation of BinaryMaskedDot operator. -template -void CallBinaryMaskedDot(const minigun::advance::RuntimeConfig& rtcfg, - const CSRWrapper& graph, - GData* gdata) { - //For binary dot, it should be none reducer. - typedef cuda::FunctorsTempl, ReduceNone> - Functors; - typedef cuda::BinaryMaskedDot UDF; - - // csr - auto outcsr = graph.GetOutCSRMatrix(); - minigun::Csr csr = utils::CreateCsr(outcsr.indptr, outcsr.indices); - // If the user-given mapping is none and the target is edge data, we need to - // replace the mapping by the edge ids in the csr graph so that the edge - // data is correctly read/written. - if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) { - gdata->lhs_mapping = static_cast(outcsr.data->data); - } - if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { - gdata->rhs_mapping = static_cast(outcsr.data->data); - } - - // For Masked Matrix Multiply, the output target should be edge. - // If the user-given mapping is none, we need to replace the mapping by the - // edge ids in the csr graph. - if (gdata->out_mapping == nullptr) { - gdata->out_mapping = static_cast(outcsr.data->data); - } - // TODO(minjie): allocator - minigun::advance::Advance, UDF>( - rtcfg, csr, gdata, minigun::IntArray1D()); -} - // Template implementation of BinaryReduce broadcasting operator. template * gdata); -// Following macro is used to generate explicit-specialization of the template -// operator. 
-#define GEN_MASKED_DOT_DEFINE(dtype, lhs_tgt, rhs_tgt) \ - template void CallBinaryMaskedDot( \ - const minigun::advance::RuntimeConfig& rtcfg, \ - const CSRWrapper& graph, \ - GData* gdata); - #define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \ template void CallBinaryReduceBcast Date: Wed, 4 Sep 2019 21:35:27 +0800 Subject: [PATCH 21/32] New Impl of x_dot_x, reuse binary reduce template --- python/dgl/backend/mxnet/tensor.py | 2 +- python/dgl/backend/pytorch/tensor.py | 2 +- src/kernel/binary_reduce.cc | 4 +- src/kernel/binary_reduce_common.h | 4 +- src/kernel/cpu/backward_binary_reduce_impl.h | 51 +++++++++++-------- src/kernel/cpu/binary_reduce_impl.h | 2 +- .../cuda/backward_binary_reduce_impl.cuh | 51 +++++++++++-------- src/kernel/cuda/binary_reduce_impl.cuh | 2 +- 8 files changed, 67 insertions(+), 51 deletions(-) diff --git a/python/dgl/backend/mxnet/tensor.py b/python/dgl/backend/mxnet/tensor.py index d2d38eab0646..07eee54ec809 100644 --- a/python/dgl/backend/mxnet/tensor.py +++ b/python/dgl/backend/mxnet/tensor.py @@ -364,7 +364,7 @@ def forward(self, lhs_data, rhs_data): lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) feat_shape = K.infer_binary_feature_shape(self.binary_op, lhs_data_nd, rhs_data_nd) - out_data = nd.empty((self.out_size,) + feat_shape, + out_data = nd.empty((self.out_size,) + feat_shape[:-1], ctx=lhs_data.context, dtype=lhs_data.dtype) out_data_nd = zerocopy_to_dgl_ndarray_for_write(out_data) K.binary_op_reduce( diff --git a/python/dgl/backend/pytorch/tensor.py b/python/dgl/backend/pytorch/tensor.py index bc75ed5f0cdb..a9b089f3600c 100644 --- a/python/dgl/backend/pytorch/tensor.py +++ b/python/dgl/backend/pytorch/tensor.py @@ -286,7 +286,7 @@ def forward(ctx, reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) feat_shape = K.infer_binary_feature_shape(binary_op, lhs_data_nd, rhs_data_nd) - out_data = lhs_data.new_empty((out_size,) + feat_shape) + out_data = lhs_data.new_empty((out_size,) + feat_shape[:-1]) out_data_nd = zerocopy_to_dgl_ndarray(out_data) K.binary_op_reduce( reducer, binary_op, graph, lhs, rhs, lhs_data_nd, rhs_data_nd, diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index 265626f63b3e..9cfef5a9c834 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -140,6 +140,7 @@ BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { accum = 0; } std::reverse(ret.real_out_shape.begin(), ret.real_out_shape.end()); + ret.real_out_shape.push_back(ret.data_len); std::reverse(ret.lhs_shape.begin(), ret.lhs_shape.end()); std::reverse(ret.rhs_shape.begin(), ret.rhs_shape.end()); std::reverse(ret.out_shape.begin(), ret.out_shape.end()); @@ -243,9 +244,10 @@ class ImmutableGraphCSRWrapper : public CSRWrapper { std::vector InferBinaryFeatureShape( + const std::string& op, NDArray lhs, NDArray rhs) { - return CalcBcastInfo(lhs, rhs).real_out_shape; + return CalcBcastInfo(op, lhs, rhs).real_out_shape; } DGL_REGISTER_GLOBAL("kernel._CAPI_DGLKernelInferBinaryFeatureShape") diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index a1cd914be03f..ba94a869f434 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -204,10 +204,10 @@ struct BinaryDot { return out; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { - return 1; + return rhs; } static DGLDEVICE 
DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) { - return 1; + return lhs; } }; diff --git a/src/kernel/cpu/backward_binary_reduce_impl.h b/src/kernel/cpu/backward_binary_reduce_impl.h index 1db1e5448c5d..b72ba5b6bdaa 100644 --- a/src/kernel/cpu/backward_binary_reduce_impl.h +++ b/src/kernel/cpu/backward_binary_reduce_impl.h @@ -27,6 +27,7 @@ struct BackwardBinaryReduce { static inline void ApplyEdge( Idx src, Idx dst, Idx eid, BackwardGData* gdata) { const int64_t D = gdata->x_length; + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -39,28 +40,30 @@ struct BackwardBinaryReduce { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * D; - DType* rhsoff = gdata->rhs_data + rid * D; + DType* lhsoff = gdata->lhs_data + lid * D * len; + DType* rhsoff = gdata->rhs_data + rid * D * len; DType* outoff = gdata->out_data + oid * D; - DType* gradlhsoff = gdata->grad_lhs_data + lid * D; - DType* gradrhsoff = gdata->grad_rhs_data + rid * D; + DType* gradlhsoff = gdata->grad_lhs_data + lid * D * len; + DType* gradrhsoff = gdata->grad_rhs_data + rid * D * len; DType* gradoutoff = gdata->grad_out_data + oid * D; for (int64_t tx = 0; tx < D; ++tx) { - DType lhs = Functors::Read(lhsoff + tx); - DType rhs = Functors::Read(rhsoff + tx); DType out = Functors::Read(outoff + tx); DType grad_out = Functors::Read(gradoutoff + tx); - DType e = Functors::Op(lhs, rhs); + DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + for (int64_t i = 0; i < len; ++i) { #pragma omp atomic - gradlhsoff[tx] += grad_lhs; + gradlhsoff[tx * len + i] += grad_lhs; + } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); + for (int64_t i = 0; i < len; ++i) { #pragma omp atomic - gradrhsoff[tx] += grad_rhs; + gradrhsoff[tx * len + i] += grad_rhs; + } } } } @@ -76,6 +79,7 @@ struct BackwardBinaryReduceBcast { } static inline void ApplyEdge( Idx src, Idx dst, Idx eid, BackwardBcastGData* gdata) { + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -88,32 +92,35 @@ struct BackwardBinaryReduceBcast { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len; - DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len; + DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; + DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; DType* outoff = gdata->out_data + oid * gdata->out_len; - DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len; - DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len; + DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len; + DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len; DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len; int64_t tmp[NDim]; // store unraveled idx. 
for (int64_t tx = 0; tx < gdata->out_len; ++tx) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); - DType lhs = Functors::Read(lhsoff + - Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride)); - DType rhs = Functors::Read(rhsoff + - Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride)); DType out = Functors::Read(outoff + tx); DType grad_out = Functors::Read(gradoutoff + tx); - DType e = Functors::Op(lhs, rhs); + DType e = Functors::Op( + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + for (int64_t i = 0; i < len; ++i) { #pragma omp atomic - gradlhsoff[tx] += grad_lhs; + gradlhsoff[tx * len + i] += grad_lhs; + } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); + for (int64_t i = 0; i < len; ++i) { #pragma omp atomic - gradrhsoff[tx] += grad_rhs; + gradrhsoff[tx * len + i] += grad_rhs; + } } } } @@ -137,8 +144,8 @@ struct BackwardFunctorsTempl { Idx src, Idx edge, Idx dst) { return RightSelector::Call(src, edge, dst); } - static inline DType Op(DType lhs, DType rhs) { - return BinaryOp::Call(lhs, rhs); + static inline DType Op(DType* lhs, DType* rhs, int64_t len) { + return BinaryOp::Call(lhs, rhs, len); } static inline DType Read(DType* addr) { return *addr; diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index fe259cf1dfdb..f68d9dea3ec1 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -29,7 +29,7 @@ struct BinaryReduce { static inline void ApplyEdge( Idx src, Idx dst, Idx eid, GData* gdata) { const int64_t D = gdata->x_length; - const int64_t len = gdata->data_len + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index bbe3a9c44ce5..e035b24e1061 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -29,6 +29,7 @@ struct BackwardBinaryReduce { const int64_t D = gdata->x_length; int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; int stride_x = blockDim.x * gridDim.x; + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -41,26 +42,28 @@ struct BackwardBinaryReduce { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * D; - DType* rhsoff = gdata->rhs_data + rid * D; + DType* lhsoff = gdata->lhs_data + lid * D * len; + DType* rhsoff = gdata->rhs_data + rid * D * len; DType* outoff = gdata->out_data + oid * D; - DType* gradlhsoff = gdata->grad_lhs_data + lid * D; - DType* gradrhsoff = gdata->grad_rhs_data + rid * D; + DType* gradlhsoff = gdata->grad_lhs_data + lid * D * len; + DType* gradrhsoff = gdata->grad_rhs_data + rid * D * len; DType* gradoutoff = gdata->grad_out_data + oid * D; while (tx < D) { - DType lhs = Functors::Read(lhsoff + tx); - DType rhs = Functors::Read(rhsoff + tx); DType out = 
Functors::Read(outoff + tx); DType grad_out = Functors::Read(gradoutoff + tx); - DType e = Functors::Op(lhs, rhs); + DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); - AtomicAdd(gradlhsoff + tx, grad_lhs); + for (int64_t i = 0; i < len; ++i) { + AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); + } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); - AtomicAdd(gradrhsoff + tx, grad_rhs); + for (int64_t i = 0; i < len; ++i) { + AtomicAdd(gradrhsoff + tx * len + i, grad_rhs); + } } tx += stride_x; } @@ -78,6 +81,7 @@ struct BackwardBinaryReduceBcast { Idx src, Idx dst, Idx eid, BackwardBcastGData* gdata) { int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; int stride_x = blockDim.x * gridDim.x; + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -90,30 +94,33 @@ struct BackwardBinaryReduceBcast { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len; - DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len; + DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; + DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; DType* outoff = gdata->out_data + oid * gdata->out_len; - DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len; - DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len; + DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len; + DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len; DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len; int64_t tmp[NDim]; // store unraveled idx. 
while (tx < gdata->out_len) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); - DType lhs = Functors::Read(lhsoff + - Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride)); - DType rhs = Functors::Read(rhsoff + - Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride)); DType out = Functors::Read(outoff + tx); DType grad_out = Functors::Read(gradoutoff + tx); - DType e = Functors::Op(lhs, rhs); + DType e = Functors::Op( + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); - AtomicAdd(gradlhsoff + tx, grad_lhs); + for (int64_t i = 0; i < len; ++i) { + AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); + } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); - AtomicAdd(gradrhsoff + tx, grad_rhs); + for (int64_t i = 0; i < len; ++i) { + AtomicAdd(gradrhsoff + tx * len + i, grad_rhs); + } } tx += stride_x; } @@ -138,8 +145,8 @@ struct BackwardFunctorsTempl { Idx src, Idx edge, Idx dst) { return RightSelector::Call(src, edge, dst); } - static __device__ __forceinline__ DType Op(DType lhs, DType rhs) { - return BinaryOp::Call(lhs, rhs); + static __device__ __forceinline__ DType Op(DType* lhs, DType* rhs, int64_t len) { + return BinaryOp::Call(lhs, rhs, len); } static __device__ __forceinline__ DType Read(DType* addr) { return LDGReader::Call(addr); diff --git a/src/kernel/cuda/binary_reduce_impl.cuh b/src/kernel/cuda/binary_reduce_impl.cuh index 37cb4eca7968..66a87caf1f77 100644 --- a/src/kernel/cuda/binary_reduce_impl.cuh +++ b/src/kernel/cuda/binary_reduce_impl.cuh @@ -29,7 +29,7 @@ struct BinaryReduce { const int64_t D = gdata->x_length; int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; const int64_t stride_x = blockDim.x * gridDim.x; - const int64_t len = gdata->data_len + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); From faa3b2d9b2bcfb372dfa1847fa881e83f595ac42 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 4 Sep 2019 15:06:23 +0000 Subject: [PATCH 22/32] Compile OK. TODO: 1. make sure x_add_x, x_sub_x, x_mul_x, x_div_x work 2. let x_dot_x work 3. make sure backward of x_add_x, x_sub_x, x_mul_x, x_div_x work 4. 
let x_dot_x backward work --- src/kernel/binary_reduce.cc | 2 +- src/kernel/binary_reduce_common.h | 2 +- src/kernel/binary_reduce_impl.h | 18 ++++++++++--- src/kernel/binary_reduce_impl_decl.h | 4 +++ src/kernel/cpu/backward_binary_reduce_impl.h | 24 ++++++++++++++--- src/kernel/cpu/binary_masked_dot.cc | 26 ------------------- src/kernel/cpu/binary_reduce_impl.h | 2 +- .../cuda/backward_binary_reduce_impl.cuh | 24 ++++++++++++++--- src/kernel/cuda/binary_masked_dot.cu | 20 -------------- src/kernel/cuda/binary_reduce_impl.cuh | 2 +- 10 files changed, 62 insertions(+), 62 deletions(-) delete mode 100644 src/kernel/cpu/binary_masked_dot.cc delete mode 100644 src/kernel/cuda/binary_masked_dot.cu diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index 9cfef5a9c834..3435e0659f26 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -94,7 +94,7 @@ bool HasBcast(NDArray lhs, NDArray rhs) { // See also: BcastInfo (kernel/binary_reduce.h) BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { BcastInfo ret; - const int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1; + int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1; // for dot operation: vector [dot] vector // lhs_shape[ndim-1] == rhs_shape[ndim-1] = sizeof(vector) // out_shape[ndim-1] = 1 diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index ba94a869f434..0c0193116be1 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -196,7 +196,7 @@ struct BinaryUseLhs { template struct BinaryDot { static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { - Dtype out = 0; + DType out = 0; //simple vector dot vector for (int i = 0; i < len; i ++) out += lhs[i] * rhs[i]; diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index 5f64f2dc6ea9..d14215a0f140 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -51,7 +51,7 @@ GData AllocGData(const std::string& op, } // for dot operation: vector [dot] vector - if (op == binary::kDot) { + if (op == binary_op::kDot) { //get size of vector gdata.data_len = lhs_data->shape[lhs_data->ndim - 1]; } else { @@ -119,7 +119,7 @@ void BinaryReduceImpl( template BackwardGData AllocBackwardGData( - const DLContext& ctx, int64_t x_len, + const std::string& op, const DLContext& ctx, int64_t x_len, runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping, runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data, runtime::NDArray grad_out_data, @@ -153,6 +153,14 @@ BackwardGData AllocBackwardGData( if (!utils::IsNoneArray(out_mapping)) { gdata.out_mapping = static_cast(out_mapping->data); } + + // for dot operation: vector [dot] vector + if (op == binary_op::kDot) { + //get size of vector + gdata.data_len = lhs_data->shape[lhs_data->ndim - 1]; + } else { + gdata.data_len = 1; + } return gdata; } @@ -198,7 +206,7 @@ void BackwardBinaryReduceImpl( } DGL_DTYPE_SWITCH(dtype, DType, { DGL_IDX_TYPE_SWITCH(bits, Idx, { - auto gdata = AllocBackwardGData( + auto gdata = AllocBackwardGData(op, rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, out_mapping, lhs_data, rhs_data, out_data, grad_out_data, grad_lhs_data, grad_rhs_data); @@ -250,8 +258,8 @@ BcastGData AllocBcastGData( if (!utils::IsNoneArray(out_mapping)) { gdata.out_mapping = static_cast(out_mapping->data); } - gdata.data_len = info.data_len; + // fill out data with zero values utils::Fill(ctx, gdata.out_data, 
utils::NElements(out_data), Zero::value); return gdata; @@ -347,6 +355,8 @@ BackwardBcastGData AllocBackwardBcastGData( if (!utils::IsNoneArray(out_mapping)) { gdata.out_mapping = static_cast(out_mapping->data); } + gdata.data_len = info.data_len; + // data gdata.lhs_data = static_cast(lhs->data); gdata.rhs_data = static_cast(rhs->data); diff --git a/src/kernel/binary_reduce_impl_decl.h b/src/kernel/binary_reduce_impl_decl.h index 5c799372d7e4..af95b76cbdf5 100644 --- a/src/kernel/binary_reduce_impl_decl.h +++ b/src/kernel/binary_reduce_impl_decl.h @@ -121,6 +121,8 @@ template struct BackwardGData { // length along x(feature) dimension int64_t x_length{0}; + // size of data, can be single value or a vector + int64_t data_len; // number of rows of the output tensor int64_t out_size{0}; // input data @@ -330,6 +332,8 @@ struct BackwardBcastGData { int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0}; int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0}; int64_t out_shape[NDim]{0}, out_stride[NDim]{0}; + // size of data, can be single value or a vector + int64_t data_len; // input id mappings Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr}, *out_mapping{nullptr}; // input data diff --git a/src/kernel/cpu/backward_binary_reduce_impl.h b/src/kernel/cpu/backward_binary_reduce_impl.h index b72ba5b6bdaa..1b4976a4a982 100644 --- a/src/kernel/cpu/backward_binary_reduce_impl.h +++ b/src/kernel/cpu/backward_binary_reduce_impl.h @@ -51,16 +51,23 @@ struct BackwardBinaryReduce { DType grad_out = Functors::Read(gradoutoff + tx); DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); + + DType* lhs_base = lhsoff + tx * len; + DType* rhs_base = rhsoff + tx * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { - DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); #pragma omp atomic gradlhsoff[tx * len + i] += grad_lhs; } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { - DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); #pragma omp atomic gradrhsoff[tx * len + i] += grad_rhs; } @@ -108,16 +115,25 @@ struct BackwardBinaryReduceBcast { rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); + + DType* lhs_base = lhsoff + + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; + DType* rhs_base = rhsoff + + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { - DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); #pragma omp atomic gradlhsoff[tx * len + i] += grad_lhs; } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { - DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType 
grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); #pragma omp atomic gradrhsoff[tx * len + i] += grad_rhs; } diff --git a/src/kernel/cpu/binary_masked_dot.cc b/src/kernel/cpu/binary_masked_dot.cc deleted file mode 100644 index fcbd5284211c..000000000000 --- a/src/kernel/cpu/binary_masked_dot.cc +++ /dev/null @@ -1,26 +0,0 @@ -/*! - * Copyright (c) 2019 by Contributors - * \file kernel/cpu/binary_masked_dot.cc - * \brief CPU kernels for binary reduce prod - */ -#include "./binary_reduce_impl.h" -#include "./backward_binary_reduce_impl.h" - -namespace dgl { -namespace kernel { - -#define REDUCER ReduceNone -#define XPU kDLCPU - -#define IDX int32_t -EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE); -EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); -#undef IDX - -#define IDX int64_t -EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE); -EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); -#undef IDX - -} // namespace kernel -} // namespace dgl diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index f68d9dea3ec1..cff9e22c418f 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -125,7 +125,7 @@ struct FunctorsTempl { Idx src, Idx edge, Idx dst) { return RightSelector::Call(src, edge, dst); } - static inline Dtype Op(DType *lhs, Dtype *rhs, int64_t len) { + static inline DType Op(DType *lhs, DType *rhs, int64_t len) { return BinaryOp::Call(lhs, rhs, len); } static inline void Write(DType* addr, DType val) { diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index e035b24e1061..871e33dbb3bb 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -53,15 +53,22 @@ struct BackwardBinaryReduce { DType grad_out = Functors::Read(gradoutoff + tx); DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); + + DType* lhs_base = lhsoff + tx * len; + DType* rhs_base = rhsoff + tx * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { - DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { - DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); AtomicAdd(gradrhsoff + tx * len + i, grad_rhs); } } @@ -110,15 +117,24 @@ struct BackwardBinaryReduceBcast { rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); + + DType* lhs_base = lhsoff + + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; + DType* rhs_base = rhsoff + + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { - DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * 
Functors::BackwardOpLhs(lhs, rhs, e); AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { - DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); AtomicAdd(gradrhsoff + tx * len + i, grad_rhs); } } diff --git a/src/kernel/cuda/binary_masked_dot.cu b/src/kernel/cuda/binary_masked_dot.cu deleted file mode 100644 index b3bebfb85fdd..000000000000 --- a/src/kernel/cuda/binary_masked_dot.cu +++ /dev/null @@ -1,20 +0,0 @@ -/*! - * Copyright (c) 2019 by Contributors - * \file kernel/cuda/binary_masked_dot.cu - * \brief CUDA kernels for binary reduce prod - */ -#include "./binary_reduce_impl.cuh" -#include "./backward_binary_reduce_impl.cuh" - -namespace dgl { -namespace kernel { - -#define REDUCER ReduceNone -#define XPU kDLGPU -#define IDX int32_t - -EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE) -EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE) - -} // namespace kernel -} // namespace dgl diff --git a/src/kernel/cuda/binary_reduce_impl.cuh b/src/kernel/cuda/binary_reduce_impl.cuh index 66a87caf1f77..cee51710b960 100644 --- a/src/kernel/cuda/binary_reduce_impl.cuh +++ b/src/kernel/cuda/binary_reduce_impl.cuh @@ -81,7 +81,7 @@ struct BinaryReduceBcast { static __device__ __forceinline__ void ApplyEdge( Idx src, Idx dst, Idx eid, BcastGData* gdata) { int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; - constint64_t stride_x = blockDim.x * gridDim.x; + const int64_t stride_x = blockDim.x * gridDim.x; const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); From f9a0676a15978429e545fd41baafc0955003e4f1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 03:08:46 +0000 Subject: [PATCH 23/32] Fix code style --- src/kernel/binary_reduce.cc | 4 ++-- src/kernel/binary_reduce_common.h | 2 +- src/kernel/binary_reduce_impl.h | 4 ++-- src/kernel/cpu/backward_binary_reduce_impl.h | 2 +- src/kernel/cpu/binary_reduce_impl.h | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index 3435e0659f26..37800be2fe3e 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -99,10 +99,10 @@ BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { // lhs_shape[ndim-1] == rhs_shape[ndim-1] = sizeof(vector) // out_shape[ndim-1] = 1 if (op == binary_op::kDot) { - //get size of vector + // get size of vector ret.data_len = lhs->shape[lhs->ndim - 1]; --max_ndim; - } else {//op != binary_op::kDot + } else { // op != binary_op::kDot ret.data_len = 1; } diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index 0c0193116be1..b01fdcbb8069 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -197,7 +197,7 @@ template struct BinaryDot { static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { DType out = 0; - //simple vector dot vector + // simple vector dot vector for (int i = 0; i < len; i ++) out += lhs[i] * rhs[i]; diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index d14215a0f140..5a11ea6c8c2b 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -52,7 +52,7 @@ GData AllocGData(const std::string& op, // 
for dot operation: vector [dot] vector if (op == binary_op::kDot) { - //get size of vector + // get size of vector gdata.data_len = lhs_data->shape[lhs_data->ndim - 1]; } else { gdata.data_len = 1; @@ -156,7 +156,7 @@ BackwardGData AllocBackwardGData( // for dot operation: vector [dot] vector if (op == binary_op::kDot) { - //get size of vector + // get size of vector gdata.data_len = lhs_data->shape[lhs_data->ndim - 1]; } else { gdata.data_len = 1; diff --git a/src/kernel/cpu/backward_binary_reduce_impl.h b/src/kernel/cpu/backward_binary_reduce_impl.h index 1b4976a4a982..0f209dc24d6a 100644 --- a/src/kernel/cpu/backward_binary_reduce_impl.h +++ b/src/kernel/cpu/backward_binary_reduce_impl.h @@ -116,7 +116,7 @@ struct BackwardBinaryReduceBcast { len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); - DType* lhs_base = lhsoff + + DType* lhs_base = lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index cff9e22c418f..7f73765db7c8 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -92,15 +92,15 @@ struct BinaryReduceBcast { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; //data with len size + DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; // data with len size DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; DType* outoff = gdata->out_data + oid * gdata->out_len; int64_t tmp[NDim]; // store unraveled idx. for (int64_t tx = 0; tx < gdata->out_len; ++tx) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); DType out = Functors::Op( - lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, - rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, len); Functors::Write(outoff + tx, out); From 9c00adc5fefaeb16ffe57f003cf970206252bc11 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 03:54:45 +0000 Subject: [PATCH 24/32] Now we can pass the tests/compute/test_kernel.py for add/sub/mul/div forward and backward --- python/dgl/backend/mxnet/tensor.py | 9 ++++++--- python/dgl/backend/pytorch/tensor.py | 9 ++++++--- python/dgl/function/message.py | 5 ++--- python/dgl/kernel.py | 1 + src/kernel/binary_reduce.cc | 4 +++- 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/python/dgl/backend/mxnet/tensor.py b/python/dgl/backend/mxnet/tensor.py index 39b8a3f1ae0d..4760b02acaba 100644 --- a/python/dgl/backend/mxnet/tensor.py +++ b/python/dgl/backend/mxnet/tensor.py @@ -374,7 +374,10 @@ def forward(self, lhs_data, rhs_data): lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) feat_shape = K.infer_binary_feature_shape(self.binary_op, lhs_data_nd, rhs_data_nd) - out_data = nd.empty((self.out_size,) + feat_shape[:-1], + out_shape = feat_shape + if binary_op == 'dot': + out_shape = feat_shape[:-1] + out_data = nd.empty((self.out_size,) + out_shape, ctx=lhs_data.context, dtype=lhs_data.dtype) out_data_nd = zerocopy_to_dgl_ndarray_for_write(out_data) K.binary_op_reduce( @@ -399,10 +402,10 @@ def forward(self, lhs_data, rhs_data): in_ones = nd.ones((n,), 
ctx=lhs_data.context, dtype=lhs_data.dtype) in_ones_nd = zerocopy_to_dgl_ndarray(in_ones) K.copy_reduce( - 'sum', self.graph, target, in_ones_nd, degs_nd, + 'sum', self.graph, target, in_ones_nd, degs_nd, in_map, self.out_map[0]) # reshape - degs = degs.reshape((out_data.shape[0],) + (1,) * (out_data.ndim - 1)).clip(1, float('inf')) + degs = degs.reshape((out_data.shape[0],) + (1,) * (out_data.ndim - 1)).clip(1, float('inf')) out_data = out_data / degs else: degs = None diff --git a/python/dgl/backend/pytorch/tensor.py b/python/dgl/backend/pytorch/tensor.py index a237d694153a..837155fe3869 100644 --- a/python/dgl/backend/pytorch/tensor.py +++ b/python/dgl/backend/pytorch/tensor.py @@ -286,10 +286,13 @@ def forward(ctx, reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) feat_shape = K.infer_binary_feature_shape(binary_op, lhs_data_nd, rhs_data_nd) - out_data = lhs_data.new_empty((out_size,) + feat_shape[:-1]) + out_shape = feat_shape + if binary_op == 'dot': + out_shape = feat_shape[:-1] + out_data = lhs_data.new_empty((out_size,) + out_shape) out_data_nd = zerocopy_to_dgl_ndarray(out_data) K.binary_op_reduce( - reducer if reducer != 'mean' else 'sum', + reducer if reducer != 'mean' else 'sum', binary_op, graph, lhs, rhs, lhs_data_nd, rhs_data_nd, out_data_nd, lhs_map[0], rhs_map[0], out_map[0]) # normalize if mean reducer @@ -308,7 +311,7 @@ def forward(ctx, reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, in_ones = lhs_data.new_ones((n,)) in_ones_nd = zerocopy_to_dgl_ndarray(in_ones) K.copy_reduce( - 'sum', graph, target, in_ones_nd, degs_nd, in_map, out_map[0]) + 'sum', graph, target, in_ones_nd, degs_nd, in_map, out_map[0]) # reshape degs = degs.reshape((out_data.shape[0],) + (1,) * (out_data.dim() - 1)).clamp(min=1) out_data = out_data / degs diff --git a/python/dgl/function/message.py b/python/dgl/function/message.py index 91c98485546b..a1785ca0411d 100644 --- a/python/dgl/function/message.py +++ b/python/dgl/function/message.py @@ -154,8 +154,8 @@ def copy_e(e, out): # v_add_e, v_sub_e, v_mul_e, v_div_e # e_add_u, e_sub_u, e_mul_u, e_div_u # e_add_v, e_sub_v, e_mul_v, e_div_v -# -# masked-mm message functions: +# +# dot message functions: # u_dot_v, u_dot_e, v_dot_e _TARGET_MAP = { @@ -209,7 +209,6 @@ def _register_builtin_message_func(): setattr(sys.modules[__name__], func.__name__, func) __all__.append(func.__name__) - """Register builtin masked-mm functions""" for lhs, rhs in product(["u", "v"], ["v", "e"]): if lhs != rhs: for binary_op in ["dot"]: diff --git a/python/dgl/kernel.py b/python/dgl/kernel.py index 7abe2d38bf4e..4de76490e8d5 100644 --- a/python/dgl/kernel.py +++ b/python/dgl/kernel.py @@ -4,6 +4,7 @@ from ._ffi.function import _init_api from .ndarray import empty +# pylint: disable=invalid-name def infer_binary_feature_shape(op, lhs, rhs): """Infer the output feature shape after a binary operation between lhs and rhs. 
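A rough usage sketch of the dot builtins wired up above (illustrative only, not part of the patch; the toy graph, feature names and sizes are invented, and a PyTorch backend is assumed):

    import torch
    import dgl
    import dgl.function as fn

    g = dgl.DGLGraph()
    g.add_nodes(3)
    g.add_edges([0, 0, 0, 1, 1, 2], [0, 1, 2, 1, 2, 2])

    # one length-4 feature vector per node
    g.ndata['h'] = torch.randn(3, 4)

    # u_dot_v emits, for every edge (u, v), the dot product <h_u, h_v>;
    # the sum reducer then aggregates the per-edge scores into each
    # destination node, exercising the binary_op_reduce path above.
    g.update_all(fn.u_dot_v('h', 'h', 'm'), fn.sum('m', 'score'))
    print(g.ndata['score'].shape)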
diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index fd04e1e84834..064f08fdad60 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -141,7 +141,9 @@ BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { accum = 0; } std::reverse(ret.real_out_shape.begin(), ret.real_out_shape.end()); - ret.real_out_shape.push_back(ret.data_len); + if (op == binary_op::kDot) { + ret.real_out_shape.push_back(ret.data_len); + } std::reverse(ret.lhs_shape.begin(), ret.lhs_shape.end()); std::reverse(ret.rhs_shape.begin(), ret.rhs_shape.end()); std::reverse(ret.out_shape.begin(), ret.out_shape.end()); From 037b1425bde51acfb45c78e8c2a3d7714237898e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 04:47:01 +0000 Subject: [PATCH 25/32] Fix mxnet test code --- python/dgl/backend/mxnet/tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dgl/backend/mxnet/tensor.py b/python/dgl/backend/mxnet/tensor.py index 4760b02acaba..6db926b3b0bc 100644 --- a/python/dgl/backend/mxnet/tensor.py +++ b/python/dgl/backend/mxnet/tensor.py @@ -375,7 +375,7 @@ def forward(self, lhs_data, rhs_data): rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) feat_shape = K.infer_binary_feature_shape(self.binary_op, lhs_data_nd, rhs_data_nd) out_shape = feat_shape - if binary_op == 'dot': + if self.binary_op == 'dot': out_shape = feat_shape[:-1] out_data = nd.empty((self.out_size,) + out_shape, ctx=lhs_data.context, dtype=lhs_data.dtype) From c59de935e67dc490d3c9b68a1511e645485be6a0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 07:25:33 +0000 Subject: [PATCH 26/32] Add u_dot_v, u_dot_e, v_dot_e unitest. --- src/kernel/binary_reduce.cc | 14 ++-- .../cuda/backward_binary_reduce_impl.cuh | 4 +- src/kernel/cuda/binary_reduce_impl.cuh | 6 +- tests/backend/backend_unittest.py | 4 ++ tests/backend/mxnet/__init__.py | 3 + tests/backend/pytorch/__init__.py | 3 + tests/compute/test_kernel.py | 70 +++++++++++++------ 7 files changed, 72 insertions(+), 32 deletions(-) diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index 064f08fdad60..9a87093eec3e 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -95,20 +95,23 @@ bool HasBcast(NDArray lhs, NDArray rhs) { // See also: BcastInfo (kernel/binary_reduce.h) BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { BcastInfo ret; - int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1; + const int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1; + int64_t accum = 0; + int j = 0; // for dot operation: vector [dot] vector // lhs_shape[ndim-1] == rhs_shape[ndim-1] = sizeof(vector) // out_shape[ndim-1] = 1 if (op == binary_op::kDot) { // get size of vector ret.data_len = lhs->shape[lhs->ndim - 1]; - --max_ndim; + // skip vector size dim + ++j; + ret.real_out_shape.push_back(ret.data_len); } else { // op != binary_op::kDot ret.data_len = 1; } - int64_t accum = 0; - for (int j = 0; j < max_ndim; ++j) { + for (; j < max_ndim; ++j) { const int dl = (lhs->ndim - 1 - j < 1)? 1 : lhs->shape[lhs->ndim - 1 - j]; const int dr = (rhs->ndim - 1 - j < 1)? 
1 : rhs->shape[rhs->ndim - 1 - j]; if (dl != dr) { @@ -141,9 +144,6 @@ BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { accum = 0; } std::reverse(ret.real_out_shape.begin(), ret.real_out_shape.end()); - if (op == binary_op::kDot) { - ret.real_out_shape.push_back(ret.data_len); - } std::reverse(ret.lhs_shape.begin(), ret.lhs_shape.end()); std::reverse(ret.rhs_shape.begin(), ret.rhs_shape.end()); std::reverse(ret.out_shape.begin(), ret.out_shape.end()); diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index 871e33dbb3bb..644171826503 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -59,7 +59,7 @@ struct BackwardBinaryReduce { if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); - DType rhs = Functors::Read(rhs_base + i); + DType rhs = Functors::Read(rhs_base + i); DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); } @@ -118,7 +118,7 @@ struct BackwardBinaryReduceBcast { len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); - DType* lhs_base = lhsoff + + DType* lhs_base = lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; diff --git a/src/kernel/cuda/binary_reduce_impl.cuh b/src/kernel/cuda/binary_reduce_impl.cuh index cee51710b960..3197ed3dd255 100644 --- a/src/kernel/cuda/binary_reduce_impl.cuh +++ b/src/kernel/cuda/binary_reduce_impl.cuh @@ -97,13 +97,13 @@ struct BinaryReduceBcast { } DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; //data with len size DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; - DType* outoff = gdata->out_data + oid * gdata->out_len * len; + DType* outoff = gdata->out_data + oid * gdata->out_len; int64_t tmp[NDim]; // store unraveled idx. 
while (tx < gdata->out_len) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); DType out = Functors::Op( - lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, - rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, len); Functors::Write(outoff + tx, out); diff --git a/tests/backend/backend_unittest.py b/tests/backend/backend_unittest.py index d80854bdcdf2..0caecb960548 100644 --- a/tests/backend/backend_unittest.py +++ b/tests/backend/backend_unittest.py @@ -115,6 +115,10 @@ def matmul(a, b): """Compute Matrix Multiplication between a and b""" pass +def dot(a, b): + """Compute Dot between a and b""" + pass + ############################################################################### # Tensor functions used *only* on index tensor # ---------------- diff --git a/tests/backend/mxnet/__init__.py b/tests/backend/mxnet/__init__.py index 4b2f6a068d9a..e36add88ddea 100644 --- a/tests/backend/mxnet/__init__.py +++ b/tests/backend/mxnet/__init__.py @@ -86,6 +86,9 @@ def prod(x, dim): def matmul(a, b): return nd.dot(a, b) +def dot(a, b): + return nd.sum(mul(a, b), axis=-1) + record_grad = autograd.record diff --git a/tests/backend/pytorch/__init__.py b/tests/backend/pytorch/__init__.py index 01cc2a2e3c3e..099e9efba57d 100644 --- a/tests/backend/pytorch/__init__.py +++ b/tests/backend/pytorch/__init__.py @@ -82,6 +82,9 @@ def prod(x, dim): def matmul(a, b): return a @ b +def dot(a, b): + return sum(mul(a, b), dim=-1) + class record_grad(object): def __init__(self): pass diff --git a/tests/compute/test_kernel.py b/tests/compute/test_kernel.py index 88d80502fad0..0d663b75cc52 100644 --- a/tests/compute/test_kernel.py +++ b/tests/compute/test_kernel.py @@ -26,33 +26,52 @@ def udf_max(nodes): D1 = 5 D2 = 3 D3 = 4 +D4 = 10 # NOTE(xiang): used to dot feature vector builtin = {'sum': fn.sum, 'max': fn.max, 'mean': fn.mean} udf_reduce = {'sum': udf_sum, 'max': udf_max, 'mean': udf_mean} fill_value = {'sum': 0, 'max': float("-inf")} -def generate_feature(g, broadcast='none'): +def generate_feature(g, broadcast='none', binary_op='none'): """Create graph with src, edge, dst feature. 
broadcast can be 'u', 'e', 'v', 'none' """ nv = g.number_of_nodes() ne = g.number_of_edges() - if broadcast == 'e': - u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) - e = F.tensor(np.random.uniform(-1, 1, (ne, D2, 1))) - v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) - elif broadcast == 'u': - u = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1))) - e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) - v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) - elif broadcast == 'v': - u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) - e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) - v = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1))) + if binary_op == 'dot': + if broadcast == 'e': + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D2, 1, D4))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) + elif broadcast == 'u': + u = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1, D4))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3, D4))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) + elif broadcast == 'v': + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3, D4))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1, D4))) + else: + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3, D4))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) else: - u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) - e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) - v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + if broadcast == 'e': + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D2, 1))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + elif broadcast == 'u': + u = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + elif broadcast == 'v': + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1))) + else: + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) return u, v, e @@ -62,7 +81,7 @@ def _test(red, partial): # NOTE(zihao): add self-loop to avoid zero-degree nodes. # https://github.com/dmlc/dgl/issues/761 g.add_edges(g.nodes(), g.nodes()) - hu, hv, he = generate_feature(g, 'none') + hu, hv, he = generate_feature(g, 'none', 'none') if partial: nid = F.tensor(list(range(0, 100, 2))) @@ -123,7 +142,7 @@ def _test(red, partial): g = dgl.DGLGraph(nx.erdos_renyi_graph(100, 0.1)) # NOTE(zihao): add self-loop to avoid zero-degree nodes. 
g.add_edges(g.nodes(), g.nodes()) - hu, hv, he = generate_feature(g, 'none') + hu, hv, he = generate_feature(g, 'none', 'none') if partial: nid = F.tensor(list(range(0, 100, 2))) @@ -182,7 +201,7 @@ def _print_error(a, b): def test_all_binary_builtins(): def _test(g, lhs, rhs, binary_op, reducer, partial, nid, broadcast='none'): # initialize node/edge features with uniform(-1, 1) - hu, hv, he = generate_feature(g, broadcast) + hu, hv, he = generate_feature(g, broadcast, binary_op) if binary_op == 'div': # op = div # lhs range: [-1, 1] @@ -327,8 +346,19 @@ def _print_error(a, b): for partial in [False, True]: _test(g, lhs, rhs, binary_op, reducer, partial, nid, broadcast=broadcast) - + + for lhs, rhs in product(["u", "v"], ["v", "e"]): + if lhs == rhs: + continue + binary_op = "dot" + for reducer in ["sum", "max", "min", "mean"]: + for broadcast in ["none", lhs, rhs]: + for partial in [False, True]: + _test(g, lhs, rhs, binary_op, reducer, partial, nid, + broadcast=broadcast) + if __name__ == '__main__': #test_copy_src_reduce() #test_copy_edge_reduce() test_all_binary_builtins() + From bed5bcd153265f3da5de4a6bdab7988450047284 Mon Sep 17 00:00:00 2001 From: Song Date: Thu, 5 Sep 2019 15:51:25 +0800 Subject: [PATCH 27/32] Update doc --- docs/source/api/python/function.rst | 3 +++ docs/source/features/builtin.rst | 2 ++ 2 files changed, 5 insertions(+) diff --git a/docs/source/api/python/function.rst b/docs/source/api/python/function.rst index 869eb50db95e..756f471d7a6c 100644 --- a/docs/source/api/python/function.rst +++ b/docs/source/api/python/function.rst @@ -40,6 +40,9 @@ Message functions e_sub_v e_mul_v e_div_v + u_dot_v + u_dot_e + v_dot_e Reduce functions ---------------- diff --git a/docs/source/features/builtin.rst b/docs/source/features/builtin.rst index a6a36145a664..ce521de01f29 100644 --- a/docs/source/features/builtin.rst +++ b/docs/source/features/builtin.rst @@ -98,6 +98,8 @@ Here is a cheatsheet of all the DGL builtins. 
| +----------------------------------------------------+-----------------------+ | | ``e_add_v``, ``e_sub_v``, ``e_mul_v``, ``e_div_v`` | | | +----------------------------------------------------+-----------------------+ +| | ``u_dot_v``, ``u_dot_e``, ``v_dot_e`` | | +| +----------------------------------------------------+-----------------------+ | | ``src_mul_edge`` | alias of ``u_mul_e`` | +-------------------------+----------------------------------------------------+-----------------------+ | Reduce function | ``max`` | | From 056cbf520bdb71be3200b6da5344c2251828bfe1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 14:13:18 +0000 Subject: [PATCH 28/32] Now also support v_dot_u, e_dot_u, e_dot_v --- docs/source/api/python/function.rst | 3 +++ docs/source/features/builtin.rst | 2 ++ python/dgl/function/message.py | 10 ++-------- src/kernel/binary_reduce_common.h | 20 +++++++++++++++++++- tests/compute/test_kernel.py | 12 +----------- 5 files changed, 27 insertions(+), 20 deletions(-) diff --git a/docs/source/api/python/function.rst b/docs/source/api/python/function.rst index 756f471d7a6c..1187496616dc 100644 --- a/docs/source/api/python/function.rst +++ b/docs/source/api/python/function.rst @@ -43,6 +43,9 @@ Message functions u_dot_v u_dot_e v_dot_e + v_dot_u + e_dot_u + e_dot_v Reduce functions ---------------- diff --git a/docs/source/features/builtin.rst b/docs/source/features/builtin.rst index ce521de01f29..558d56a122ce 100644 --- a/docs/source/features/builtin.rst +++ b/docs/source/features/builtin.rst @@ -100,6 +100,8 @@ Here is a cheatsheet of all the DGL builtins. | +----------------------------------------------------+-----------------------+ | | ``u_dot_v``, ``u_dot_e``, ``v_dot_e`` | | | +----------------------------------------------------+-----------------------+ +| | ``v_dot_u``, ``e_dot_u``, ``e_dot_v`` | | +| +----------------------------------------------------+-----------------------+ | | ``src_mul_edge`` | alias of ``u_mul_e`` | +-------------------------+----------------------------------------------------+-----------------------+ | Reduce function | ``max`` | | diff --git a/python/dgl/function/message.py b/python/dgl/function/message.py index a1785ca0411d..206d2d643295 100644 --- a/python/dgl/function/message.py +++ b/python/dgl/function/message.py @@ -157,6 +157,7 @@ def copy_e(e, out): # # dot message functions: # u_dot_v, u_dot_e, v_dot_e +# v_dot_u, e_dot_u, e_dot_v _TARGET_MAP = { "u": TargetCode.SRC, @@ -204,14 +205,7 @@ def _register_builtin_message_func(): target = ["u", "v", "e"] for lhs, rhs in product(target, target): if lhs != rhs: - for binary_op in ["add", "sub", "mul", "div"]: - func = _gen_message_builtin(lhs, rhs, binary_op) - setattr(sys.modules[__name__], func.__name__, func) - __all__.append(func.__name__) - - for lhs, rhs in product(["u", "v"], ["v", "e"]): - if lhs != rhs: - for binary_op in ["dot"]: + for binary_op in ["add", "sub", "mul", "div", "dot"]: func = _gen_message_builtin(lhs, rhs, binary_op) setattr(sys.modules[__name__], func.__name__, func) __all__.append(func.__name__) diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index b01fdcbb8069..a79755ae0883 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -340,6 +340,21 @@ struct BinaryDot { typedef SelectDst LeftType; \ typedef SelectEdge RightType; \ {__VA_ARGS__} \ + } else if (op == kDot && lhs == kDst && rhs == kSrc) { \ + typedef BinaryDot OpType; \ + typedef SelectDst LeftType; \ + typedef 
SelectSrc RightType; \ + {__VA_ARGS__} \ + } else if (op == kDot && lhs == kEdge && rhs == kSrc) { \ + typedef BinaryDot OpType; \ + typedef SelectEdge LeftType; \ + typedef SelectSrc RightType; \ + {__VA_ARGS__} \ + } else if (op == kDot && lhs == kEdge && rhs == kDst) { \ + typedef BinaryDot OpType; \ + typedef SelectEdge LeftType; \ + typedef SelectDst RightType; \ + {__VA_ARGS__} \ } else { \ LOG(FATAL) << "Unsupported operation: op=" << op \ << " lhs=" << lhs << " rhs=" << rhs; \ @@ -370,7 +385,10 @@ struct BinaryDot { MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectNone, BinaryUseLhs)) \ MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryDot)) \ MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryDot)) \ - MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDot)) + MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDot)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectSrc, BinaryDot)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectSrc, BinaryDot)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectDst, BinaryDot)) ////////////////////////////////////////////////////////////////////////// // Defines reducer category. Each category is an empty structure. diff --git a/tests/compute/test_kernel.py b/tests/compute/test_kernel.py index 0d663b75cc52..8d159e481f5b 100644 --- a/tests/compute/test_kernel.py +++ b/tests/compute/test_kernel.py @@ -340,23 +340,13 @@ def _print_error(a, b): for lhs, rhs in product(target, target): if lhs == rhs: continue - for binary_op in ["add", "sub", "mul", "div"]: + for binary_op in ["add", "sub", "mul", "div", "dot"]: for reducer in ["sum", "max", "min", "prod", "mean"]: for broadcast in ["none", lhs, rhs]: for partial in [False, True]: _test(g, lhs, rhs, binary_op, reducer, partial, nid, broadcast=broadcast) - for lhs, rhs in product(["u", "v"], ["v", "e"]): - if lhs == rhs: - continue - binary_op = "dot" - for reducer in ["sum", "max", "min", "mean"]: - for broadcast in ["none", lhs, rhs]: - for partial in [False, True]: - _test(g, lhs, rhs, binary_op, reducer, partial, nid, - broadcast=broadcast) - if __name__ == '__main__': #test_copy_src_reduce() #test_copy_edge_reduce() From 9e94b310ec0ecf41fedead8cf52dd6a58d630430 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 14:29:00 +0000 Subject: [PATCH 29/32] Add unroll for some loop --- src/kernel/binary_reduce_common.h | 3 +++ src/kernel/cpu/backward_binary_reduce_impl.h | 4 ++++ src/kernel/cuda/backward_binary_reduce_impl.cuh | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index a79755ae0883..ee284e2cb9ef 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -198,6 +198,7 @@ struct BinaryDot { static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { DType out = 0; // simple vector dot vector +#pragma unroll for (int i = 0; i < len; i ++) out += lhs[i] * rhs[i]; @@ -220,6 +221,8 @@ struct BinaryDot { // - Div(Src, Dst), Div(Src, Edge), Div(Dst, Edge) // Div(Dst, Src), Div(Edge, Src), Div(Edge, Dst) // - UseLhs(Src, None), UseLhs(Edge, None) +// - Dot(Src, Dst), Dot(Src, Edge), Dot(Dst, Edge) +// - Dot(Dst, Src), Dot(Edge, Src), Dot(Edge, Dst) // Note that for commutative operators (e.g. Add and Mul), we only generate // kernels for lhs code smaller than rhs code. #define OP_TARGET_SWITCH(op, lhs, rhs, DType, OpType, LeftType, RightType, ...) 
\ diff --git a/src/kernel/cpu/backward_binary_reduce_impl.h b/src/kernel/cpu/backward_binary_reduce_impl.h index 0f209dc24d6a..7e5ea940aa73 100644 --- a/src/kernel/cpu/backward_binary_reduce_impl.h +++ b/src/kernel/cpu/backward_binary_reduce_impl.h @@ -55,6 +55,7 @@ struct BackwardBinaryReduce { DType* lhs_base = lhsoff + tx * len; DType* rhs_base = rhsoff + tx * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -64,6 +65,7 @@ struct BackwardBinaryReduce { } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -121,6 +123,7 @@ struct BackwardBinaryReduceBcast { DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -130,6 +133,7 @@ struct BackwardBinaryReduceBcast { } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index 644171826503..1bc98e6041bc 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -57,6 +57,7 @@ struct BackwardBinaryReduce { DType* lhs_base = lhsoff + tx * len; DType* rhs_base = rhsoff + tx * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -65,6 +66,7 @@ struct BackwardBinaryReduce { } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -123,6 +125,7 @@ struct BackwardBinaryReduceBcast { DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -131,6 +134,7 @@ struct BackwardBinaryReduceBcast { } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); From 89e61dd8a58b8cf3fc465b4ec32c4afeefdcb573 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 6 Sep 2019 08:48:06 +0000 Subject: [PATCH 30/32] Add some Opt for cuda backward of dot builtin. 
Backward is still slow for dot --- src/kernel/cpu/backward_binary_reduce_impl.h | 34 +++++++++++++------ .../cuda/backward_binary_reduce_impl.cuh | 30 ++++++++++++---- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/src/kernel/cpu/backward_binary_reduce_impl.h b/src/kernel/cpu/backward_binary_reduce_impl.h index 7e5ea940aa73..76a2bad062da 100644 --- a/src/kernel/cpu/backward_binary_reduce_impl.h +++ b/src/kernel/cpu/backward_binary_reduce_impl.h @@ -54,8 +54,17 @@ struct BackwardBinaryReduce { DType* lhs_base = lhsoff + tx * len; DType* rhs_base = rhsoff + tx * len; - if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { -#pragma unroll + if (Mode == binary_op::kGradBoth) { + for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); + DType grad = grad_lhs + grad_rhs; +#pragma omp atomic + gradlhsoff[tx * len + i] += grad; + } + } else if (Mode == binary_op::kGradLhs) { for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -63,9 +72,7 @@ struct BackwardBinaryReduce { #pragma omp atomic gradlhsoff[tx * len + i] += grad_lhs; } - } - if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { -#pragma unroll + } else if (Mode == binary_op::kGradRhs) { for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -122,8 +129,17 @@ struct BackwardBinaryReduceBcast { Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; - if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { -#pragma unroll + if (Mode == binary_op::kGradBoth) { + for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); + DType grad = grad_lhs + grad_rhs; +#pragma omp atomic + gradlhsoff[tx * len + i] += grad; + } + } else if (Mode == binary_op::kGradLhs) { for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -131,9 +147,7 @@ struct BackwardBinaryReduceBcast { #pragma omp atomic gradlhsoff[tx * len + i] += grad_lhs; } - } - if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { -#pragma unroll + } else if (Mode == binary_op::kGradRhs) { for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index 1bc98e6041bc..af26a143cf73 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -56,7 +56,17 @@ struct BackwardBinaryReduce { DType* lhs_base = lhsoff + tx * len; DType* rhs_base = rhsoff + tx * len; - if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { + if (Mode == binary_op::kGradBoth) { +#pragma unroll + for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + DType grad_rhs = grad_e * 
Functors::BackwardOpRhs(lhs, rhs, e); + DType grad = grad_lhs + grad_rhs; + AtomicAdd(gradlhsoff + tx * len + i, grad); + } + } else if (Mode == binary_op::kGradLhs) { #pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); @@ -64,8 +74,7 @@ struct BackwardBinaryReduce { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); } - } - if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { + } else if (Mode == binary_op::kGradRhs) { #pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); @@ -124,7 +133,17 @@ struct BackwardBinaryReduceBcast { Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; - if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { + if (Mode == binary_op::kGradBoth) { +#pragma unroll + for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); + DType grad = grad_lhs + grad_rhs; + AtomicAdd(gradlhsoff + tx * len + i, grad); + } + } else if (Mode == binary_op::kGradLhs) { #pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); @@ -132,8 +151,7 @@ struct BackwardBinaryReduceBcast { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); } - } - if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { + } else if (Mode == binary_op::kGradRhs) { #pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); From c96338a133ec4b4a1373fc970bb3f9bc8f32ca2f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 9 Sep 2019 02:50:55 +0000 Subject: [PATCH 31/32] Apply UnravelRavel opt for broadcast backward --- .../cuda/backward_binary_reduce_impl.cuh | 36 +++++-------------- src/kernel/cuda/binary_reduce_impl.cuh | 5 ++- 2 files changed, 10 insertions(+), 31 deletions(-) diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index de720b67c064..fddac5f2da14 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -88,24 +88,6 @@ struct BackwardBinaryReduce { } }; -// Convert flattened index to multi-dimension index (assume row-major). -__device__ __forceinline__ void Unravel( - int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) { - for (int d = 0; d < ndim; ++d) { - out[d] = (idx / stride[d]) % shape[d]; - } -} - -// Convert multi-dimension index to flattened index (assume row-major). -__device__ __forceinline__ int64_t Ravel( - const int64_t* idx, int ndim, const int64_t* shape, const int64_t* stride) { - int64_t out = 0; - for (int d = 0; d < ndim; ++d) { - out += min(idx[d], shape[d] - 1) * stride[d]; - } - return out; -} - // Minigun UDF to compute backward binary reduce with broadcasting. template struct BackwardBinaryReduceBcast { @@ -136,21 +118,19 @@ struct BackwardBinaryReduceBcast { DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len; DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len; DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len; - int64_t tmp[NDim]; // store unraveled idx. 
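    // UnravelRavel fuses the former Unravel + two Ravel calls: it decodes the
    // flat output index `tx` and accumulates the matching lhs/rhs offsets
    // (`lhs_add`, `rhs_add`) in a single pass, handling broadcast (size-1)
    // dimensions, so the unraveled index array no longer has to be stored.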
while (tx < gdata->out_len) { - Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); + int64_t lhs_add = 0; + int64_t rhs_add = 0; + UnravelRavel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, + gdata->lhs_shape, gdata->lhs_stride, + gdata->rhs_shape, gdata->rhs_stride, &lhs_add, &rhs_add); DType out = Functors::Read(outoff + tx); DType grad_out = Functors::Read(gradoutoff + tx); - DType e = Functors::Op( - lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, - rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, - len); + DType e = Functors::Op(lhsoff + lhs_add * len, rhsoff + rhs_add * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); - DType* lhs_base = lhsoff + - Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; - DType* rhs_base = rhsoff + - Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; + DType* lhs_base = lhsoff + lhs_add * len; + DType* rhs_base = rhsoff + rhs_add * len; if (Mode == binary_op::kGradBoth) { #pragma unroll for (int64_t i = 0; i < len; ++i) { diff --git a/src/kernel/cuda/binary_reduce_impl.cuh b/src/kernel/cuda/binary_reduce_impl.cuh index ea942fa104ee..ec1bf2e1c8e8 100644 --- a/src/kernel/cuda/binary_reduce_impl.cuh +++ b/src/kernel/cuda/binary_reduce_impl.cuh @@ -71,13 +71,12 @@ __device__ __forceinline__ void UnravelRavel( int64_t o_st = out_stride[d]; int64_t rhs_sh = rhs_shape[d]; int64_t rhs_st = rhs_stride[d]; - int64_t i = (idx / o_st) % o_sh; /* * Simplfied for rhs_out += min(i, rhs_sh - 1) * rhs_st; * rhs_sh be o_sh or 1 */ - if (rhs_sh > i) { + if (rhs_sh > i) { *rhs_out += i * rhs_st; } } @@ -89,7 +88,7 @@ __device__ __forceinline__ void UnravelRavel( int64_t o_st = out_stride[d]; int64_t lhs_sh = lhs_shape[d]; int64_t lhs_st = lhs_stride[d]; - + int64_t i = (idx / o_st) % o_sh; /* * Simplfied for lhs_out += min(i, lhs_sh - 1) * lhs_st; From cb7d1acc99ce71c3f22450e77e98a02384689684 Mon Sep 17 00:00:00 2001 From: Song Date: Wed, 11 Sep 2019 23:02:27 +0800 Subject: [PATCH 32/32] update docstring --- python/dgl/kernel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/dgl/kernel.py b/python/dgl/kernel.py index 4de76490e8d5..0a9444c036cf 100644 --- a/python/dgl/kernel.py +++ b/python/dgl/kernel.py @@ -10,6 +10,8 @@ def infer_binary_feature_shape(op, lhs, rhs): Parameter --------- + op : string + The binary_op name. lhs : dgl.ndarray.NDArray The lhs tensor. rhs : dgl.ndarray.NDArray