From 6219fe600523f17313e8842fc38ffd72a7d2adc8 Mon Sep 17 00:00:00 2001 From: yzh119 Date: Tue, 6 Aug 2019 16:13:02 +0800 Subject: [PATCH 01/32] upd --- docs/source/api/python/nn.mxnet.rst | 6 +-- docs/source/api/python/nn.pytorch.rst | 6 +-- python/dgl/nn/mxnet/softmax.py | 76 +++++++++++++++++++++------ python/dgl/nn/pytorch/softmax.py | 76 +++++++++++++++++++++------ 4 files changed, 124 insertions(+), 40 deletions(-) diff --git a/docs/source/api/python/nn.mxnet.rst b/docs/source/api/python/nn.mxnet.rst index 07de782c114a..162be4f61ba8 100644 --- a/docs/source/api/python/nn.mxnet.rst +++ b/docs/source/api/python/nn.mxnet.rst @@ -17,12 +17,10 @@ dgl.nn.mxnet.glob .. automodule:: dgl.nn.mxnet.glob :members: + :show-inheritance: dgl.nn.mxnet.softmax -------------------- .. automodule:: dgl.nn.mxnet.softmax - -.. autoclass:: dgl.nn.mxnet.softmax.EdgeSoftmax - :members: forward - :show-inheritance: + :members: edge_softmax diff --git a/docs/source/api/python/nn.pytorch.rst b/docs/source/api/python/nn.pytorch.rst index a6d4ba18ea09..24ba02d41530 100644 --- a/docs/source/api/python/nn.pytorch.rst +++ b/docs/source/api/python/nn.pytorch.rst @@ -14,7 +14,6 @@ dgl.nn.pytorch.conv dgl.nn.pytorch.glob ------------------- - .. automodule:: dgl.nn.pytorch.glob .. autoclass:: dgl.nn.pytorch.glob.SumPooling @@ -53,7 +52,4 @@ dgl.nn.pytorch.softmax ---------------------- .. automodule:: dgl.nn.pytorch.softmax - -.. autoclass:: dgl.nn.pytorch.softmax.EdgeSoftmax - :members: forward - :show-inheritance: + :members: edge_softmax diff --git a/python/dgl/nn/mxnet/softmax.py b/python/dgl/nn/mxnet/softmax.py index d5754b977ae5..aff19df0163d 100644 --- a/python/dgl/nn/mxnet/softmax.py +++ b/python/dgl/nn/mxnet/softmax.py @@ -32,12 +32,15 @@ def forward(self, score): """Forward function. Pseudo-code: - score = dgl.EData(g, score) - score_max = score.dst_max() # of type dgl.NData - score = score - score_max # edge_sub_dst, ret dgl.EData - score_sum = score.dst_sum() # of type dgl.NData - out = score / score_sum # edge_div_dst, ret dgl.EData - return out.data + + .. code:: python + + score = dgl.EData(g, score) + score_max = score.dst_max() # of type dgl.NData + score = score - score_max # edge_sub_dst, ret dgl.EData + score_sum = score.dst_sum() # of type dgl.NData + out = score / score_sum # edge_div_dst, ret dgl.EData + return out.data """ g = self.g.local_var() g.edata['s'] = score @@ -54,12 +57,15 @@ def backward(self, grad_out): """Backward function. Pseudo-code: - g, out = ctx.backward_cache - grad_out = dgl.EData(g, grad_out) - out = dgl.EData(g, out) - sds = out * grad_out # type dgl.EData - sds_sum = sds.dst_sum() # type dgl.NData - grad_score = sds - sds * sds_sum # multiple expressions + + .. code:: python + + g, out = ctx.backward_cache + grad_out = dgl.EData(g, grad_out) + out = dgl.EData(g, out) + sds = out * grad_out # type dgl.EData + sds_sum = sds.dst_sum() # type dgl.NData + grad_score = sds - sds * sds_sum # multiple expressions """ g = self.g.local_var() out, = self.saved_tensors # pylint: disable=access-member-before-definition, unpacking-non-sequence @@ -75,6 +81,19 @@ def backward(self, grad_out): def edge_softmax(graph, logits): r"""Compute edge softmax. + For a node :math:`i`, edge softmax is an operation of computing + + .. math:: + a_{ij} = \frac{\exp(z_{ij})}{\sum_{j\in\mathcal{N}(i)}\exp(z_{ij})} + + where :math:`z_{ij}` is a signal of edge :math:`j\rightarrow i`, also + called logits in the context of softmax. 
:math:`\mathcal{N}(i)` is + the set of nodes that have an edge to :math:`i`. + + An example of using edge softmax is in + `Graph Attention Network `__ where + the attention weights are computed with such an edge softmax operation. + Parameters ---------- graph : DGLGraph @@ -90,13 +109,40 @@ def edge_softmax(graph, logits): Notes ----- * Input shape: :math:`(N, *, 1)` where * means any number of - additional dimensions, :math:`N` is the number of edges. + additional dimensions, :math:`N` is the number of edges. * Return shape: :math:`(N, *, 1)` Examples -------- - >>> import dgl.function as fn - >>> attention = EdgeSoftmax(logits, graph) + >>> from dgl.nn.mxnet.softmax import edge_softmax + >>> import dgl + >>> from mxnet import nd + + Create a :code:`DGLGraph` object and initialize its edge features. + + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.add_edges([0, 0, 0, 1, 1, 2], [0, 1, 2, 1, 2, 2]) + >>> edata = nd.ones((6, 1)) + >>> edata + [[1.] + [1.] + [1.] + [1.] + [1.] + [1.]] + + + Apply edge softmax on g: + + >>> edge_softmax(g, edata) + [[1. ] + [0.5 ] + [0.33333334] + [0.5 ] + [0.33333334] + [0.33333334]] + """ softmax_op = EdgeSoftmax(graph) return softmax_op(logits) diff --git a/python/dgl/nn/pytorch/softmax.py b/python/dgl/nn/pytorch/softmax.py index bb770ff9435b..9eda68ba7577 100644 --- a/python/dgl/nn/pytorch/softmax.py +++ b/python/dgl/nn/pytorch/softmax.py @@ -29,12 +29,15 @@ def forward(ctx, g, score): """Forward function. Pseudo-code: - score = dgl.EData(g, score) - score_max = score.dst_max() # of type dgl.NData - score = score - score_max # edge_sub_dst, ret dgl.EData - score_sum = score.dst_sum() # of type dgl.NData - out = score / score_sum # edge_div_dst, ret dgl.EData - return out.data + + .. code:: python + + score = dgl.EData(g, score) + score_max = score.dst_max() # of type dgl.NData + score = score - score_max # edge_sub_dst, ret dgl.EData + score_sum = score.dst_sum() # of type dgl.NData + out = score / score_sum # edge_div_dst, ret dgl.EData + return out.data """ # remember to save the graph to backward cache before making it # a local variable @@ -55,13 +58,16 @@ def backward(ctx, grad_out): """Backward function. Pseudo-code: - g, out = ctx.backward_cache - grad_out = dgl.EData(g, grad_out) - out = dgl.EData(g, out) - sds = out * grad_out # type dgl.EData - sds_sum = sds.dst_sum() # type dgl.NData - grad_score = sds - sds * sds_sum # multiple expressions - return grad_score.data + + .. code:: python + + g, out = ctx.backward_cache + grad_out = dgl.EData(g, grad_out) + out = dgl.EData(g, out) + sds = out * grad_out # type dgl.EData + sds_sum = sds.dst_sum() # type dgl.NData + grad_score = sds - sds * sds_sum # multiple expressions + return grad_score.data """ g = ctx.backward_cache g = g.local_var() @@ -79,6 +85,19 @@ def backward(ctx, grad_out): def edge_softmax(graph, logits): r"""Compute edge softmax. + For a node :math:`i`, edge softmax is an operation of computing + + .. math:: + a_{ij} = \frac{\exp(z_{ij})}{\sum_{j\in\mathcal{N}(i)}\exp(z_{ij})} + + where :math:`z_{ij}` is a signal of edge :math:`j\rightarrow i`, also + called logits in the context of softmax. :math:`\mathcal{N}(i)` is + the set of nodes that have an edge to :math:`i`. + + An example of using edge softmax is in + `Graph Attention Network `__ where + the attention weights are computed with such an edge softmax operation. 
+ Parameters ---------- graph : DGLGraph @@ -94,12 +113,37 @@ def edge_softmax(graph, logits): Notes ----- * Input shape: :math:`(N, *, 1)` where * means any number of - additional dimensions, :math:`N` is the number of edges. + additional dimensions, :math:`N` is the number of edges. * Return shape: :math:`(N, *, 1)` Examples -------- - >>> import dgl.function as fn - >>> attention = EdgeSoftmax(logits, graph) + >>> from dgl.nn.pytorch.softmax import edge_softmax + >>> import dgl + >>> import torch as th + + Create a :code:`DGLGraph` object and initialize its edge features. + + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.add_edges([0, 0, 0, 1, 1, 2], [0, 1, 2, 1, 2, 2]) + >>> edata = th.ones(6, 1).float() + >>> edata + tensor([[1.], + [1.], + [1.], + [1.], + [1.], + [1.]]) + + Apply edge softmax on g: + + >>> edge_softmax(g, edata) + tensor([[1.0000], + [0.5000], + [0.3333], + [0.5000], + [0.3333], + [0.3333]]) """ return EdgeSoftmax.apply(graph, logits) From 66541cc6034f5e985a7707bcca32d737559e2b6f Mon Sep 17 00:00:00 2001 From: yzh119 Date: Wed, 7 Aug 2019 16:41:52 +0800 Subject: [PATCH 02/32] fig edgebatch edges --- python/dgl/udf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dgl/udf.py b/python/dgl/udf.py index 4a895140f644..e4123a87dba1 100644 --- a/python/dgl/udf.py +++ b/python/dgl/udf.py @@ -76,8 +76,8 @@ def edges(self): in the batch. """ if is_all(self._edges[2]): - self._edges[2] = utils.toindex(F.arange( - 0, self._g.number_of_edges())) + self._edges = self._edges[:2] + (utils.toindex(F.arange( + 0, self._g.number_of_nodes())),) u, v, eid = self._edges return (u.tousertensor(), v.tousertensor(), eid.tousertensor()) From 8258153723e41fd9c2ab72e2a0035882dfe7b48b Mon Sep 17 00:00:00 2001 From: yzh119 Date: Wed, 7 Aug 2019 17:58:08 +0800 Subject: [PATCH 03/32] add test --- python/dgl/graph.py | 2 +- python/dgl/udf.py | 2 +- tests/compute/test_udf.py | 82 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 tests/compute/test_udf.py diff --git a/python/dgl/graph.py b/python/dgl/graph.py index c0f20df11d88..55c75a6922f0 100644 --- a/python/dgl/graph.py +++ b/python/dgl/graph.py @@ -3258,7 +3258,7 @@ def filter_edges(self, predicate, edges=ALL): """ if is_all(edges): eid = ALL - u, v, _ = self._graph.edges() + u, v, _ = self._graph.edges('eid') elif isinstance(edges, tuple): u, v = edges u = utils.toindex(u) diff --git a/python/dgl/udf.py b/python/dgl/udf.py index e4123a87dba1..c121a7c4d321 100644 --- a/python/dgl/udf.py +++ b/python/dgl/udf.py @@ -77,7 +77,7 @@ def edges(self): """ if is_all(self._edges[2]): self._edges = self._edges[:2] + (utils.toindex(F.arange( - 0, self._g.number_of_nodes())),) + 0, self._g.number_of_edges())),) u, v, eid = self._edges return (u.tousertensor(), v.tousertensor(), eid.tousertensor()) diff --git a/tests/compute/test_udf.py b/tests/compute/test_udf.py new file mode 100644 index 000000000000..3c0b92484f2f --- /dev/null +++ b/tests/compute/test_udf.py @@ -0,0 +1,82 @@ +import backend as F +import dgl +import networkx as nx +import dgl.utils as utils +from dgl import DGLGraph, ALL +from dgl.udf import NodeBatch, EdgeBatch + +def test_node_batch(): + g = dgl.DGLGraph(nx.path_graph(20)) + feat = F.randn((g.number_of_nodes(), 10)) + g.ndata['x'] = feat + + # test all + v = ALL + n_repr = g.get_n_repr(v) + nbatch = NodeBatch(g, v, n_repr) + assert F.allclose(nbatch.data['x'], feat) + assert nbatch.mailbox is None + assert F.allclose(nbatch.nodes(), g.nodes()) 
+ assert nbatch.batch_size() == g.number_of_nodes() + assert len(nbatch) == g.number_of_nodes() + + # test partial + v = utils.toindex(F.tensor([0, 3, 5, 7, 9])) + n_repr = g.get_n_repr(v) + nbatch = NodeBatch(g, v, n_repr) + assert F.allclose(nbatch.data['x'], F.gather_row(feat, F.tensor([0, 3, 5, 7, 9]))) + assert nbatch.mailbox is None + assert F.allclose(nbatch.nodes(), F.tensor([0, 3, 5, 7, 9])) + assert nbatch.batch_size() == 5 + assert len(nbatch) == 5 + +def test_edge_batch(): + d = 10 + g = dgl.DGLGraph(nx.path_graph(20)) + nfeat = F.randn((g.number_of_nodes(), d)) + efeat = F.randn((g.number_of_edges(), d)) + g.ndata['x'] = nfeat + g.edata['x'] = efeat + + # test all + eid = ALL + u, v, _ = g._graph.edges('eid') + + src_data = g.get_n_repr(u) + edge_data = g.get_e_repr(eid) + dst_data = g.get_n_repr(v) + ebatch = EdgeBatch(g, (u, v, eid), src_data, edge_data, dst_data) + assert F.shape(ebatch.src['x'])[0] == g.number_of_edges() and\ + F.shape(ebatch.src['x'])[1] == d + assert F.shape(ebatch.dst['x'])[0] == g.number_of_edges() and\ + F.shape(ebatch.dst['x'])[1] == d + assert F.shape(ebatch.data['x'])[0] == g.number_of_edges() and\ + F.shape(ebatch.data['x'])[1] == d + assert F.allclose(ebatch.edges()[0], u.tousertensor()) + assert F.allclose(ebatch.edges()[1], v.tousertensor()) + assert F.allclose(ebatch.edges()[2], F.arange(0, g.number_of_edges())) + assert ebatch.batch_size() == g.number_of_edges() + assert len(ebatch) == g.number_of_edges() + + # test partial + eid = utils.toindex(F.tensor([0, 3, 5, 7, 11, 13, 15, 27])) + u, v, _ = g._graph.find_edges(eid) + src_data = g.get_n_repr(u) + edge_data = g.get_e_repr(eid) + dst_data = g.get_n_repr(v) + ebatch = EdgeBatch(g, (u, v, eid), src_data, edge_data, dst_data) + assert F.shape(ebatch.src['x'])[0] == 8 and\ + F.shape(ebatch.src['x'])[1] == d + assert F.shape(ebatch.dst['x'])[0] == 8 and\ + F.shape(ebatch.dst['x'])[1] == d + assert F.shape(ebatch.data['x'])[0] == 8 and\ + F.shape(ebatch.data['x'])[1] == d + assert F.allclose(ebatch.edges()[0], u.tousertensor()) + assert F.allclose(ebatch.edges()[1], v.tousertensor()) + assert F.allclose(ebatch.edges()[2], eid.tousertensor()) + assert ebatch.batch_size() == 8 + assert len(ebatch) == 8 + +if __name__ == '__main__': + test_node_batch() + test_edge_batch() From fb62f10deffe60d9da711442bb583951c931211d Mon Sep 17 00:00:00 2001 From: yzh119 Date: Wed, 7 Aug 2019 19:21:40 +0800 Subject: [PATCH 04/32] trigger From e291d6d1f9dbf90ce6dbba907ba6db486ddfa05c Mon Sep 17 00:00:00 2001 From: Song Date: Fri, 16 Aug 2019 11:41:54 +0800 Subject: [PATCH 05/32] Update README.md for pytorch PinSage example. Add noting that the PinSage model example under example/pytorch/recommendation only work with Python 3.6+ as its dataset loader depends on stanfordnlp package which work only with Python 3.6+. --- examples/pytorch/recommendation/README.md | 2 ++ examples/pytorch/recommendation/main.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/pytorch/recommendation/README.md b/examples/pytorch/recommendation/README.md index a5dda0c7b046..da21331f0cf5 100644 --- a/examples/pytorch/recommendation/README.md +++ b/examples/pytorch/recommendation/README.md @@ -1,6 +1,8 @@ # PinSage model NOTE: this version is not using NodeFlow yet. 
+ +This example only work with Python 3.6+ First, download and extract from https://dgl.ai.s3.us-east-2.amazonaws.com/dataset/ml-1m.tar.gz diff --git a/examples/pytorch/recommendation/main.py b/examples/pytorch/recommendation/main.py index 52beb768ef74..6a29d840b7c7 100644 --- a/examples/pytorch/recommendation/main.py +++ b/examples/pytorch/recommendation/main.py @@ -7,7 +7,6 @@ from rec.model.pinsage import PinSage from rec.datasets.movielens import MovieLens from rec.utils import cuda -from rec.adabound import AdaBound from dgl import DGLGraph import argparse From 5fdc2890749df0d313023e123615bed4927a2626 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 12:53:33 +0800 Subject: [PATCH 06/32] Provid a frame agnostic API to test nn modules on both CPU and CUDA side. 1. make dgl.nn.xxx frame agnostic 2. make test.backend include dgl.nn modules 3. modify test_edge_softmax of test/mxnet/test_nn.py and test/pytorch/test_nn.py work on both CPU and GPU --- python/dgl/__init__.py | 3 +- python/dgl/nn/__init__.py | 54 +++++++++++++++++++++++++++++++ python/dgl/nn/backend.py | 13 ++++++++ tests/backend/__init__.py | 5 +++ tests/backend/backend_unittest.py | 4 +++ tests/backend/mxnet/__init__.py | 3 ++ tests/backend/pytorch/__init__.py | 3 ++ tests/mxnet/test_nn.py | 9 +++--- tests/pytorch/test_nn.py | 33 ++++++++++--------- 9 files changed, 106 insertions(+), 21 deletions(-) create mode 100644 python/dgl/nn/backend.py diff --git a/python/dgl/__init__.py b/python/dgl/__init__.py index f6ec994e8e2c..2ad2c69dfc82 100644 --- a/python/dgl/__init__.py +++ b/python/dgl/__init__.py @@ -4,7 +4,6 @@ import socket from . import function -from . import nn from . import contrib from . import container from . import random @@ -22,3 +21,5 @@ from .transform import * from .propagate import * from .udf import NodeBatch, EdgeBatch + +from . import nn \ No newline at end of file diff --git a/python/dgl/nn/__init__.py b/python/dgl/nn/__init__.py index 242106dd407b..e95e9022631d 100644 --- a/python/dgl/nn/__init__.py +++ b/python/dgl/nn/__init__.py @@ -1 +1,55 @@ """Package for neural network common components.""" +from __future__ import absolute_import + +import sys +import os +import importlib + +from . import backend + +_enabled_nn_modules = set() + +def _gen_missing_nn_module(module, mod_name): + def _missing_nn_module(*args, **kwargs): + raise ImportError('nn.Module "%s" is not supported by backend "%s".' + ' You can switch to other backends by setting' + ' the DGLBACKEND environment.' % (module, mod_name)) + return _missing_nn_module + +def load_backend(mod_name): + """load backend module according to mod_name + Parameters + ---------- + mod_name : str + The DGL Backend name. + """ + mod = importlib.import_module('.%s' % mod_name, __name__) + thismod = sys.modules[__name__] + for nn_module in backend.__dict__.keys(): + if nn_module.startswith('__'): + # ignore python builtin attributes + continue + else: + # load functions and classes + if nn_module in mod.__dict__: + _enabled_nn_modules.add(nn_module) + setattr(thismod, nn_module, mod.__dict__[nn_module]) + else: + setattr(thismod, nn_module, _gen_missing_nn_module(nn_module, mod_name)) + +load_backend(os.environ.get('DGLBACKEND', 'pytorch').lower()) + +def is_enabled(nn_module): + """Return true if the nn.module is enabled by the current backend. + + Parameters + ---------- + nn_module : str + The nn.module name. + + Returns + ------- + bool + True if the nn_module is enabled by the current backend. 
+ """ + return nn_module in _enabled_nn_modules \ No newline at end of file diff --git a/python/dgl/nn/backend.py b/python/dgl/nn/backend.py new file mode 100644 index 000000000000..9f2700924582 --- /dev/null +++ b/python/dgl/nn/backend.py @@ -0,0 +1,13 @@ +"""This file defines high level nn.model interface provided by DGL. + +It is recommended the frameworks implement all the interfaces. However, it is +also OK to skip some. The generated backend module has an ``is_enabled`` function +that returns whether the interface is supported by the framework or not. +""" + +############################################################################### +# Softmax module + +def edge_softmax(graph, logits): + """Compute edge softmax""" + pass \ No newline at end of file diff --git a/tests/backend/__init__.py b/tests/backend/__init__.py index 5d388a4ae400..0825ae6e778c 100644 --- a/tests/backend/__init__.py +++ b/tests/backend/__init__.py @@ -1,4 +1,5 @@ from dgl.backend import * +from dgl.nn import * from . import backend_unittest import os import importlib @@ -22,6 +23,7 @@ _zeros = zeros _ones = ones _randn = randn +_rand = rand _tensor = tensor _arange = arange _full = full @@ -43,6 +45,9 @@ def ones(shape, dtype=float32, ctx=_default_context): def randn(shape): return copy_to(_randn(shape), _default_context) +def rand(shape): + return copy_to(_rand(shape), _default_context) + def tensor(data, dtype=None): if dtype is None: if is_tensor(data): diff --git a/tests/backend/backend_unittest.py b/tests/backend/backend_unittest.py index 3c7e90542421..00a391244d37 100644 --- a/tests/backend/backend_unittest.py +++ b/tests/backend/backend_unittest.py @@ -31,6 +31,10 @@ def randn(shape): """Generate a tensor with elements from standard normal distribution.""" pass +def rand(shape): + """Generate a tensor with elements from a discrete uniform distribution.""" + pass + def attach_grad(x): """Flag the tensor *in-place* to have its gradient computed in backward pass. 
diff --git a/tests/backend/mxnet/__init__.py b/tests/backend/mxnet/__init__.py index cd678699b8ca..5dcba2e5f0e5 100644 --- a/tests/backend/mxnet/__init__.py +++ b/tests/backend/mxnet/__init__.py @@ -25,6 +25,9 @@ def allclose(a, b, rtol=1e-4, atol=1e-4): def randn(shape): return nd.random.randn(*shape) +def rand(shape): + return nd.random.randint(0, 1, *shape) + def attach_grad(x): x.attach_grad() return x diff --git a/tests/backend/pytorch/__init__.py b/tests/backend/pytorch/__init__.py index c7ffa72f655b..690902a2ee71 100644 --- a/tests/backend/pytorch/__init__.py +++ b/tests/backend/pytorch/__init__.py @@ -18,6 +18,9 @@ def allclose(a, b, rtol=1e-4, atol=1e-4): def randn(shape): return th.randn(*shape) +def rand(shape): + return th.rand(*shape) + def attach_grad(x): if x.grad is not None: x.grad.zero_() diff --git a/tests/mxnet/test_nn.py b/tests/mxnet/test_nn.py index 4432ef2b5a65..d9bef57cc53c 100644 --- a/tests/mxnet/test_nn.py +++ b/tests/mxnet/test_nn.py @@ -3,6 +3,7 @@ import numpy as np import dgl import dgl.nn.mxnet as nn +import backend as F from mxnet import autograd, gluon def check_close(a, b): @@ -163,16 +164,16 @@ def uniform_attention(g, shape): def test_edge_softmax(): # Basic g = dgl.DGLGraph(nx.path_graph(3)) - edata = mx.nd.ones((g.number_of_edges(), 1)) - a = nn.edge_softmax(g, edata) + edata = F.ones((g.number_of_edges(), 1)) + a = F.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert np.allclose(a.asnumpy(), uniform_attention(g, a.shape).asnumpy(), 1e-4, 1e-4) # Test higher dimension case - edata = mx.nd.ones((g.number_of_edges(), 3, 1)) - a = nn.edge_softmax(g, edata) + edata = F.ones((g.number_of_edges(), 3, 1)) + a = F.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert np.allclose(a.asnumpy(), uniform_attention(g, a.shape).asnumpy(), diff --git a/tests/pytorch/test_nn.py b/tests/pytorch/test_nn.py index baea5f7e85fe..c485067c86ce 100644 --- a/tests/pytorch/test_nn.py +++ b/tests/pytorch/test_nn.py @@ -2,6 +2,7 @@ import networkx as nx import dgl import dgl.nn.pytorch as nn +import backend as F from copy import deepcopy import numpy as np @@ -187,18 +188,18 @@ def uniform_attention(g, shape): def test_edge_softmax(): # Basic g = dgl.DGLGraph(nx.path_graph(3)) - edata = th.ones(g.number_of_edges(), 1) - a = nn.edge_softmax(g, edata) + edata = F.ones((g.number_of_edges(), 1)) + a = F.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 - assert th.allclose(a, uniform_attention(g, a.shape)) + assert F.allclose(a, uniform_attention(g, a.shape)) # Test higher dimension case - edata = th.ones(g.number_of_edges(), 3, 1) - a = nn.edge_softmax(g, edata) + edata = F.ones((g.number_of_edges(), 3, 1)) + a = F.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 - assert th.allclose(a, uniform_attention(g, a.shape)) + assert F.allclose(a, uniform_attention(g, a.shape)) # Test both forward and backward with PyTorch built-in softmax. 
g = dgl.DGLGraph() @@ -208,21 +209,21 @@ def test_edge_softmax(): for j in range(30): g.add_edge(i, j) - score = th.rand(900, 1) + score = F.rand((900, 1)) score.requires_grad_() - grad = th.rand(900, 1) - y = th.softmax(score.view(30, 30), dim=0).view(-1, 1) + grad = F.rand((900, 1)) + y = F.softmax(score.view(30, 30), dim=0).view(-1, 1) y.backward(grad) grad_score = score.grad score.grad.zero_() - y_dgl = nn.edge_softmax(g, score) + y_dgl = F.edge_softmax(g, score) assert len(g.ndata) == 0 assert len(g.edata) == 0 # check forward - assert th.allclose(y_dgl, y) + assert F.allclose(y_dgl, y) y_dgl.backward(grad) # checkout gradient - assert th.allclose(score.grad, grad_score) + assert F.allclose(score.grad, grad_score) print(score.grad[:10], grad_score[:10]) # Test 2 @@ -231,18 +232,18 @@ def generate_rand_graph(n): return dgl.DGLGraph(arr, readonly=True) g = generate_rand_graph(50) - a1 = th.randn(g.number_of_edges(), 1).requires_grad_() + a1 = F.randn((g.number_of_edges(), 1)).requires_grad_() a2 = a1.clone().detach().requires_grad_() g.edata['s'] = a1 - g.group_apply_edges('dst', lambda edges: {'ss':th.softmax(edges.data['s'], 1)}) + g.group_apply_edges('dst', lambda edges: {'ss':F.softmax(edges.data['s'], 1)}) g.edata['ss'].sum().backward() - builtin_sm = nn.edge_softmax(g, a2) + builtin_sm = F.edge_softmax(g, a2) builtin_sm.sum().backward() print(a1.grad - a2.grad) assert len(g.ndata) == 0 assert len(g.edata) == 2 - assert th.allclose(a1.grad, a2.grad, rtol=1e-4, atol=1e-4) # Follow tolerance in unittest backend + assert F.allclose(a1.grad, a2.grad, rtol=1e-4, atol=1e-4) # Follow tolerance in unittest backend if __name__ == '__main__': From 2e89c6f72e932cf7ca4f2e73d5faeeef7d9682b5 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 13:21:43 +0800 Subject: [PATCH 07/32] Fix style --- python/dgl/__init__.py | 2 +- python/dgl/nn/__init__.py | 6 +++--- python/dgl/nn/backend.py | 16 ++++++++++++++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/python/dgl/__init__.py b/python/dgl/__init__.py index 2ad2c69dfc82..e8b1f0d0e0e5 100644 --- a/python/dgl/__init__.py +++ b/python/dgl/__init__.py @@ -22,4 +22,4 @@ from .propagate import * from .udf import NodeBatch, EdgeBatch -from . import nn \ No newline at end of file +from . import nn diff --git a/python/dgl/nn/__init__.py b/python/dgl/nn/__init__.py index e95e9022631d..aae9c27337d9 100644 --- a/python/dgl/nn/__init__.py +++ b/python/dgl/nn/__init__.py @@ -17,7 +17,7 @@ def _missing_nn_module(*args, **kwargs): return _missing_nn_module def load_backend(mod_name): - """load backend module according to mod_name + """load backend module according to mod_name Parameters ---------- mod_name : str @@ -25,7 +25,7 @@ def load_backend(mod_name): """ mod = importlib.import_module('.%s' % mod_name, __name__) thismod = sys.modules[__name__] - for nn_module in backend.__dict__.keys(): + for nn_module in backend.__dict__: if nn_module.startswith('__'): # ignore python builtin attributes continue @@ -52,4 +52,4 @@ def is_enabled(nn_module): bool True if the nn_module is enabled by the current backend. 
""" - return nn_module in _enabled_nn_modules \ No newline at end of file + return nn_module in _enabled_nn_modules diff --git a/python/dgl/nn/backend.py b/python/dgl/nn/backend.py index 9f2700924582..e7b6176b5e4b 100644 --- a/python/dgl/nn/backend.py +++ b/python/dgl/nn/backend.py @@ -9,5 +9,17 @@ # Softmax module def edge_softmax(graph, logits): - """Compute edge softmax""" - pass \ No newline at end of file + """Compute edge softmax + Parameters + ---------- + graph : DGLGraph + The graph to perform edge softmax + logits : Tensor + The input edge feature + + Returns + ------- + Tensor + Softmax value + """ + pass From b1af382a766adb65ea3508485d9c786f7ff45a34 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 13:39:46 +0800 Subject: [PATCH 08/32] Delete unused code --- python/dgl/nn/__init__.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/python/dgl/nn/__init__.py b/python/dgl/nn/__init__.py index aae9c27337d9..4be0dec86c5c 100644 --- a/python/dgl/nn/__init__.py +++ b/python/dgl/nn/__init__.py @@ -7,8 +7,6 @@ from . import backend -_enabled_nn_modules = set() - def _gen_missing_nn_module(module, mod_name): def _missing_nn_module(*args, **kwargs): raise ImportError('nn.Module "%s" is not supported by backend "%s".' @@ -32,24 +30,8 @@ def load_backend(mod_name): else: # load functions and classes if nn_module in mod.__dict__: - _enabled_nn_modules.add(nn_module) setattr(thismod, nn_module, mod.__dict__[nn_module]) else: setattr(thismod, nn_module, _gen_missing_nn_module(nn_module, mod_name)) load_backend(os.environ.get('DGLBACKEND', 'pytorch').lower()) - -def is_enabled(nn_module): - """Return true if the nn.module is enabled by the current backend. - - Parameters - ---------- - nn_module : str - The nn.module name. - - Returns - ------- - bool - True if the nn_module is enabled by the current backend. - """ - return nn_module in _enabled_nn_modules From 85630e33ccde637fa463de368b12edee63718bf9 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 15:47:12 +0800 Subject: [PATCH 09/32] Make agnostic test only related to tests/backend 1. clear all agnostic related code in dgl.nn 2. make test_graph_conv agnostic to cpu/gpu --- python/dgl/__init__.py | 3 +-- python/dgl/nn/__init__.py | 38 +------------------------------------- python/dgl/nn/backend.py | 25 ------------------------- tests/backend/__init__.py | 6 ++++++ tests/mxnet/test_nn.py | 24 ++++++++++++------------ tests/pytorch/test_nn.py | 35 +++++++++++++++++++++-------------- 6 files changed, 41 insertions(+), 90 deletions(-) delete mode 100644 python/dgl/nn/backend.py diff --git a/python/dgl/__init__.py b/python/dgl/__init__.py index e8b1f0d0e0e5..f6ec994e8e2c 100644 --- a/python/dgl/__init__.py +++ b/python/dgl/__init__.py @@ -4,6 +4,7 @@ import socket from . import function +from . import nn from . import contrib from . import container from . import random @@ -21,5 +22,3 @@ from .transform import * from .propagate import * from .udf import NodeBatch, EdgeBatch - -from . import nn diff --git a/python/dgl/nn/__init__.py b/python/dgl/nn/__init__.py index 4be0dec86c5c..90d4522c3fd4 100644 --- a/python/dgl/nn/__init__.py +++ b/python/dgl/nn/__init__.py @@ -1,37 +1 @@ -"""Package for neural network common components.""" -from __future__ import absolute_import - -import sys -import os -import importlib - -from . import backend - -def _gen_missing_nn_module(module, mod_name): - def _missing_nn_module(*args, **kwargs): - raise ImportError('nn.Module "%s" is not supported by backend "%s".' 
- ' You can switch to other backends by setting' - ' the DGLBACKEND environment.' % (module, mod_name)) - return _missing_nn_module - -def load_backend(mod_name): - """load backend module according to mod_name - Parameters - ---------- - mod_name : str - The DGL Backend name. - """ - mod = importlib.import_module('.%s' % mod_name, __name__) - thismod = sys.modules[__name__] - for nn_module in backend.__dict__: - if nn_module.startswith('__'): - # ignore python builtin attributes - continue - else: - # load functions and classes - if nn_module in mod.__dict__: - setattr(thismod, nn_module, mod.__dict__[nn_module]) - else: - setattr(thismod, nn_module, _gen_missing_nn_module(nn_module, mod_name)) - -load_backend(os.environ.get('DGLBACKEND', 'pytorch').lower()) +"""Package for neural network common components.""" \ No newline at end of file diff --git a/python/dgl/nn/backend.py b/python/dgl/nn/backend.py deleted file mode 100644 index e7b6176b5e4b..000000000000 --- a/python/dgl/nn/backend.py +++ /dev/null @@ -1,25 +0,0 @@ -"""This file defines high level nn.model interface provided by DGL. - -It is recommended the frameworks implement all the interfaces. However, it is -also OK to skip some. The generated backend module has an ``is_enabled`` function -that returns whether the interface is supported by the framework or not. -""" - -############################################################################### -# Softmax module - -def edge_softmax(graph, logits): - """Compute edge softmax - Parameters - ---------- - graph : DGLGraph - The graph to perform edge softmax - logits : Tensor - The input edge feature - - Returns - ------- - Tensor - Softmax value - """ - pass diff --git a/tests/backend/__init__.py b/tests/backend/__init__.py index 0825ae6e778c..dd189bdec706 100644 --- a/tests/backend/__init__.py +++ b/tests/backend/__init__.py @@ -36,6 +36,12 @@ } _default_context = _context_dict[_default_context_str] +def ctx(): + return _default_context + +def gpu_ctx(): + return (_default_context_str == 'gpu') + def zeros(shape, dtype=float32, ctx=_default_context): return _zeros(shape, dtype, ctx) diff --git a/tests/mxnet/test_nn.py b/tests/mxnet/test_nn.py index d9bef57cc53c..ffff72a0ae45 100644 --- a/tests/mxnet/test_nn.py +++ b/tests/mxnet/test_nn.py @@ -16,19 +16,19 @@ def _AXWb(A, X, W, b): def test_graph_conv(): g = dgl.DGLGraph(nx.path_graph(3)) - adj = g.adjacency_matrix() - ctx = mx.cpu(0) + ctx = F.ctx() + adj = g.adjacency_matrix(ctx=ctx) conv = nn.GraphConv(5, 2, norm=False, bias=True) conv.initialize(ctx=ctx) # test#1: basic - h0 = mx.nd.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 check_close(h1, _AXWb(adj, h0, conv.weight, conv.bias)) # test#2: more-dim - h0 = mx.nd.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 @@ -38,12 +38,12 @@ def test_graph_conv(): conv.initialize(ctx=ctx) # test#3: basic - h0 = mx.nd.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 # test#4: basic - h0 = mx.nd.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 @@ -53,23 +53,23 @@ def test_graph_conv(): with autograd.train_mode(): # test#3: basic - h0 = mx.nd.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 # test#4: basic - h0 = mx.nd.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert 
len(g.edata) == 0 # test not override features - g.ndata["h"] = 2 * mx.nd.ones((3, 1)) + g.ndata["h"] = 2 * F.ones((3, 1)) h1 = conv(h0, g) assert len(g.ndata) == 1 assert len(g.edata) == 0 assert "h" in g.ndata - check_close(g.ndata['h'], 2 * mx.nd.ones((3, 1))) + check_close(g.ndata['h'], 2 * F.ones((3, 1))) def test_set2set(): g = dgl.DGLGraph(nx.path_graph(10)) @@ -165,7 +165,7 @@ def test_edge_softmax(): # Basic g = dgl.DGLGraph(nx.path_graph(3)) edata = F.ones((g.number_of_edges(), 1)) - a = F.edge_softmax(g, edata) + a = nn.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert np.allclose(a.asnumpy(), uniform_attention(g, a.shape).asnumpy(), @@ -173,7 +173,7 @@ def test_edge_softmax(): # Test higher dimension case edata = F.ones((g.number_of_edges(), 3, 1)) - a = F.edge_softmax(g, edata) + a = nn.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert np.allclose(a.asnumpy(), uniform_attention(g, a.shape).asnumpy(), diff --git a/tests/pytorch/test_nn.py b/tests/pytorch/test_nn.py index c485067c86ce..7a37e8806456 100644 --- a/tests/pytorch/test_nn.py +++ b/tests/pytorch/test_nn.py @@ -15,43 +15,50 @@ def _AXWb(A, X, W, b): def test_graph_conv(): g = dgl.DGLGraph(nx.path_graph(3)) - adj = g.adjacency_matrix() + ctx = F.ctx() + adj = g.adjacency_matrix(ctx=ctx) conv = nn.GraphConv(5, 2, norm=False, bias=True) + if F.gpu_ctx(): + conv.cuda() print(conv) # test#1: basic - h0 = th.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 - assert th.allclose(h1, _AXWb(adj, h0, conv.weight, conv.bias)) + assert F.allclose(h1, _AXWb(adj, h0, conv.weight, conv.bias)) # test#2: more-dim - h0 = th.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 - assert th.allclose(h1, _AXWb(adj, h0, conv.weight, conv.bias)) + assert F.allclose(h1, _AXWb(adj, h0, conv.weight, conv.bias)) conv = nn.GraphConv(5, 2) + if F.gpu_ctx(): + conv.cuda() # test#3: basic - h0 = th.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 # test#4: basic - h0 = th.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 conv = nn.GraphConv(5, 2) + if F.gpu_ctx(): + conv.cuda() # test#3: basic - h0 = th.ones((3, 5)) + h0 = F.ones((3, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 # test#4: basic - h0 = th.ones((3, 5, 5)) + h0 = F.ones((3, 5, 5)) h1 = conv(h0, g) assert len(g.ndata) == 0 assert len(g.edata) == 0 @@ -60,7 +67,7 @@ def test_graph_conv(): old_weight = deepcopy(conv.weight.data) conv.reset_parameters() new_weight = conv.weight.data - assert not th.allclose(old_weight, new_weight) + assert not F.allclose(old_weight, new_weight) def test_set2set(): g = dgl.DGLGraph(nx.path_graph(10)) @@ -189,14 +196,14 @@ def test_edge_softmax(): # Basic g = dgl.DGLGraph(nx.path_graph(3)) edata = F.ones((g.number_of_edges(), 1)) - a = F.edge_softmax(g, edata) + a = nn.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert F.allclose(a, uniform_attention(g, a.shape)) # Test higher dimension case edata = F.ones((g.number_of_edges(), 3, 1)) - a = F.edge_softmax(g, edata) + a = nn.edge_softmax(g, edata) assert len(g.ndata) == 0 assert len(g.edata) == 0 assert F.allclose(a, uniform_attention(g, a.shape)) @@ -216,7 +223,7 @@ def test_edge_softmax(): y.backward(grad) grad_score = score.grad score.grad.zero_() - y_dgl = F.edge_softmax(g, score) 
+ y_dgl = nn.edge_softmax(g, score) assert len(g.ndata) == 0 assert len(g.edata) == 0 # check forward @@ -238,7 +245,7 @@ def generate_rand_graph(n): g.group_apply_edges('dst', lambda edges: {'ss':F.softmax(edges.data['s'], 1)}) g.edata['ss'].sum().backward() - builtin_sm = F.edge_softmax(g, a2) + builtin_sm = nn.edge_softmax(g, a2) builtin_sm.sum().backward() print(a1.grad - a2.grad) assert len(g.ndata) == 0 From 874352f4ce00de9e56f5517ca361dedca9e64716 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 15:53:16 +0800 Subject: [PATCH 10/32] Fix code style --- python/dgl/nn/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dgl/nn/__init__.py b/python/dgl/nn/__init__.py index 90d4522c3fd4..242106dd407b 100644 --- a/python/dgl/nn/__init__.py +++ b/python/dgl/nn/__init__.py @@ -1 +1 @@ -"""Package for neural network common components.""" \ No newline at end of file +"""Package for neural network common components.""" From 47e468ac55134a09a71e4a193563ee1dcdcf602f Mon Sep 17 00:00:00 2001 From: yzh119 Date: Mon, 19 Aug 2019 17:21:47 +0800 Subject: [PATCH 11/32] fix --- python/dgl/nn/mxnet/glob.py | 11 ----------- python/dgl/nn/pytorch/glob.py | 24 +++++++++++------------- 2 files changed, 11 insertions(+), 24 deletions(-) diff --git a/python/dgl/nn/mxnet/glob.py b/python/dgl/nn/mxnet/glob.py index 1f0863f669fa..32d33df9402e 100644 --- a/python/dgl/nn/mxnet/glob.py +++ b/python/dgl/nn/mxnet/glob.py @@ -191,13 +191,6 @@ def __init__(self, gate_nn, feat_nn=None): self.gate_nn = gate_nn self.feat_nn = feat_nn - self._reset_parameters() - - def _reset_parameters(self): - self.gate_nn.initialize(mx.init.Xavier()) - if self.feat_nn: - self.feat_nn.initialize(mx.init.Xavier()) - def forward(self, feat, graph): r"""Compute global attention pooling. @@ -265,10 +258,6 @@ def __init__(self, input_dim, n_iters, n_layers): with self.name_scope(): self.lstm = gluon.rnn.LSTM( self.input_dim, num_layers=n_layers, input_size=self.output_dim) - self._reset_parameters() - - def _reset_parameters(self): - self.lstm.initialize(mx.init.Xavier()) def forward(self, feat, graph): r"""Compute set2set pooling. diff --git a/python/dgl/nn/pytorch/glob.py b/python/dgl/nn/pytorch/glob.py index 05254f27d4c6..3994bbc8844e 100644 --- a/python/dgl/nn/pytorch/glob.py +++ b/python/dgl/nn/pytorch/glob.py @@ -178,9 +178,9 @@ def __init__(self, gate_nn, feat_nn=None): super(GlobalAttentionPooling, self).__init__() self.gate_nn = gate_nn self.feat_nn = feat_nn - self._reset_parameters() + self.reset_parameters() - def _reset_parameters(self): + def reset_parameters(self): for p in self.gate_nn.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) @@ -256,12 +256,10 @@ def __init__(self, input_dim, n_iters, n_layers): self.n_iters = n_iters self.n_layers = n_layers self.lstm = th.nn.LSTM(self.output_dim, self.input_dim, n_layers) - self._reset_parameters() + self.reset_parameters() - def _reset_parameters(self): - for p in self.lstm.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) + def reset_parameters(self): + self.lstm.reset_parameters() def forward(self, feat, graph): r"""Compute set2set pooling. 
@@ -342,9 +340,9 @@ def __init__(self, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0.): self.dropa = nn.Dropout(dropouta) self.norm_in = nn.LayerNorm(d_model) self.norm_inter = nn.LayerNorm(d_model) - self._reset_parameters() + self.reset_parameters() - def _reset_parameters(self): + def reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) @@ -441,9 +439,9 @@ def __init__(self, m, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0. self.mha = nn.ModuleList([ MultiHeadAttention(d_model, num_heads, d_head, d_ff, dropouth=dropouth, dropouta=dropouta) for _ in range(2)]) - self._reset_parameters() + self.reset_parameters() - def _reset_parameters(self): + def reset_parameters(self): nn.init.xavier_uniform_(self.inducing_points) def forward(self, feat, lengths): @@ -492,9 +490,9 @@ def __init__(self, k, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0. nn.Dropout(dropouth), nn.Linear(d_ff, d_model) ) - self._reset_parameters() + self.reset_parameters() - def _reset_parameters(self): + def reset_parameters(self): nn.init.xavier_uniform_(self.seed_vectors) def forward(self, feat, lengths): From 10e1d274d4f6fa5c783e2ce5c1aca9bd438169dc Mon Sep 17 00:00:00 2001 From: yzh119 Date: Mon, 19 Aug 2019 17:41:37 +0800 Subject: [PATCH 12/32] doc --- python/dgl/nn/pytorch/glob.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/dgl/nn/pytorch/glob.py b/python/dgl/nn/pytorch/glob.py index 3994bbc8844e..26fd52444515 100644 --- a/python/dgl/nn/pytorch/glob.py +++ b/python/dgl/nn/pytorch/glob.py @@ -181,6 +181,7 @@ def __init__(self, gate_nn, feat_nn=None): self.reset_parameters() def reset_parameters(self): + """Reinitialize learnable parameters.""" for p in self.gate_nn.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) @@ -259,6 +260,7 @@ def __init__(self, input_dim, n_iters, n_layers): self.reset_parameters() def reset_parameters(self): + """Reinitialize learnable parameters.""" self.lstm.reset_parameters() def forward(self, feat, graph): @@ -343,6 +345,7 @@ def __init__(self, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0.): self.reset_parameters() def reset_parameters(self): + """Reinitialize learnable parameters.""" for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) @@ -442,6 +445,7 @@ def __init__(self, m, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0. self.reset_parameters() def reset_parameters(self): + """Reinitialize learnable parameters.""" nn.init.xavier_uniform_(self.inducing_points) def forward(self, feat, lengths): @@ -493,6 +497,7 @@ def __init__(self, k, d_model, num_heads, d_head, d_ff, dropouth=0., dropouta=0. self.reset_parameters() def reset_parameters(self): + """Reinitialize learnable parameters.""" nn.init.xavier_uniform_(self.seed_vectors) def forward(self, feat, lengths): From 4bfe71cbbf21cb5f7d03e254fb83112a4156bab4 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 21:31:54 +0800 Subject: [PATCH 13/32] Make all test code under tests.mxnet/pytorch.test_nn.py work on both CPU and GPU. 
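
The updated tests take their device from the shared test backend instead of
hard-coding CPU constructors: tensors are created through F.ones/F.randn so
they land on the configured device, PyTorch modules are moved with .cuda()
only when F.gpu_ctx() is true, and the MXNet modules are initialized with
ctx=F.ctx(). A minimal sketch of the pattern (PyTorch flavour; `backend` is
the test-only package under tests/backend, importable as F once the test
suite's path setup is in place):

    import networkx as nx
    import dgl
    import dgl.nn.pytorch as nn
    import backend as F            # tests/backend, device chosen by the suite

    g = dgl.DGLGraph(nx.path_graph(3))
    conv = nn.GraphConv(5, 2)
    if F.gpu_ctx():                # true when the tests are configured for GPU
        conv.cuda()                # move module parameters onto the GPU
    h0 = F.ones((3, 5))            # created on F.ctx(), same device as conv
    h1 = conv(h0, g)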
--- tests/mxnet/test_nn.py | 53 +++++++++++++++++--------------- tests/pytorch/test_nn.py | 66 ++++++++++++++++++++++------------------ 2 files changed, 65 insertions(+), 54 deletions(-) diff --git a/tests/mxnet/test_nn.py b/tests/mxnet/test_nn.py index ffff72a0ae45..e151958f8d31 100644 --- a/tests/mxnet/test_nn.py +++ b/tests/mxnet/test_nn.py @@ -73,34 +73,38 @@ def test_graph_conv(): def test_set2set(): g = dgl.DGLGraph(nx.path_graph(10)) + ctx = F.ctx() s2s = nn.Set2Set(5, 3, 3) # hidden size 5, 3 iters, 3 layers + s2s.initialize(ctx=ctx) print(s2s) # test#1: basic - h0 = mx.nd.random.randn(g.number_of_nodes(), 5) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = s2s(h0, g) assert h1.shape[0] == 10 and h1.ndim == 1 # test#2: batched graph bg = dgl.batch([g, g, g]) - h0 = mx.nd.random.randn(bg.number_of_nodes(), 5) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = s2s(h0, bg) assert h1.shape[0] == 3 and h1.shape[1] == 10 and h1.ndim == 2 def test_glob_att_pool(): g = dgl.DGLGraph(nx.path_graph(10)) + ctx = F.ctx() gap = nn.GlobalAttentionPooling(gluon.nn.Dense(1), gluon.nn.Dense(10)) + gap.initialize(ctx=ctx) print(gap) # test#1: basic - h0 = mx.nd.random.randn(g.number_of_nodes(), 5) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = gap(h0, g) assert h1.shape[0] == 10 and h1.ndim == 1 # test#2: batched graph bg = dgl.batch([g, g, g, g]) - h0 = mx.nd.random.randn(bg.number_of_nodes(), 5) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = gap(h0, bg) assert h1.shape[0] == 4 and h1.shape[1] == 10 and h1.ndim == 2 @@ -114,48 +118,47 @@ def test_simple_pool(): print(sum_pool, avg_pool, max_pool, sort_pool) # test#1: basic - h0 = mx.nd.random.randn(g.number_of_nodes(), 5) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = sum_pool(h0, g) - check_close(h1, mx.nd.sum(h0, 0)) + check_close(h1, F.sum(h0, 0)) h1 = avg_pool(h0, g) - check_close(h1, mx.nd.mean(h0, 0)) + check_close(h1, F.mean(h0, 0)) h1 = max_pool(h0, g) - check_close(h1, mx.nd.max(h0, 0)) + check_close(h1, F.max(h0, 0)) h1 = sort_pool(h0, g) assert h1.shape[0] == 10 * 5 and h1.ndim == 1 # test#2: batched graph g_ = dgl.DGLGraph(nx.path_graph(5)) bg = dgl.batch([g, g_, g, g_, g]) - h0 = mx.nd.random.randn(bg.number_of_nodes(), 5) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = sum_pool(h0, bg) - truth = mx.nd.stack(mx.nd.sum(h0[:15], 0), - mx.nd.sum(h0[15:20], 0), - mx.nd.sum(h0[20:35], 0), - mx.nd.sum(h0[35:40], 0), - mx.nd.sum(h0[40:55], 0), axis=0) + truth = mx.nd.stack(F.sum(h0[:15], 0), + F.sum(h0[15:20], 0), + F.sum(h0[20:35], 0), + F.sum(h0[35:40], 0), + F.sum(h0[40:55], 0), axis=0) check_close(h1, truth) h1 = avg_pool(h0, bg) - truth = mx.nd.stack(mx.nd.mean(h0[:15], 0), - mx.nd.mean(h0[15:20], 0), - mx.nd.mean(h0[20:35], 0), - mx.nd.mean(h0[35:40], 0), - mx.nd.mean(h0[40:55], 0), axis=0) + truth = mx.nd.stack(F.mean(h0[:15], 0), + F.mean(h0[15:20], 0), + F.mean(h0[20:35], 0), + F.mean(h0[35:40], 0), + F.mean(h0[40:55], 0), axis=0) check_close(h1, truth) h1 = max_pool(h0, bg) - truth = mx.nd.stack(mx.nd.max(h0[:15], 0), - mx.nd.max(h0[15:20], 0), - mx.nd.max(h0[20:35], 0), - mx.nd.max(h0[35:40], 0), - mx.nd.max(h0[40:55], 0), axis=0) + truth = mx.nd.stack(F.max(h0[:15], 0), + F.max(h0[15:20], 0), + F.max(h0[20:35], 0), + F.max(h0[35:40], 0), + F.max(h0[40:55], 0), axis=0) check_close(h1, truth) h1 = sort_pool(h0, bg) assert h1.shape[0] == 5 and h1.shape[1] == 10 * 5 and h1.ndim == 2 - def uniform_attention(g, shape): a = mx.nd.ones(shape) target_shape = (g.number_of_edges(),) + (1,) * (len(shape) - 1) diff --git a/tests/pytorch/test_nn.py 
b/tests/pytorch/test_nn.py index 7a37e8806456..b38913b2c9b6 100644 --- a/tests/pytorch/test_nn.py +++ b/tests/pytorch/test_nn.py @@ -73,10 +73,12 @@ def test_set2set(): g = dgl.DGLGraph(nx.path_graph(10)) s2s = nn.Set2Set(5, 3, 3) # hidden size 5, 3 iters, 3 layers + if F.gpu_ctx(): + s2s.cuda() print(s2s) # test#1: basic - h0 = th.rand(g.number_of_nodes(), 5) + h0 = F.rand((g.number_of_nodes(), 5)) h1 = s2s(h0, g) assert h1.shape[0] == 10 and h1.dim() == 1 @@ -84,7 +86,7 @@ def test_set2set(): g1 = dgl.DGLGraph(nx.path_graph(11)) g2 = dgl.DGLGraph(nx.path_graph(5)) bg = dgl.batch([g, g1, g2]) - h0 = th.rand(bg.number_of_nodes(), 5) + h0 = F.rand((bg.number_of_nodes(), 5)) h1 = s2s(h0, bg) assert h1.shape[0] == 3 and h1.shape[1] == 10 and h1.dim() == 2 @@ -92,16 +94,18 @@ def test_glob_att_pool(): g = dgl.DGLGraph(nx.path_graph(10)) gap = nn.GlobalAttentionPooling(th.nn.Linear(5, 1), th.nn.Linear(5, 10)) + if F.gpu_ctx(): + gap.cuda() print(gap) # test#1: basic - h0 = th.rand(g.number_of_nodes(), 5) + h0 = F.rand((g.number_of_nodes(), 5)) h1 = gap(h0, g) assert h1.shape[0] == 10 and h1.dim() == 1 # test#2: batched graph bg = dgl.batch([g, g, g, g]) - h0 = th.rand(bg.number_of_nodes(), 5) + h0 = F.rand((bg.number_of_nodes(), 5)) h1 = gap(h0, bg) assert h1.shape[0] == 4 and h1.shape[1] == 10 and h1.dim() == 2 @@ -115,44 +119,44 @@ def test_simple_pool(): print(sum_pool, avg_pool, max_pool, sort_pool) # test#1: basic - h0 = th.rand(g.number_of_nodes(), 5) + h0 = F.rand((g.number_of_nodes(), 5)) h1 = sum_pool(h0, g) - assert th.allclose(h1, th.sum(h0, 0)) + assert F.allclose(h1, F.sum(h0, 0)) h1 = avg_pool(h0, g) - assert th.allclose(h1, th.mean(h0, 0)) + assert F.allclose(h1, F.mean(h0, 0)) h1 = max_pool(h0, g) - assert th.allclose(h1, th.max(h0, 0)[0]) + assert F.allclose(h1, F.max(h0, 0)) h1 = sort_pool(h0, g) assert h1.shape[0] == 10 * 5 and h1.dim() == 1 # test#2: batched graph g_ = dgl.DGLGraph(nx.path_graph(5)) bg = dgl.batch([g, g_, g, g_, g]) - h0 = th.rand(bg.number_of_nodes(), 5) + h0 = F.rand((bg.number_of_nodes(), 5)) h1 = sum_pool(h0, bg) - truth = th.stack([th.sum(h0[:15], 0), - th.sum(h0[15:20], 0), - th.sum(h0[20:35], 0), - th.sum(h0[35:40], 0), - th.sum(h0[40:55], 0)], 0) - assert th.allclose(h1, truth) + truth = th.stack([F.sum(h0[:15], 0), + F.sum(h0[15:20], 0), + F.sum(h0[20:35], 0), + F.sum(h0[35:40], 0), + F.sum(h0[40:55], 0)], 0) + assert F.allclose(h1, truth) h1 = avg_pool(h0, bg) - truth = th.stack([th.mean(h0[:15], 0), - th.mean(h0[15:20], 0), - th.mean(h0[20:35], 0), - th.mean(h0[35:40], 0), - th.mean(h0[40:55], 0)], 0) - assert th.allclose(h1, truth) + truth = th.stack([F.mean(h0[:15], 0), + F.mean(h0[15:20], 0), + F.mean(h0[20:35], 0), + F.mean(h0[35:40], 0), + F.mean(h0[40:55], 0)], 0) + assert F.allclose(h1, truth) h1 = max_pool(h0, bg) - truth = th.stack([th.max(h0[:15], 0)[0], - th.max(h0[15:20], 0)[0], - th.max(h0[20:35], 0)[0], - th.max(h0[35:40], 0)[0], - th.max(h0[40:55], 0)[0]], 0) - assert th.allclose(h1, truth) + truth = th.stack([F.max(h0[:15], 0), + F.max(h0[15:20], 0), + F.max(h0[20:35], 0), + F.max(h0[35:40], 0), + F.max(h0[40:55], 0)], 0) + assert F.allclose(h1, truth) h1 = sort_pool(h0, bg) assert h1.shape[0] == 5 and h1.shape[1] == 10 * 5 and h1.dim() == 2 @@ -163,10 +167,14 @@ def test_set_trans(): st_enc_0 = nn.SetTransformerEncoder(50, 5, 10, 100, 2, 'sab') st_enc_1 = nn.SetTransformerEncoder(50, 5, 10, 100, 2, 'isab', 3) st_dec = nn.SetTransformerDecoder(50, 5, 10, 100, 2, 4) + if F.gpu_ctx(): + st_enc_0.cuda() + st_enc_1.cuda() + 
st_dec.cuda() print(st_enc_0, st_enc_1, st_dec) # test#1: basic - h0 = th.rand(g.number_of_nodes(), 50) + h0 = F.rand((g.number_of_nodes(), 50)) h1 = st_enc_0(h0, g) assert h1.shape == h0.shape h1 = st_enc_1(h0, g) @@ -178,7 +186,7 @@ def test_set_trans(): g1 = dgl.DGLGraph(nx.path_graph(5)) g2 = dgl.DGLGraph(nx.path_graph(10)) bg = dgl.batch([g, g1, g2]) - h0 = th.rand(bg.number_of_nodes(), 50) + h0 = F.rand((bg.number_of_nodes(), 50)) h1 = st_enc_0(h0, bg) assert h1.shape == h0.shape h1 = st_enc_1(h0, bg) From a91b1bbdf0e7d1626fd182b7b5d048112986afa1 Mon Sep 17 00:00:00 2001 From: Song Date: Mon, 19 Aug 2019 21:34:54 +0800 Subject: [PATCH 14/32] Fix syntex --- python/dgl/nn/mxnet/glob.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/dgl/nn/mxnet/glob.py b/python/dgl/nn/mxnet/glob.py index 32d33df9402e..62eef9f40678 100644 --- a/python/dgl/nn/mxnet/glob.py +++ b/python/dgl/nn/mxnet/glob.py @@ -1,6 +1,5 @@ """MXNet modules for graph global pooling.""" # pylint: disable= no-member, arguments-differ, C0103, W0235 -import mxnet as mx from mxnet import gluon, nd from mxnet.gluon import nn From edf6a0ed69cbc6c954dd7eb60bab15d2767faf0b Mon Sep 17 00:00:00 2001 From: Song Date: Wed, 21 Aug 2019 15:17:12 +0800 Subject: [PATCH 15/32] Remove rand --- tests/backend/__init__.py | 4 ---- tests/backend/backend_unittest.py | 4 ---- tests/backend/mxnet/__init__.py | 3 --- tests/backend/pytorch/__init__.py | 3 --- tests/pytorch/test_nn.py | 20 ++++++++++---------- 5 files changed, 10 insertions(+), 24 deletions(-) diff --git a/tests/backend/__init__.py b/tests/backend/__init__.py index dd189bdec706..542f2daa0d84 100644 --- a/tests/backend/__init__.py +++ b/tests/backend/__init__.py @@ -23,7 +23,6 @@ _zeros = zeros _ones = ones _randn = randn -_rand = rand _tensor = tensor _arange = arange _full = full @@ -51,9 +50,6 @@ def ones(shape, dtype=float32, ctx=_default_context): def randn(shape): return copy_to(_randn(shape), _default_context) -def rand(shape): - return copy_to(_rand(shape), _default_context) - def tensor(data, dtype=None): if dtype is None: if is_tensor(data): diff --git a/tests/backend/backend_unittest.py b/tests/backend/backend_unittest.py index 00a391244d37..3c7e90542421 100644 --- a/tests/backend/backend_unittest.py +++ b/tests/backend/backend_unittest.py @@ -31,10 +31,6 @@ def randn(shape): """Generate a tensor with elements from standard normal distribution.""" pass -def rand(shape): - """Generate a tensor with elements from a discrete uniform distribution.""" - pass - def attach_grad(x): """Flag the tensor *in-place* to have its gradient computed in backward pass. 
diff --git a/tests/backend/mxnet/__init__.py b/tests/backend/mxnet/__init__.py index 5dcba2e5f0e5..cd678699b8ca 100644 --- a/tests/backend/mxnet/__init__.py +++ b/tests/backend/mxnet/__init__.py @@ -25,9 +25,6 @@ def allclose(a, b, rtol=1e-4, atol=1e-4): def randn(shape): return nd.random.randn(*shape) -def rand(shape): - return nd.random.randint(0, 1, *shape) - def attach_grad(x): x.attach_grad() return x diff --git a/tests/backend/pytorch/__init__.py b/tests/backend/pytorch/__init__.py index 690902a2ee71..c7ffa72f655b 100644 --- a/tests/backend/pytorch/__init__.py +++ b/tests/backend/pytorch/__init__.py @@ -18,9 +18,6 @@ def allclose(a, b, rtol=1e-4, atol=1e-4): def randn(shape): return th.randn(*shape) -def rand(shape): - return th.rand(*shape) - def attach_grad(x): if x.grad is not None: x.grad.zero_() diff --git a/tests/pytorch/test_nn.py b/tests/pytorch/test_nn.py index b38913b2c9b6..3fbfdbadcf9e 100644 --- a/tests/pytorch/test_nn.py +++ b/tests/pytorch/test_nn.py @@ -78,7 +78,7 @@ def test_set2set(): print(s2s) # test#1: basic - h0 = F.rand((g.number_of_nodes(), 5)) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = s2s(h0, g) assert h1.shape[0] == 10 and h1.dim() == 1 @@ -86,7 +86,7 @@ def test_set2set(): g1 = dgl.DGLGraph(nx.path_graph(11)) g2 = dgl.DGLGraph(nx.path_graph(5)) bg = dgl.batch([g, g1, g2]) - h0 = F.rand((bg.number_of_nodes(), 5)) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = s2s(h0, bg) assert h1.shape[0] == 3 and h1.shape[1] == 10 and h1.dim() == 2 @@ -99,13 +99,13 @@ def test_glob_att_pool(): print(gap) # test#1: basic - h0 = F.rand((g.number_of_nodes(), 5)) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = gap(h0, g) assert h1.shape[0] == 10 and h1.dim() == 1 # test#2: batched graph bg = dgl.batch([g, g, g, g]) - h0 = F.rand((bg.number_of_nodes(), 5)) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = gap(h0, bg) assert h1.shape[0] == 4 and h1.shape[1] == 10 and h1.dim() == 2 @@ -119,7 +119,7 @@ def test_simple_pool(): print(sum_pool, avg_pool, max_pool, sort_pool) # test#1: basic - h0 = F.rand((g.number_of_nodes(), 5)) + h0 = F.randn((g.number_of_nodes(), 5)) h1 = sum_pool(h0, g) assert F.allclose(h1, F.sum(h0, 0)) h1 = avg_pool(h0, g) @@ -132,7 +132,7 @@ def test_simple_pool(): # test#2: batched graph g_ = dgl.DGLGraph(nx.path_graph(5)) bg = dgl.batch([g, g_, g, g_, g]) - h0 = F.rand((bg.number_of_nodes(), 5)) + h0 = F.randn((bg.number_of_nodes(), 5)) h1 = sum_pool(h0, bg) truth = th.stack([F.sum(h0[:15], 0), @@ -174,7 +174,7 @@ def test_set_trans(): print(st_enc_0, st_enc_1, st_dec) # test#1: basic - h0 = F.rand((g.number_of_nodes(), 50)) + h0 = F.randn((g.number_of_nodes(), 50)) h1 = st_enc_0(h0, g) assert h1.shape == h0.shape h1 = st_enc_1(h0, g) @@ -186,7 +186,7 @@ def test_set_trans(): g1 = dgl.DGLGraph(nx.path_graph(5)) g2 = dgl.DGLGraph(nx.path_graph(10)) bg = dgl.batch([g, g1, g2]) - h0 = F.rand((bg.number_of_nodes(), 50)) + h0 = F.randn((bg.number_of_nodes(), 50)) h1 = st_enc_0(h0, bg) assert h1.shape == h0.shape h1 = st_enc_1(h0, bg) @@ -224,9 +224,9 @@ def test_edge_softmax(): for j in range(30): g.add_edge(i, j) - score = F.rand((900, 1)) + score = F.randn((900, 1)) score.requires_grad_() - grad = F.rand((900, 1)) + grad = F.randn((900, 1)) y = F.softmax(score.view(30, 30), dim=0).view(-1, 1) y.backward(grad) grad_score = score.grad From b86e8db81e324080c6977508f32f86e180f94d27 Mon Sep 17 00:00:00 2001 From: Song Date: Tue, 3 Sep 2019 16:05:02 +0800 Subject: [PATCH 16/32] Start implementing masked-mm kernel. Add base control flow code. 
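
This commit only wires up the control flow: message.py registers three new
masked-mm builtins (u_dot_v, u_dot_e, v_dot_e), a kDot op and its dispatch
cases are added to the kernel headers, and BinaryDot UDF stubs are added on
the CPU and CUDA sides (their ApplyEdge bodies are not implemented yet).
Assuming the finished builtins behave like the existing element-wise ones
(u_mul_v and friends), the intended Python-level usage is roughly:

    import torch as th
    import dgl
    import dgl.function as fn

    g = dgl.DGLGraph()
    g.add_nodes(3)
    g.add_edges([0, 0, 1], [1, 2, 2])
    g.ndata['h'] = th.randn(3, 4)

    # Per-edge dot product of source and destination features -- a "masked"
    # matrix multiply where only entries for existing edges are computed.
    g.apply_edges(fn.u_dot_v('h', 'h', 'score'))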
--- python/dgl/function/message.py | 11 ++++++ src/kernel/binary_reduce_common.h | 26 +++++++++++++++ src/kernel/binary_reduce_impl.h | 39 ++++++++++++++++++++++ src/kernel/cpu/binary_reduce_impl.h | 45 +++++++++++++++++++++++++ src/kernel/cuda/binary_reduce_impl.cuh | 46 ++++++++++++++++++++++++++ 5 files changed, 167 insertions(+) diff --git a/python/dgl/function/message.py b/python/dgl/function/message.py index 2627aa7fcc2b..91c98485546b 100644 --- a/python/dgl/function/message.py +++ b/python/dgl/function/message.py @@ -147,12 +147,16 @@ def copy_e(e, out): ############################################################################### # Generate all following builtin message functions: +# element-wise message functions: # u_add_v, u_sub_v, u_mul_v, u_div_v # u_add_e, u_sub_e, u_mul_e, u_div_e # v_add_u, v_sub_u, v_mul_u, v_div_u # v_add_e, v_sub_e, v_mul_e, v_div_e # e_add_u, e_sub_u, e_mul_u, e_div_u # e_add_v, e_sub_v, e_mul_v, e_div_v +# +# masked-mm message functions: +# u_dot_v, u_dot_e, v_dot_e _TARGET_MAP = { "u": TargetCode.SRC, @@ -205,6 +209,13 @@ def _register_builtin_message_func(): setattr(sys.modules[__name__], func.__name__, func) __all__.append(func.__name__) + """Register builtin masked-mm functions""" + for lhs, rhs in product(["u", "v"], ["v", "e"]): + if lhs != rhs: + for binary_op in ["dot"]: + func = _gen_message_builtin(lhs, rhs, binary_op) + setattr(sys.modules[__name__], func.__name__, func) + __all__.append(func.__name__) _register_builtin_message_func() diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index f7052d99c3c1..106242bafc6d 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -29,6 +29,7 @@ static const char kAdd[] = "add"; static const char kSub[] = "sub"; static const char kMul[] = "mul"; static const char kDiv[] = "div"; +static const char kDot[] = "dot"; static const char kUseLhs[] = "use_lhs"; /*! @@ -192,6 +193,19 @@ struct BinaryUseLhs { } }; +template +struct BinaryDot { + static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { + return lhs * rhs; + } + static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { + return 1; + } + static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) { + return 1; + } +}; + // Macro for dispatching op enum code and target code into template arguments. 
// The macro dispatches following combinations: // - Add(Src, Dst), Add(Src, Edge), Add(Dst, Edge) @@ -306,6 +320,18 @@ struct BinaryUseLhs { typedef SelectEdge LeftType; \ typedef SelectNone RightType; \ {__VA_ARGS__} \ + } else if (op == kDot && lhs == kSrc && rhs == kDst) { \ + typedef SelectSrc LeftType; \ + typedef SelectDst RightType; \ + {__VA_ARGS__} \ + } else if (op == kDot && lhs == kSrc && rhs == kEdge) { \ + typedef SelectSrc LeftType; \ + typedef SelectEdge RightType; \ + {__VA_ARGS__} \ + } else if (op == kDot && lhs == kDst && rhs == kEdge) { \ + typedef SelectDst LeftType; \ + typedef SelectEdge RightType; \ + {__VA_ARGS__} \ } else { \ LOG(FATAL) << "Unsupported operation: op=" << op \ << " lhs=" << lhs << " rhs=" << rhs; \ diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index f35fe7ef197c..19d132837aa1 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -83,6 +83,27 @@ void BinaryReduceImpl( // instruction level parallelism rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2); #endif + if (op == binary_op::kDot) { + // A dot B impl is different from others + if (reducer != binary_op::kReduceNone) { + // TODO(xiang song): Need Reduce for A Dot B? + LOG(FATAL) << "With Dot operation, Only None reduce is supported."; + } + + // Built in A dot B impl + const DLDataType& dtype = out_data->dtype; + const auto bits = graph.NumBits(); + DGL_DTYPE_SWITCH(dtype, DType, { + DGL_IDX_TYPE_SWITCH(bits, Idx, { + auto gdata = AllocGData>( + rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, + lhs_data, rhs_data, out_mapping, out_data); + OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, { + CallBinaryDot(rtcfg, graph, &gdata); + }); + }); + }); + } if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -182,6 +203,18 @@ void BackwardBinaryReduceImpl( const bool req_lhs = !utils::IsNoneArray(grad_lhs_data); const bool req_rhs = !utils::IsNoneArray(grad_rhs_data); const auto bits = graph.NumBits(); + + if (op == binary_op::kDot) { + // A dot B impl is different from others + if (reducer != binary_op::kReduceNone) { + // TODO(xiang song): Need Reduce for A Dot B? 
+ LOG(FATAL) << "With Dot operation, Only None reduce is supported."; + } + + // Built in A dot B impl + // (TODO: xiang song) To implement it + } + if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -280,6 +313,9 @@ void BinaryReduceBcastImpl( const DLDataType& dtype = out_data->dtype; const int bcast_ndim = info.out_shape.size(); const auto bits = graph.NumBits(); + if (op == binary_op::kDot) { + LOG(FATAL) << "dot operation is not allowed with broadcast"; + } if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -387,6 +423,9 @@ void BackwardBinaryReduceBcastImpl( const bool req_lhs = !utils::IsNoneArray(grad_lhs); const bool req_rhs = !utils::IsNoneArray(grad_rhs); const auto bits = graph.NumBits(); + if (op == binary_op::kDot) { + LOG(FATAL) << "dot operation is not allowed with broadcast"; + } if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index 7d9ba646180b..d8de1ca86992 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -53,6 +53,19 @@ struct BinaryReduce { } }; +// Minigun UDF to compute binary reduce. +template +struct BinaryDot { + static __device__ __forceinline__ bool CondEdge( + Idx src, Idx dst, Idx eid, GData* gdata) { + return true; + } + static __device__ __forceinline__ void ApplyEdge( + Idx src, Idx dst, Idx eid, GData* gdata) { + return true; + } +}; + // Convert flattened index to multi-dimension index (assume row-major). inline void Unravel(int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) { @@ -175,6 +188,38 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } +// Template implementation of BinaryReduce operator. +template +void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, + const CSRWrapper& graph, + GData* gdata) { + //For binary dot, it should be none reducer. + typedef cpu::FunctorsTempl, ReduceNone> + Functors; + typedef cpu::BinaryDot UDF; + // csr + auto outcsr = graph.GetOutCSRMatrix(); + minigun::Csr csr = utils::CreateCsr(outcsr.indptr, outcsr.indices); + // If the user-given mapping is none and the target is edge data, we need to + // replace the mapping by the edge ids in the csr graph so that the edge + // data is correctly read/written. + if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) { + gdata->lhs_mapping = static_cast(outcsr.data->data); + } + if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { + gdata->rhs_mapping = static_cast(outcsr.data->data); + } + if (OutSelector::Type::target == binary_op::kEdge + && gdata->out_mapping == nullptr) { + gdata->out_mapping = static_cast(outcsr.data->data); + } + // TODO(minjie): allocator + minigun::advance::Advance, UDF>( + rtcfg, csr, gdata, minigun::IntArray1D()); +} + // Template implementation of BinaryReduce broadcasting operator. template +struct BinaryDot { + static __device__ __forceinline__ bool CondEdge( + Idx src, Idx dst, Idx eid, GData* gdata) { + return true; + } + static __device__ __forceinline__ void ApplyEdge( + Idx src, Idx dst, Idx eid, GData* gdata) { + return true; + } +}; + // Convert flattened index to multi-dimension index (assume row-major). 
__device__ __forceinline__ void Unravel( int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) { @@ -178,6 +191,39 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } +// Template implementation of BinaryDot operator. +template +void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, + const CSRWrapper& graph, + GData* gdata) { + //For binary dot, it should be none reducer. + typedef cuda::FunctorsTempl, ReduceNone> + Functors; + typedef cuda::BinaryDot UDF; + + // csr + auto outcsr = graph.GetOutCSRMatrix(); + minigun::Csr csr = utils::CreateCsr(outcsr.indptr, outcsr.indices); + // If the user-given mapping is none and the target is edge data, we need to + // replace the mapping by the edge ids in the csr graph so that the edge + // data is correctly read/written. + if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) { + gdata->lhs_mapping = static_cast(outcsr.data->data); + } + if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { + gdata->rhs_mapping = static_cast(outcsr.data->data); + } + if (OutSelector::Type::target == binary_op::kEdge + && gdata->out_mapping == nullptr) { + gdata->out_mapping = static_cast(outcsr.data->data); + } + // TODO(minjie): allocator + minigun::advance::Advance, UDF>( + rtcfg, csr, gdata, minigun::IntArray1D()); +} + // Template implementation of BinaryReduce broadcasting operator. template Date: Tue, 3 Sep 2019 17:46:32 +0800 Subject: [PATCH 17/32] Add masked dot declare --- src/kernel/cpu/binary_masked_dot.cc | 26 ++++++++++++++++++++++++++ src/kernel/cpu/binary_reduce_impl.h | 9 +++++++++ src/kernel/cuda/binary_masked_dot.cu | 20 ++++++++++++++++++++ src/kernel/cuda/binary_reduce_impl.cuh | 9 +++++++++ 4 files changed, 64 insertions(+) create mode 100644 src/kernel/cpu/binary_masked_dot.cc create mode 100644 src/kernel/cuda/binary_masked_dot.cu diff --git a/src/kernel/cpu/binary_masked_dot.cc b/src/kernel/cpu/binary_masked_dot.cc new file mode 100644 index 000000000000..c1f24ca97bbc --- /dev/null +++ b/src/kernel/cpu/binary_masked_dot.cc @@ -0,0 +1,26 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file kernel/cpu/binary_masked_dot.cc + * \brief CPU kernels for binary reduce prod + */ +#include "./binary_reduce_impl.h" +#include "./backward_binary_reduce_impl.h" + +namespace dgl { +namespace kernel { + +#define REDUCER ReduceNone +#define XPU kDLCPU + +#define IDX int32_t +EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_MASKED_DOT_DEFINE); +EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); +#undef IDX + +#define IDX int64_t +EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_MASKED_DOT_DEFINE); +EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); +#undef IDX + +} // namespace kernel +} // namespace dgl diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index d8de1ca86992..3f2f7cf5c81a 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -263,6 +263,15 @@ void CallBinaryReduceBcast( const CSRWrapper& graph, \ GData* gdata); +// Following macro is used to generate explicit-specialization of the template +// operator. 
+#define GEN_MASKED_DOT_DEFINE(dtype, lhs_tgt, rhs_tgt) \ + template void CallBinaryMaskedDot( \ + const minigun::advance::RuntimeConfig& rtcfg, \ + const CSRWrapper& graph, \ + GData* gdata); + #define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \ template void CallBinaryReduceBcast* gdata); +// Following macro is used to generate explicit-specialization of the template +// operator. +#define GEN_MASKED_DOT_DEFINE(dtype, lhs_tgt, rhs_tgt) \ + template void CallBinaryMaskedDot( \ + const minigun::advance::RuntimeConfig& rtcfg, \ + const CSRWrapper& graph, \ + GData* gdata); + #define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \ template void CallBinaryReduceBcast Date: Tue, 3 Sep 2019 09:50:21 +0000 Subject: [PATCH 18/32] Update func/variable name --- src/kernel/binary_reduce_impl.h | 2 +- src/kernel/binary_reduce_impl_decl.h | 28 ++++++++++++++++++++++++++ src/kernel/cpu/binary_reduce_impl.h | 17 +++++++++------- src/kernel/cuda/binary_reduce_impl.cuh | 19 +++++++++-------- 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index 19d132837aa1..9f4e86eed594 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -99,7 +99,7 @@ void BinaryReduceImpl( rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, lhs_data, rhs_data, out_mapping, out_data); OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, { - CallBinaryDot(rtcfg, graph, &gdata); + CallBinaryMaskedDot(rtcfg, graph, &gdata); }); }); }); diff --git a/src/kernel/binary_reduce_impl_decl.h b/src/kernel/binary_reduce_impl_decl.h index f353ba86bc62..4701220b02b0 100644 --- a/src/kernel/binary_reduce_impl_decl.h +++ b/src/kernel/binary_reduce_impl_decl.h @@ -82,6 +82,34 @@ void CallBinaryReduce( const CSRWrapper& graph, GData* gdata); +/*! + * \brief Template declaration for BinaryMaskedDot operator. + * + * LeftSelector and RightSelector must be one of the four operand target + * categories. + * + * The implementation of this template is device-dependent + * (see kernel/xpu/binary_reduce_impl.(cu)h). + * + * See definitions in binary_reduce_common.h + * + * \tparam XPU the device flag + * \tparam Idx type of node/edge index (e.g. int32_t, int64_t) + * \tparam DType type of the feature data (e.g. float32) + * \tparam LeftSelect lhs category type + * \tparam RightSelect rhs category type + * \param rtcfg Runtime configuration used by miningun + * \param graph The graph object. + * \param gdata The feature and mapping data used by the computation. + */ +template +void CallBinaryReduce( + const minigun::advance::RuntimeConfig& rtcfg, + const CSRWrapper& graph, + GData* gdata); + + /*! * \brief Template declaration for common logics shared by different devices. * diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index d8de1ca86992..de9f12e1e0ed 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -55,7 +55,7 @@ struct BinaryReduce { // Minigun UDF to compute binary reduce. template -struct BinaryDot { +struct BinaryMaskedDot { static __device__ __forceinline__ bool CondEdge( Idx src, Idx dst, Idx eid, GData* gdata) { return true; @@ -188,17 +188,17 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } -// Template implementation of BinaryReduce operator. +// Template implementation of BinaryMaksedDot operator. 
template -void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, +void CallBinaryMaskedDot(const minigun::advance::RuntimeConfig& rtcfg, const CSRWrapper& graph, GData* gdata) { //For binary dot, it should be none reducer. typedef cpu::FunctorsTempl, ReduceNone> + RightSelector, BinaryDot, ReduceNone> Functors; - typedef cpu::BinaryDot UDF; + typedef cpu::BinaryMaskedDot UDF; // csr auto outcsr = graph.GetOutCSRMatrix(); minigun::Csr csr = utils::CreateCsr(outcsr.indptr, outcsr.indices); @@ -211,8 +211,11 @@ void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { gdata->rhs_mapping = static_cast(outcsr.data->data); } - if (OutSelector::Type::target == binary_op::kEdge - && gdata->out_mapping == nullptr) { + + // For Masked Matrix Multiply, the output target should be edge. + // If the user-given mapping is none, we need to replace the mapping by the + // edge ids in the csr graph. + if (gdata->out_mapping == nullptr) { gdata->out_mapping = static_cast(outcsr.data->data); } // TODO(minjie): allocator diff --git a/src/kernel/cuda/binary_reduce_impl.cuh b/src/kernel/cuda/binary_reduce_impl.cuh index 3b21156b9c4b..013483f4255b 100644 --- a/src/kernel/cuda/binary_reduce_impl.cuh +++ b/src/kernel/cuda/binary_reduce_impl.cuh @@ -56,14 +56,14 @@ struct BinaryReduce { // Minigun UDF to compute binary reduce. template -struct BinaryDot { +struct BinaryMaskedDot { static __device__ __forceinline__ bool CondEdge( Idx src, Idx dst, Idx eid, GData* gdata) { return true; } static __device__ __forceinline__ void ApplyEdge( Idx src, Idx dst, Idx eid, GData* gdata) { - return true; + return; } }; @@ -191,17 +191,17 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } -// Template implementation of BinaryDot operator. +// Template implementation of BinaryMaskedDot operator. template -void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, +void CallBinaryMaskedDot(const minigun::advance::RuntimeConfig& rtcfg, const CSRWrapper& graph, GData* gdata) { //For binary dot, it should be none reducer. typedef cuda::FunctorsTempl, ReduceNone> + RightSelector, BinaryDot, ReduceNone> Functors; - typedef cuda::BinaryDot UDF; + typedef cuda::BinaryMaskedDot UDF; // csr auto outcsr = graph.GetOutCSRMatrix(); @@ -215,8 +215,11 @@ void CallBinaryDot(const minigun::advance::RuntimeConfig& rtcfg, if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { gdata->rhs_mapping = static_cast(outcsr.data->data); } - if (OutSelector::Type::target == binary_op::kEdge - && gdata->out_mapping == nullptr) { + + // For Masked Matrix Multiply, the output target should be edge. + // If the user-given mapping is none, we need to replace the mapping by the + // edge ids in the csr graph. 
+ if (gdata->out_mapping == nullptr) { gdata->out_mapping = static_cast(outcsr.data->data); } // TODO(minjie): allocator From d8825994501bca299113f0d7791314ecdf85371c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 3 Sep 2019 13:41:56 +0000 Subject: [PATCH 19/32] Skeleton compile OK --- src/kernel/binary_reduce_common.h | 3 +++ src/kernel/binary_reduce_impl.h | 3 +++ src/kernel/binary_reduce_impl_decl.h | 2 +- src/kernel/cpu/binary_reduce_impl.h | 4 ++-- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index 106242bafc6d..779b7eda54e3 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -321,14 +321,17 @@ struct BinaryDot { typedef SelectNone RightType; \ {__VA_ARGS__} \ } else if (op == kDot && lhs == kSrc && rhs == kDst) { \ + typedef BinaryDot OpType; \ typedef SelectSrc LeftType; \ typedef SelectDst RightType; \ {__VA_ARGS__} \ } else if (op == kDot && lhs == kSrc && rhs == kEdge) { \ + typedef BinaryDot OpType; \ typedef SelectSrc LeftType; \ typedef SelectEdge RightType; \ {__VA_ARGS__} \ } else if (op == kDot && lhs == kDst && rhs == kEdge) { \ + typedef BinaryDot OpType; \ typedef SelectDst LeftType; \ typedef SelectEdge RightType; \ {__VA_ARGS__} \ diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index 9f4e86eed594..361fc6387303 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -103,6 +103,8 @@ void BinaryReduceImpl( }); }); }); + + return; } if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide @@ -213,6 +215,7 @@ void BackwardBinaryReduceImpl( // Built in A dot B impl // (TODO: xiang song) To implement it + return; } if (reducer == binary_op::kReduceMean) { diff --git a/src/kernel/binary_reduce_impl_decl.h b/src/kernel/binary_reduce_impl_decl.h index 4701220b02b0..f0d7822adfaa 100644 --- a/src/kernel/binary_reduce_impl_decl.h +++ b/src/kernel/binary_reduce_impl_decl.h @@ -104,7 +104,7 @@ void CallBinaryReduce( */ template -void CallBinaryReduce( +void CallBinaryMaskedDot( const minigun::advance::RuntimeConfig& rtcfg, const CSRWrapper& graph, GData* gdata); diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index 76abbefa6eee..d657d7404137 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -56,11 +56,11 @@ struct BinaryReduce { // Minigun UDF to compute binary reduce. template struct BinaryMaskedDot { - static __device__ __forceinline__ bool CondEdge( + static inline bool CondEdge( Idx src, Idx dst, Idx eid, GData* gdata) { return true; } - static __device__ __forceinline__ void ApplyEdge( + static inline void ApplyEdge( Idx src, Idx dst, Idx eid, GData* gdata) { return true; } From 190102bbcc21a950d4085b3458677e93655fbcb6 Mon Sep 17 00:00:00 2001 From: Song Date: Wed, 4 Sep 2019 15:02:36 +0800 Subject: [PATCH 20/32] Update Implement. 
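
This revision drops the dedicated masked-dot kernel introduced in the previous patches and instead routes the dot operator through the generic binary-reduce templates: a per-position vector length (data_len) is threaded through GData/BcastGData, every binary functor now receives operand pointers plus that length, and the element-wise ops simply read element 0 while BinaryDot contracts over all data_len entries. A rough NumPy reference of the intended per-edge semantics, written only to make the data_len idea concrete (array names and shapes are illustrative and not part of the C++ interface):

    import numpy as np

    def edge_dot_reference(h_src, h_dst, edges):
        # For each edge (u, v), contract the trailing feature axis of length
        # data_len: out[e] = sum_k h_src[u, ..., k] * h_dst[v, ..., k].
        return np.stack([(h_src[u] * h_dst[v]).sum(axis=-1) for u, v in edges])

    def edge_add_reference(h_src, h_dst, edges):
        # Element-wise ops go through the same template with data_len == 1,
        # so the feature shape is left untouched.
        return np.stack([h_src[u] + h_dst[v] for u, v in edges])

    h = np.random.randn(3, 2, 4)        # 3 nodes, 2 heads, data_len = 4
    edges = [(0, 1), (0, 2), (1, 2)]
    print(edge_dot_reference(h, h, edges).shape)   # (3, 2): data_len axis contracted
    print(edge_add_reference(h, h, edges).shape)   # (3, 2, 4): unchanged
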
Unify BinaryDot with BinaryReduce --- python/dgl/backend/mxnet/tensor.py | 2 +- python/dgl/backend/pytorch/tensor.py | 2 +- python/dgl/kernel.py | 4 +- src/kernel/binary_reduce.cc | 26 +++++-- src/kernel/binary_reduce.h | 2 + src/kernel/binary_reduce_common.h | 34 +++++---- src/kernel/binary_reduce_impl.h | 58 ++++------------ src/kernel/binary_reduce_impl_decl.h | 32 ++------- src/kernel/cpu/binary_masked_dot.cc | 4 +- src/kernel/cpu/binary_reduce_impl.h | 88 ++++-------------------- src/kernel/cuda/binary_masked_dot.cu | 2 +- src/kernel/cuda/binary_reduce_impl.cuh | 95 +++++--------------------- 12 files changed, 99 insertions(+), 250 deletions(-) diff --git a/python/dgl/backend/mxnet/tensor.py b/python/dgl/backend/mxnet/tensor.py index 63c5c6df4841..d2d38eab0646 100644 --- a/python/dgl/backend/mxnet/tensor.py +++ b/python/dgl/backend/mxnet/tensor.py @@ -363,7 +363,7 @@ def __init__(self, reducer, binary_op, graph, lhs, rhs, out_size, lhs_map, def forward(self, lhs_data, rhs_data): lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) - feat_shape = K.infer_binary_feature_shape(lhs_data_nd, rhs_data_nd) + feat_shape = K.infer_binary_feature_shape(self.binary_op, lhs_data_nd, rhs_data_nd) out_data = nd.empty((self.out_size,) + feat_shape, ctx=lhs_data.context, dtype=lhs_data.dtype) out_data_nd = zerocopy_to_dgl_ndarray_for_write(out_data) diff --git a/python/dgl/backend/pytorch/tensor.py b/python/dgl/backend/pytorch/tensor.py index 3b2f392cbf55..bc75ed5f0cdb 100644 --- a/python/dgl/backend/pytorch/tensor.py +++ b/python/dgl/backend/pytorch/tensor.py @@ -285,7 +285,7 @@ def forward(ctx, reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, out_size, lhs_map, rhs_map, out_map): lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) - feat_shape = K.infer_binary_feature_shape(lhs_data_nd, rhs_data_nd) + feat_shape = K.infer_binary_feature_shape(binary_op, lhs_data_nd, rhs_data_nd) out_data = lhs_data.new_empty((out_size,) + feat_shape) out_data_nd = zerocopy_to_dgl_ndarray(out_data) K.binary_op_reduce( diff --git a/python/dgl/kernel.py b/python/dgl/kernel.py index b6314e360d39..7abe2d38bf4e 100644 --- a/python/dgl/kernel.py +++ b/python/dgl/kernel.py @@ -4,7 +4,7 @@ from ._ffi.function import _init_api from .ndarray import empty -def infer_binary_feature_shape(lhs, rhs): +def infer_binary_feature_shape(op, lhs, rhs): """Infer the output feature shape after a binary operation between lhs and rhs. Parameter @@ -19,7 +19,7 @@ def infer_binary_feature_shape(lhs, rhs): tuple of int The output feature shape. """ - ret = _CAPI_DGLKernelInferBinaryFeatureShape(lhs, rhs) + ret = _CAPI_DGLKernelInferBinaryFeatureShape(op, lhs, rhs) return tuple(ret.asnumpy()) # pylint: disable=invalid-name diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index 0adf1e7af2b7..265626f63b3e 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -92,9 +92,20 @@ bool HasBcast(NDArray lhs, NDArray rhs) { // e.g. 
(4, 1, 3, 3) and (4, 5, 3, 3) become (4, 1, 9) and (4, 5, 9) // // See also: BcastInfo (kernel/binary_reduce.h) -BcastInfo CalcBcastInfo(NDArray lhs, NDArray rhs) { +BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { BcastInfo ret; const int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1; + // for dot operation: vector [dot] vector + // lhs_shape[ndim-1] == rhs_shape[ndim-1] = sizeof(vector) + // out_shape[ndim-1] = 1 + if (op == binary_op::kDot) { + //get size of vector + ret.data_len = lhs->shape[lhs->ndim - 1]; + --max_ndim; + } else {//op != binary_op::kDot + ret.data_len = 1; + } + int64_t accum = 0; for (int j = 0; j < max_ndim; ++j) { const int dl = (lhs->ndim - 1 - j < 1)? 1 : lhs->shape[lhs->ndim - 1 - j]; @@ -239,9 +250,10 @@ std::vector InferBinaryFeatureShape( DGL_REGISTER_GLOBAL("kernel._CAPI_DGLKernelInferBinaryFeatureShape") .set_body([] (DGLArgs args, DGLRetValue* rv) { - NDArray lhs = args[0]; - NDArray rhs = args[1]; - const auto& shape = InferBinaryFeatureShape(lhs, rhs); + std::string op = args[0]; + NDArray lhs = args[1]; + NDArray rhs = args[2]; + const auto& shape = InferBinaryFeatureShape(op, lhs, rhs); const int64_t len = shape.size(); NDArray ret = NDArray::Empty( {len}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); @@ -274,7 +286,7 @@ void BinaryOpReduce( rhs_mapping, lhs_mapping, out_mapping); } else { if (HasBcast(lhs_data, rhs_data)) { - BcastInfo info = CalcBcastInfo(lhs_data, rhs_data); + BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data); DGL_XPU_SWITCH(ctx.device_type, BinaryReduceBcastImpl, info, reducer, op, graph, lhs, rhs, @@ -348,7 +360,7 @@ void BackwardLhsBinaryOpReduce( grad_out_data, grad_lhs_data); } else { if (HasBcast(lhs_data, rhs_data)) { - BcastInfo info = CalcBcastInfo(lhs_data, rhs_data); + BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data); DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceBcastImpl, info, reducer, op, graph, lhs, rhs, @@ -424,7 +436,7 @@ void BackwardRhsBinaryOpReduce( grad_out_data, grad_rhs_data); } else { if (HasBcast(lhs_data, rhs_data)) { - BcastInfo info = CalcBcastInfo(lhs_data, rhs_data); + BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data); DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceBcastImpl, info, reducer, op, graph, lhs, rhs, diff --git a/src/kernel/binary_reduce.h b/src/kernel/binary_reduce.h index 7f5d359df30d..6190bdd3f826 100644 --- a/src/kernel/binary_reduce.h +++ b/src/kernel/binary_reduce.h @@ -30,6 +30,8 @@ struct BcastInfo { std::vector lhs_shape, lhs_stride; std::vector rhs_shape, rhs_stride; std::vector out_shape, out_stride; + + int64_t data_len; }; /* diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index 779b7eda54e3..a1cd914be03f 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -130,8 +130,8 @@ struct SwitchSrcDst { // common binary functors template struct BinaryAdd { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs + rhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + return lhs[0] + rhs[0]; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { return 1; @@ -143,8 +143,8 @@ struct BinaryAdd { template struct BinaryMul { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs * rhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + return lhs[0] * rhs[0]; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType 
out) { return rhs; @@ -156,8 +156,8 @@ struct BinaryMul { template struct BinarySub { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs - rhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + return lhs[0] - rhs[0]; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { return 1; @@ -169,8 +169,8 @@ struct BinarySub { template struct BinaryDiv { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs / rhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + return lhs[0] / rhs[0]; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { return static_cast(1) / rhs; @@ -182,8 +182,8 @@ struct BinaryDiv { template struct BinaryUseLhs { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + return lhs[0]; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { return 1; @@ -195,8 +195,13 @@ struct BinaryUseLhs { template struct BinaryDot { - static DGLDEVICE DGLINLINE DType Call(DType lhs, DType rhs) { - return lhs * rhs; + static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { + Dtype out = 0; + //simple vector dot vector + for (int i = 0; i < len; i ++) + out += lhs[i] * rhs[i]; + + return out; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { return 1; @@ -362,7 +367,10 @@ struct BinaryDot { MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDiv)) \ MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectDst, BinaryDiv)) \ MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectNone, BinaryUseLhs)) \ - MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectNone, BinaryUseLhs)) + MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectNone, BinaryUseLhs)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryDot)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryDot)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDot)) ////////////////////////////////////////////////////////////////////////// // Defines reducer category. Each category is an empty structure. diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index 361fc6387303..5f64f2dc6ea9 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -28,7 +28,7 @@ namespace kernel { /////////////////////////////////////////////////////////////////////////////// template -GData AllocGData( +GData AllocGData(const std::string& op, const DLContext& ctx, int64_t x_len, runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray lhs_data, runtime::NDArray rhs_data, @@ -49,6 +49,15 @@ GData AllocGData( if (!utils::IsNoneArray(out_mapping)) { gdata.out_mapping = static_cast(out_mapping->data); } + + // for dot operation: vector [dot] vector + if (op == binary::kDot) { + //get size of vector + gdata.data_len = lhs_data->shape[lhs_data->ndim - 1]; + } else { + gdata.data_len = 1; + } + // fill out data with zero values utils::Fill(ctx, gdata.out_data, utils::NElements(out_data), Zero::value); return gdata; @@ -83,29 +92,6 @@ void BinaryReduceImpl( // instruction level parallelism rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2); #endif - if (op == binary_op::kDot) { - // A dot B impl is different from others - if (reducer != binary_op::kReduceNone) { - // TODO(xiang song): Need Reduce for A Dot B? 
- LOG(FATAL) << "With Dot operation, Only None reduce is supported."; - } - - // Built in A dot B impl - const DLDataType& dtype = out_data->dtype; - const auto bits = graph.NumBits(); - DGL_DTYPE_SWITCH(dtype, DType, { - DGL_IDX_TYPE_SWITCH(bits, Idx, { - auto gdata = AllocGData>( - rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, - lhs_data, rhs_data, out_mapping, out_data); - OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, { - CallBinaryMaskedDot(rtcfg, graph, &gdata); - }); - }); - }); - - return; - } if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -115,7 +101,7 @@ void BinaryReduceImpl( DGL_DTYPE_SWITCH(dtype, DType, { DGL_IDX_TYPE_SWITCH(bits, Idx, { REDUCER_SWITCH(reducer, XPU, DType, Reducer, { - auto gdata = AllocGData( + auto gdata = AllocGData(op, rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, lhs_data, rhs_data, out_mapping, out_data); OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, { @@ -206,18 +192,6 @@ void BackwardBinaryReduceImpl( const bool req_rhs = !utils::IsNoneArray(grad_rhs_data); const auto bits = graph.NumBits(); - if (op == binary_op::kDot) { - // A dot B impl is different from others - if (reducer != binary_op::kReduceNone) { - // TODO(xiang song): Need Reduce for A Dot B? - LOG(FATAL) << "With Dot operation, Only None reduce is supported."; - } - - // Built in A dot B impl - // (TODO: xiang song) To implement it - return; - } - if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -276,6 +250,8 @@ BcastGData AllocBcastGData( if (!utils::IsNoneArray(out_mapping)) { gdata.out_mapping = static_cast(out_mapping->data); } + + gdata.data_len = info.data_len; // fill out data with zero values utils::Fill(ctx, gdata.out_data, utils::NElements(out_data), Zero::value); return gdata; @@ -316,9 +292,7 @@ void BinaryReduceBcastImpl( const DLDataType& dtype = out_data->dtype; const int bcast_ndim = info.out_shape.size(); const auto bits = graph.NumBits(); - if (op == binary_op::kDot) { - LOG(FATAL) << "dot operation is not allowed with broadcast"; - } + if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; @@ -426,9 +400,7 @@ void BackwardBinaryReduceBcastImpl( const bool req_lhs = !utils::IsNoneArray(grad_lhs); const bool req_rhs = !utils::IsNoneArray(grad_rhs); const auto bits = graph.NumBits(); - if (op == binary_op::kDot) { - LOG(FATAL) << "dot operation is not allowed with broadcast"; - } + if (reducer == binary_op::kReduceMean) { // TODO(minjie): divide LOG(FATAL) << "reduce mean is not supported."; diff --git a/src/kernel/binary_reduce_impl_decl.h b/src/kernel/binary_reduce_impl_decl.h index f0d7822adfaa..5c799372d7e4 100644 --- a/src/kernel/binary_reduce_impl_decl.h +++ b/src/kernel/binary_reduce_impl_decl.h @@ -36,6 +36,8 @@ template struct GData { // length along x(feature) dimension int64_t x_length{0}; + // size of data, can be single value or a vector + int64_t data_len; // number of rows of the output tensor int64_t out_size{0}; // input data @@ -82,34 +84,6 @@ void CallBinaryReduce( const CSRWrapper& graph, GData* gdata); -/*! - * \brief Template declaration for BinaryMaskedDot operator. - * - * LeftSelector and RightSelector must be one of the four operand target - * categories. - * - * The implementation of this template is device-dependent - * (see kernel/xpu/binary_reduce_impl.(cu)h). 
- * - * See definitions in binary_reduce_common.h - * - * \tparam XPU the device flag - * \tparam Idx type of node/edge index (e.g. int32_t, int64_t) - * \tparam DType type of the feature data (e.g. float32) - * \tparam LeftSelect lhs category type - * \tparam RightSelect rhs category type - * \param rtcfg Runtime configuration used by miningun - * \param graph The graph object. - * \param gdata The feature and mapping data used by the computation. - */ -template -void CallBinaryMaskedDot( - const minigun::advance::RuntimeConfig& rtcfg, - const CSRWrapper& graph, - GData* gdata); - - /*! * \brief Template declaration for common logics shared by different devices. * @@ -250,6 +224,8 @@ struct BcastGData { int64_t lhs_len{0}, rhs_len{0}; int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0}; int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0}; + // size of data, can be single value or a vector + int64_t data_len; // input data DType *lhs_data{nullptr}, *rhs_data{nullptr}; // input id mappings diff --git a/src/kernel/cpu/binary_masked_dot.cc b/src/kernel/cpu/binary_masked_dot.cc index c1f24ca97bbc..fcbd5284211c 100644 --- a/src/kernel/cpu/binary_masked_dot.cc +++ b/src/kernel/cpu/binary_masked_dot.cc @@ -13,12 +13,12 @@ namespace kernel { #define XPU kDLCPU #define IDX int32_t -EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_MASKED_DOT_DEFINE); +EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE); EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); #undef IDX #define IDX int64_t -EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_MASKED_DOT_DEFINE); +EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE); EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); #undef IDX diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index d657d7404137..fe259cf1dfdb 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -29,6 +29,7 @@ struct BinaryReduce { static inline void ApplyEdge( Idx src, Idx dst, Idx eid, GData* gdata) { const int64_t D = gdata->x_length; + const int64_t len = gdata->data_len Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -41,31 +42,16 @@ struct BinaryReduce { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * D; - DType* rhsoff = gdata->rhs_data + rid * D; + DType* lhsoff = gdata->lhs_data + lid * D * len; + DType* rhsoff = gdata->rhs_data + rid * D * len; DType* outoff = gdata->out_data + oid * D; for (int64_t tx = 0; tx < D; ++tx) { - DType lhs = Functors::Read(lhsoff + tx); - DType rhs = Functors::Read(rhsoff + tx); - DType out = Functors::Op(lhs, rhs); + DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); Functors::Write(outoff + tx, out); } } }; -// Minigun UDF to compute binary reduce. -template -struct BinaryMaskedDot { - static inline bool CondEdge( - Idx src, Idx dst, Idx eid, GData* gdata) { - return true; - } - static inline void ApplyEdge( - Idx src, Idx dst, Idx eid, GData* gdata) { - return true; - } -}; - // Convert flattened index to multi-dimension index (assume row-major). 
inline void Unravel(int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) { @@ -93,6 +79,7 @@ struct BinaryReduceBcast { } static inline void ApplyEdge( Idx src, Idx dst, Idx eid, BcastGData* gdata) { + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -105,17 +92,17 @@ struct BinaryReduceBcast { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len; - DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len; + DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; //data with len size + DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; DType* outoff = gdata->out_data + oid * gdata->out_len; int64_t tmp[NDim]; // store unraveled idx. for (int64_t tx = 0; tx < gdata->out_len; ++tx) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); - DType lhs = Functors::Read(lhsoff + - Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride)); - DType rhs = Functors::Read(rhsoff + - Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride)); - DType out = Functors::Op(lhs, rhs); + DType out = Functors::Op( + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + len); + Functors::Write(outoff + tx, out); } } @@ -138,11 +125,8 @@ struct FunctorsTempl { Idx src, Idx edge, Idx dst) { return RightSelector::Call(src, edge, dst); } - static inline DType Op(DType lhs, DType rhs) { - return BinaryOp::Call(lhs, rhs); - } - static inline DType Read(DType* addr) { - return *addr; + static inline Dtype Op(DType *lhs, Dtype *rhs, int64_t len) { + return BinaryOp::Call(lhs, rhs, len); } static inline void Write(DType* addr, DType val) { Reducer::Call(addr, val); @@ -188,41 +172,6 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } -// Template implementation of BinaryMaksedDot operator. -template -void CallBinaryMaskedDot(const minigun::advance::RuntimeConfig& rtcfg, - const CSRWrapper& graph, - GData* gdata) { - //For binary dot, it should be none reducer. - typedef cpu::FunctorsTempl, ReduceNone> - Functors; - typedef cpu::BinaryMaskedDot UDF; - // csr - auto outcsr = graph.GetOutCSRMatrix(); - minigun::Csr csr = utils::CreateCsr(outcsr.indptr, outcsr.indices); - // If the user-given mapping is none and the target is edge data, we need to - // replace the mapping by the edge ids in the csr graph so that the edge - // data is correctly read/written. - if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) { - gdata->lhs_mapping = static_cast(outcsr.data->data); - } - if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { - gdata->rhs_mapping = static_cast(outcsr.data->data); - } - - // For Masked Matrix Multiply, the output target should be edge. - // If the user-given mapping is none, we need to replace the mapping by the - // edge ids in the csr graph. - if (gdata->out_mapping == nullptr) { - gdata->out_mapping = static_cast(outcsr.data->data); - } - // TODO(minjie): allocator - minigun::advance::Advance, UDF>( - rtcfg, csr, gdata, minigun::IntArray1D()); -} - // Template implementation of BinaryReduce broadcasting operator. 
template * gdata); -// Following macro is used to generate explicit-specialization of the template -// operator. -#define GEN_MASKED_DOT_DEFINE(dtype, lhs_tgt, rhs_tgt) \ - template void CallBinaryMaskedDot( \ - const minigun::advance::RuntimeConfig& rtcfg, \ - const CSRWrapper& graph, \ - GData* gdata); - #define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \ template void CallBinaryReduceBcast* gdata) { const int64_t D = gdata->x_length; int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; - int stride_x = blockDim.x * gridDim.x; + const int64_t stride_x = blockDim.x * gridDim.x; + const int64_t len = gdata->data_len Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -41,32 +42,17 @@ struct BinaryReduce { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * D; - DType* rhsoff = gdata->rhs_data + rid * D; + DType* lhsoff = gdata->lhs_data + lid * D * len; + DType* rhsoff = gdata->rhs_data + rid * D * len; DType* outoff = gdata->out_data + oid * D; while (tx < D) { - DType lhs = Functors::Read(lhsoff + tx); - DType rhs = Functors::Read(rhsoff + tx); - DType out = Functors::Op(lhs, rhs); + DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); Functors::Write(outoff + tx, out); tx += stride_x; } } }; -// Minigun UDF to compute binary reduce. -template -struct BinaryMaskedDot { - static __device__ __forceinline__ bool CondEdge( - Idx src, Idx dst, Idx eid, GData* gdata) { - return true; - } - static __device__ __forceinline__ void ApplyEdge( - Idx src, Idx dst, Idx eid, GData* gdata) { - return; - } -}; - // Convert flattened index to multi-dimension index (assume row-major). __device__ __forceinline__ void Unravel( int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) { @@ -95,7 +81,8 @@ struct BinaryReduceBcast { static __device__ __forceinline__ void ApplyEdge( Idx src, Idx dst, Idx eid, BcastGData* gdata) { int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; - int stride_x = blockDim.x * gridDim.x; + constint64_t stride_x = blockDim.x * gridDim.x; + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -108,17 +95,17 @@ struct BinaryReduceBcast { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len; - DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len; - DType* outoff = gdata->out_data + oid * gdata->out_len; + DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; //data with len size + DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; + DType* outoff = gdata->out_data + oid * gdata->out_len * len; int64_t tmp[NDim]; // store unraveled idx. 
while (tx < gdata->out_len) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); - DType lhs = Functors::Read(lhsoff + - Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride)); - DType rhs = Functors::Read(rhsoff + - Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride)); - DType out = Functors::Op(lhs, rhs); + DType out = Functors::Op( + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + len); + Functors::Write(outoff + tx, out); tx += stride_x; } @@ -142,11 +129,8 @@ struct FunctorsTempl { Idx src, Idx edge, Idx dst) { return RightSelector::Call(src, edge, dst); } - static __device__ __forceinline__ DType Op(DType lhs, DType rhs) { - return BinaryOp::Call(lhs, rhs); - } - static __device__ __forceinline__ DType Read(DType* addr) { - return LDGReader::Call(addr); + static __device__ __forceinline__ DType Op(DType *lhs, DType *rhs, int64_t len) { + return BinaryOp::Call(lhs, rhs, len); } static __device__ __forceinline__ void Write(DType* addr, DType val) { Reducer::Call(addr, val); @@ -191,42 +175,6 @@ void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg, rtcfg, csr, gdata, minigun::IntArray1D()); } -// Template implementation of BinaryMaskedDot operator. -template -void CallBinaryMaskedDot(const minigun::advance::RuntimeConfig& rtcfg, - const CSRWrapper& graph, - GData* gdata) { - //For binary dot, it should be none reducer. - typedef cuda::FunctorsTempl, ReduceNone> - Functors; - typedef cuda::BinaryMaskedDot UDF; - - // csr - auto outcsr = graph.GetOutCSRMatrix(); - minigun::Csr csr = utils::CreateCsr(outcsr.indptr, outcsr.indices); - // If the user-given mapping is none and the target is edge data, we need to - // replace the mapping by the edge ids in the csr graph so that the edge - // data is correctly read/written. - if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) { - gdata->lhs_mapping = static_cast(outcsr.data->data); - } - if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) { - gdata->rhs_mapping = static_cast(outcsr.data->data); - } - - // For Masked Matrix Multiply, the output target should be edge. - // If the user-given mapping is none, we need to replace the mapping by the - // edge ids in the csr graph. - if (gdata->out_mapping == nullptr) { - gdata->out_mapping = static_cast(outcsr.data->data); - } - // TODO(minjie): allocator - minigun::advance::Advance, UDF>( - rtcfg, csr, gdata, minigun::IntArray1D()); -} - // Template implementation of BinaryReduce broadcasting operator. template * gdata); -// Following macro is used to generate explicit-specialization of the template -// operator. 
-#define GEN_MASKED_DOT_DEFINE(dtype, lhs_tgt, rhs_tgt) \ - template void CallBinaryMaskedDot( \ - const minigun::advance::RuntimeConfig& rtcfg, \ - const CSRWrapper& graph, \ - GData* gdata); - #define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \ template void CallBinaryReduceBcast Date: Wed, 4 Sep 2019 21:35:27 +0800 Subject: [PATCH 21/32] New Impl of x_dot_x, reuse binary reduce template --- python/dgl/backend/mxnet/tensor.py | 2 +- python/dgl/backend/pytorch/tensor.py | 2 +- src/kernel/binary_reduce.cc | 4 +- src/kernel/binary_reduce_common.h | 4 +- src/kernel/cpu/backward_binary_reduce_impl.h | 51 +++++++++++-------- src/kernel/cpu/binary_reduce_impl.h | 2 +- .../cuda/backward_binary_reduce_impl.cuh | 51 +++++++++++-------- src/kernel/cuda/binary_reduce_impl.cuh | 2 +- 8 files changed, 67 insertions(+), 51 deletions(-) diff --git a/python/dgl/backend/mxnet/tensor.py b/python/dgl/backend/mxnet/tensor.py index d2d38eab0646..07eee54ec809 100644 --- a/python/dgl/backend/mxnet/tensor.py +++ b/python/dgl/backend/mxnet/tensor.py @@ -364,7 +364,7 @@ def forward(self, lhs_data, rhs_data): lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) feat_shape = K.infer_binary_feature_shape(self.binary_op, lhs_data_nd, rhs_data_nd) - out_data = nd.empty((self.out_size,) + feat_shape, + out_data = nd.empty((self.out_size,) + feat_shape[:-1], ctx=lhs_data.context, dtype=lhs_data.dtype) out_data_nd = zerocopy_to_dgl_ndarray_for_write(out_data) K.binary_op_reduce( diff --git a/python/dgl/backend/pytorch/tensor.py b/python/dgl/backend/pytorch/tensor.py index bc75ed5f0cdb..a9b089f3600c 100644 --- a/python/dgl/backend/pytorch/tensor.py +++ b/python/dgl/backend/pytorch/tensor.py @@ -286,7 +286,7 @@ def forward(ctx, reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) feat_shape = K.infer_binary_feature_shape(binary_op, lhs_data_nd, rhs_data_nd) - out_data = lhs_data.new_empty((out_size,) + feat_shape) + out_data = lhs_data.new_empty((out_size,) + feat_shape[:-1]) out_data_nd = zerocopy_to_dgl_ndarray(out_data) K.binary_op_reduce( reducer, binary_op, graph, lhs, rhs, lhs_data_nd, rhs_data_nd, diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index 265626f63b3e..9cfef5a9c834 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -140,6 +140,7 @@ BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { accum = 0; } std::reverse(ret.real_out_shape.begin(), ret.real_out_shape.end()); + ret.real_out_shape.push_back(ret.data_len); std::reverse(ret.lhs_shape.begin(), ret.lhs_shape.end()); std::reverse(ret.rhs_shape.begin(), ret.rhs_shape.end()); std::reverse(ret.out_shape.begin(), ret.out_shape.end()); @@ -243,9 +244,10 @@ class ImmutableGraphCSRWrapper : public CSRWrapper { std::vector InferBinaryFeatureShape( + const std::string& op, NDArray lhs, NDArray rhs) { - return CalcBcastInfo(lhs, rhs).real_out_shape; + return CalcBcastInfo(op, lhs, rhs).real_out_shape; } DGL_REGISTER_GLOBAL("kernel._CAPI_DGLKernelInferBinaryFeatureShape") diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index a1cd914be03f..ba94a869f434 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -204,10 +204,10 @@ struct BinaryDot { return out; } static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) { - return 1; + return rhs; } static DGLDEVICE 
DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) { - return 1; + return lhs; } }; diff --git a/src/kernel/cpu/backward_binary_reduce_impl.h b/src/kernel/cpu/backward_binary_reduce_impl.h index 1db1e5448c5d..b72ba5b6bdaa 100644 --- a/src/kernel/cpu/backward_binary_reduce_impl.h +++ b/src/kernel/cpu/backward_binary_reduce_impl.h @@ -27,6 +27,7 @@ struct BackwardBinaryReduce { static inline void ApplyEdge( Idx src, Idx dst, Idx eid, BackwardGData* gdata) { const int64_t D = gdata->x_length; + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -39,28 +40,30 @@ struct BackwardBinaryReduce { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * D; - DType* rhsoff = gdata->rhs_data + rid * D; + DType* lhsoff = gdata->lhs_data + lid * D * len; + DType* rhsoff = gdata->rhs_data + rid * D * len; DType* outoff = gdata->out_data + oid * D; - DType* gradlhsoff = gdata->grad_lhs_data + lid * D; - DType* gradrhsoff = gdata->grad_rhs_data + rid * D; + DType* gradlhsoff = gdata->grad_lhs_data + lid * D * len; + DType* gradrhsoff = gdata->grad_rhs_data + rid * D * len; DType* gradoutoff = gdata->grad_out_data + oid * D; for (int64_t tx = 0; tx < D; ++tx) { - DType lhs = Functors::Read(lhsoff + tx); - DType rhs = Functors::Read(rhsoff + tx); DType out = Functors::Read(outoff + tx); DType grad_out = Functors::Read(gradoutoff + tx); - DType e = Functors::Op(lhs, rhs); + DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + for (int64_t i = 0; i < len; ++i) { #pragma omp atomic - gradlhsoff[tx] += grad_lhs; + gradlhsoff[tx * len + i] += grad_lhs; + } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); + for (int64_t i = 0; i < len; ++i) { #pragma omp atomic - gradrhsoff[tx] += grad_rhs; + gradrhsoff[tx * len + i] += grad_rhs; + } } } } @@ -76,6 +79,7 @@ struct BackwardBinaryReduceBcast { } static inline void ApplyEdge( Idx src, Idx dst, Idx eid, BackwardBcastGData* gdata) { + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -88,32 +92,35 @@ struct BackwardBinaryReduceBcast { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len; - DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len; + DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; + DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; DType* outoff = gdata->out_data + oid * gdata->out_len; - DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len; - DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len; + DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len; + DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len; DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len; int64_t tmp[NDim]; // store unraveled idx. 
for (int64_t tx = 0; tx < gdata->out_len; ++tx) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); - DType lhs = Functors::Read(lhsoff + - Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride)); - DType rhs = Functors::Read(rhsoff + - Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride)); DType out = Functors::Read(outoff + tx); DType grad_out = Functors::Read(gradoutoff + tx); - DType e = Functors::Op(lhs, rhs); + DType e = Functors::Op( + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + for (int64_t i = 0; i < len; ++i) { #pragma omp atomic - gradlhsoff[tx] += grad_lhs; + gradlhsoff[tx * len + i] += grad_lhs; + } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); + for (int64_t i = 0; i < len; ++i) { #pragma omp atomic - gradrhsoff[tx] += grad_rhs; + gradrhsoff[tx * len + i] += grad_rhs; + } } } } @@ -137,8 +144,8 @@ struct BackwardFunctorsTempl { Idx src, Idx edge, Idx dst) { return RightSelector::Call(src, edge, dst); } - static inline DType Op(DType lhs, DType rhs) { - return BinaryOp::Call(lhs, rhs); + static inline DType Op(DType* lhs, DType* rhs, int64_t len) { + return BinaryOp::Call(lhs, rhs, len); } static inline DType Read(DType* addr) { return *addr; diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index fe259cf1dfdb..f68d9dea3ec1 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -29,7 +29,7 @@ struct BinaryReduce { static inline void ApplyEdge( Idx src, Idx dst, Idx eid, GData* gdata) { const int64_t D = gdata->x_length; - const int64_t len = gdata->data_len + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index bbe3a9c44ce5..e035b24e1061 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -29,6 +29,7 @@ struct BackwardBinaryReduce { const int64_t D = gdata->x_length; int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; int stride_x = blockDim.x * gridDim.x; + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -41,26 +42,28 @@ struct BackwardBinaryReduce { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * D; - DType* rhsoff = gdata->rhs_data + rid * D; + DType* lhsoff = gdata->lhs_data + lid * D * len; + DType* rhsoff = gdata->rhs_data + rid * D * len; DType* outoff = gdata->out_data + oid * D; - DType* gradlhsoff = gdata->grad_lhs_data + lid * D; - DType* gradrhsoff = gdata->grad_rhs_data + rid * D; + DType* gradlhsoff = gdata->grad_lhs_data + lid * D * len; + DType* gradrhsoff = gdata->grad_rhs_data + rid * D * len; DType* gradoutoff = gdata->grad_out_data + oid * D; while (tx < D) { - DType lhs = Functors::Read(lhsoff + tx); - DType rhs = Functors::Read(rhsoff + tx); DType out = 
Functors::Read(outoff + tx); DType grad_out = Functors::Read(gradoutoff + tx); - DType e = Functors::Op(lhs, rhs); + DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); - AtomicAdd(gradlhsoff + tx, grad_lhs); + for (int64_t i = 0; i < len; ++i) { + AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); + } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); - AtomicAdd(gradrhsoff + tx, grad_rhs); + for (int64_t i = 0; i < len; ++i) { + AtomicAdd(gradrhsoff + tx * len + i, grad_rhs); + } } tx += stride_x; } @@ -78,6 +81,7 @@ struct BackwardBinaryReduceBcast { Idx src, Idx dst, Idx eid, BackwardBcastGData* gdata) { int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; int stride_x = blockDim.x * gridDim.x; + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); @@ -90,30 +94,33 @@ struct BackwardBinaryReduceBcast { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len; - DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len; + DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; + DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; DType* outoff = gdata->out_data + oid * gdata->out_len; - DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len; - DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len; + DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len; + DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len; DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len; int64_t tmp[NDim]; // store unraveled idx. 
while (tx < gdata->out_len) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); - DType lhs = Functors::Read(lhsoff + - Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride)); - DType rhs = Functors::Read(rhsoff + - Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride)); DType out = Functors::Read(outoff + tx); DType grad_out = Functors::Read(gradoutoff + tx); - DType e = Functors::Op(lhs, rhs); + DType e = Functors::Op( + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); - AtomicAdd(gradlhsoff + tx, grad_lhs); + for (int64_t i = 0; i < len; ++i) { + AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); + } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); - AtomicAdd(gradrhsoff + tx, grad_rhs); + for (int64_t i = 0; i < len; ++i) { + AtomicAdd(gradrhsoff + tx * len + i, grad_rhs); + } } tx += stride_x; } @@ -138,8 +145,8 @@ struct BackwardFunctorsTempl { Idx src, Idx edge, Idx dst) { return RightSelector::Call(src, edge, dst); } - static __device__ __forceinline__ DType Op(DType lhs, DType rhs) { - return BinaryOp::Call(lhs, rhs); + static __device__ __forceinline__ DType Op(DType* lhs, DType* rhs, int64_t len) { + return BinaryOp::Call(lhs, rhs, len); } static __device__ __forceinline__ DType Read(DType* addr) { return LDGReader::Call(addr); diff --git a/src/kernel/cuda/binary_reduce_impl.cuh b/src/kernel/cuda/binary_reduce_impl.cuh index 37cb4eca7968..66a87caf1f77 100644 --- a/src/kernel/cuda/binary_reduce_impl.cuh +++ b/src/kernel/cuda/binary_reduce_impl.cuh @@ -29,7 +29,7 @@ struct BinaryReduce { const int64_t D = gdata->x_length; int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; const int64_t stride_x = blockDim.x * gridDim.x; - const int64_t len = gdata->data_len + const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); Idx oid = Functors::SelectOut(src, eid, dst); From faa3b2d9b2bcfb372dfa1847fa881e83f595ac42 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 4 Sep 2019 15:06:23 +0000 Subject: [PATCH 22/32] Compile OK. TODO: 1. make sure x_add_x, x_sub_x, x_mul_x, x_div_x work 2. let x_dot_x work 3. make sure backward of x_add_x, x_sub_x, x_mul_x, x_div_x work 4. 
let x_dot_x backward work --- src/kernel/binary_reduce.cc | 2 +- src/kernel/binary_reduce_common.h | 2 +- src/kernel/binary_reduce_impl.h | 18 ++++++++++--- src/kernel/binary_reduce_impl_decl.h | 4 +++ src/kernel/cpu/backward_binary_reduce_impl.h | 24 ++++++++++++++--- src/kernel/cpu/binary_masked_dot.cc | 26 ------------------- src/kernel/cpu/binary_reduce_impl.h | 2 +- .../cuda/backward_binary_reduce_impl.cuh | 24 ++++++++++++++--- src/kernel/cuda/binary_masked_dot.cu | 20 -------------- src/kernel/cuda/binary_reduce_impl.cuh | 2 +- 10 files changed, 62 insertions(+), 62 deletions(-) delete mode 100644 src/kernel/cpu/binary_masked_dot.cc delete mode 100644 src/kernel/cuda/binary_masked_dot.cu diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index 9cfef5a9c834..3435e0659f26 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -94,7 +94,7 @@ bool HasBcast(NDArray lhs, NDArray rhs) { // See also: BcastInfo (kernel/binary_reduce.h) BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { BcastInfo ret; - const int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1; + int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1; // for dot operation: vector [dot] vector // lhs_shape[ndim-1] == rhs_shape[ndim-1] = sizeof(vector) // out_shape[ndim-1] = 1 diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index ba94a869f434..0c0193116be1 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -196,7 +196,7 @@ struct BinaryUseLhs { template struct BinaryDot { static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { - Dtype out = 0; + DType out = 0; //simple vector dot vector for (int i = 0; i < len; i ++) out += lhs[i] * rhs[i]; diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index 5f64f2dc6ea9..d14215a0f140 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -51,7 +51,7 @@ GData AllocGData(const std::string& op, } // for dot operation: vector [dot] vector - if (op == binary::kDot) { + if (op == binary_op::kDot) { //get size of vector gdata.data_len = lhs_data->shape[lhs_data->ndim - 1]; } else { @@ -119,7 +119,7 @@ void BinaryReduceImpl( template BackwardGData AllocBackwardGData( - const DLContext& ctx, int64_t x_len, + const std::string& op, const DLContext& ctx, int64_t x_len, runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping, runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data, runtime::NDArray grad_out_data, @@ -153,6 +153,14 @@ BackwardGData AllocBackwardGData( if (!utils::IsNoneArray(out_mapping)) { gdata.out_mapping = static_cast(out_mapping->data); } + + // for dot operation: vector [dot] vector + if (op == binary_op::kDot) { + //get size of vector + gdata.data_len = lhs_data->shape[lhs_data->ndim - 1]; + } else { + gdata.data_len = 1; + } return gdata; } @@ -198,7 +206,7 @@ void BackwardBinaryReduceImpl( } DGL_DTYPE_SWITCH(dtype, DType, { DGL_IDX_TYPE_SWITCH(bits, Idx, { - auto gdata = AllocBackwardGData( + auto gdata = AllocBackwardGData(op, rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, out_mapping, lhs_data, rhs_data, out_data, grad_out_data, grad_lhs_data, grad_rhs_data); @@ -250,8 +258,8 @@ BcastGData AllocBcastGData( if (!utils::IsNoneArray(out_mapping)) { gdata.out_mapping = static_cast(out_mapping->data); } - gdata.data_len = info.data_len; + // fill out data with zero values utils::Fill(ctx, gdata.out_data, 
utils::NElements(out_data), Zero::value); return gdata; @@ -347,6 +355,8 @@ BackwardBcastGData AllocBackwardBcastGData( if (!utils::IsNoneArray(out_mapping)) { gdata.out_mapping = static_cast(out_mapping->data); } + gdata.data_len = info.data_len; + // data gdata.lhs_data = static_cast(lhs->data); gdata.rhs_data = static_cast(rhs->data); diff --git a/src/kernel/binary_reduce_impl_decl.h b/src/kernel/binary_reduce_impl_decl.h index 5c799372d7e4..af95b76cbdf5 100644 --- a/src/kernel/binary_reduce_impl_decl.h +++ b/src/kernel/binary_reduce_impl_decl.h @@ -121,6 +121,8 @@ template struct BackwardGData { // length along x(feature) dimension int64_t x_length{0}; + // size of data, can be single value or a vector + int64_t data_len; // number of rows of the output tensor int64_t out_size{0}; // input data @@ -330,6 +332,8 @@ struct BackwardBcastGData { int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0}; int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0}; int64_t out_shape[NDim]{0}, out_stride[NDim]{0}; + // size of data, can be single value or a vector + int64_t data_len; // input id mappings Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr}, *out_mapping{nullptr}; // input data diff --git a/src/kernel/cpu/backward_binary_reduce_impl.h b/src/kernel/cpu/backward_binary_reduce_impl.h index b72ba5b6bdaa..1b4976a4a982 100644 --- a/src/kernel/cpu/backward_binary_reduce_impl.h +++ b/src/kernel/cpu/backward_binary_reduce_impl.h @@ -51,16 +51,23 @@ struct BackwardBinaryReduce { DType grad_out = Functors::Read(gradoutoff + tx); DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); + + DType* lhs_base = lhsoff + tx * len; + DType* rhs_base = rhsoff + tx * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { - DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); #pragma omp atomic gradlhsoff[tx * len + i] += grad_lhs; } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { - DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); #pragma omp atomic gradrhsoff[tx * len + i] += grad_rhs; } @@ -108,16 +115,25 @@ struct BackwardBinaryReduceBcast { rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); + + DType* lhs_base = lhsoff + + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; + DType* rhs_base = rhsoff + + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { - DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); #pragma omp atomic gradlhsoff[tx * len + i] += grad_lhs; } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { - DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType 
grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); #pragma omp atomic gradrhsoff[tx * len + i] += grad_rhs; } diff --git a/src/kernel/cpu/binary_masked_dot.cc b/src/kernel/cpu/binary_masked_dot.cc deleted file mode 100644 index fcbd5284211c..000000000000 --- a/src/kernel/cpu/binary_masked_dot.cc +++ /dev/null @@ -1,26 +0,0 @@ -/*! - * Copyright (c) 2019 by Contributors - * \file kernel/cpu/binary_masked_dot.cc - * \brief CPU kernels for binary reduce prod - */ -#include "./binary_reduce_impl.h" -#include "./backward_binary_reduce_impl.h" - -namespace dgl { -namespace kernel { - -#define REDUCER ReduceNone -#define XPU kDLCPU - -#define IDX int32_t -EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE); -EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); -#undef IDX - -#define IDX int64_t -EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE); -EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE); -#undef IDX - -} // namespace kernel -} // namespace dgl diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index f68d9dea3ec1..cff9e22c418f 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -125,7 +125,7 @@ struct FunctorsTempl { Idx src, Idx edge, Idx dst) { return RightSelector::Call(src, edge, dst); } - static inline Dtype Op(DType *lhs, Dtype *rhs, int64_t len) { + static inline DType Op(DType *lhs, DType *rhs, int64_t len) { return BinaryOp::Call(lhs, rhs, len); } static inline void Write(DType* addr, DType val) { diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index e035b24e1061..871e33dbb3bb 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -53,15 +53,22 @@ struct BackwardBinaryReduce { DType grad_out = Functors::Read(gradoutoff + tx); DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); + + DType* lhs_base = lhsoff + tx * len; + DType* rhs_base = rhsoff + tx * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { - DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { - DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); AtomicAdd(gradrhsoff + tx * len + i, grad_rhs); } } @@ -110,15 +117,24 @@ struct BackwardBinaryReduceBcast { rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); + + DType* lhs_base = lhsoff + + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; + DType* rhs_base = rhsoff + + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { - DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * 
Functors::BackwardOpLhs(lhs, rhs, e); AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { - DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); AtomicAdd(gradrhsoff + tx * len + i, grad_rhs); } } diff --git a/src/kernel/cuda/binary_masked_dot.cu b/src/kernel/cuda/binary_masked_dot.cu deleted file mode 100644 index b3bebfb85fdd..000000000000 --- a/src/kernel/cuda/binary_masked_dot.cu +++ /dev/null @@ -1,20 +0,0 @@ -/*! - * Copyright (c) 2019 by Contributors - * \file kernel/cuda/binary_masked_dot.cu - * \brief CUDA kernels for binary reduce prod - */ -#include "./binary_reduce_impl.cuh" -#include "./backward_binary_reduce_impl.cuh" - -namespace dgl { -namespace kernel { - -#define REDUCER ReduceNone -#define XPU kDLGPU -#define IDX int32_t - -EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE) -EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE) - -} // namespace kernel -} // namespace dgl diff --git a/src/kernel/cuda/binary_reduce_impl.cuh b/src/kernel/cuda/binary_reduce_impl.cuh index 66a87caf1f77..cee51710b960 100644 --- a/src/kernel/cuda/binary_reduce_impl.cuh +++ b/src/kernel/cuda/binary_reduce_impl.cuh @@ -81,7 +81,7 @@ struct BinaryReduceBcast { static __device__ __forceinline__ void ApplyEdge( Idx src, Idx dst, Idx eid, BcastGData* gdata) { int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; - constint64_t stride_x = blockDim.x * gridDim.x; + const int64_t stride_x = blockDim.x * gridDim.x; const int64_t len = gdata->data_len; Idx lid = Functors::SelectLeft(src, eid, dst); Idx rid = Functors::SelectRight(src, eid, dst); From f9a0676a15978429e545fd41baafc0955003e4f1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 03:08:46 +0000 Subject: [PATCH 23/32] Fix code style --- src/kernel/binary_reduce.cc | 4 ++-- src/kernel/binary_reduce_common.h | 2 +- src/kernel/binary_reduce_impl.h | 4 ++-- src/kernel/cpu/backward_binary_reduce_impl.h | 2 +- src/kernel/cpu/binary_reduce_impl.h | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index 3435e0659f26..37800be2fe3e 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -99,10 +99,10 @@ BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { // lhs_shape[ndim-1] == rhs_shape[ndim-1] = sizeof(vector) // out_shape[ndim-1] = 1 if (op == binary_op::kDot) { - //get size of vector + // get size of vector ret.data_len = lhs->shape[lhs->ndim - 1]; --max_ndim; - } else {//op != binary_op::kDot + } else { // op != binary_op::kDot ret.data_len = 1; } diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index 0c0193116be1..b01fdcbb8069 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -197,7 +197,7 @@ template struct BinaryDot { static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { DType out = 0; - //simple vector dot vector + // simple vector dot vector for (int i = 0; i < len; i ++) out += lhs[i] * rhs[i]; diff --git a/src/kernel/binary_reduce_impl.h b/src/kernel/binary_reduce_impl.h index d14215a0f140..5a11ea6c8c2b 100644 --- a/src/kernel/binary_reduce_impl.h +++ b/src/kernel/binary_reduce_impl.h @@ -52,7 +52,7 @@ GData AllocGData(const std::string& op, // 
for dot operation: vector [dot] vector if (op == binary_op::kDot) { - //get size of vector + // get size of vector gdata.data_len = lhs_data->shape[lhs_data->ndim - 1]; } else { gdata.data_len = 1; @@ -156,7 +156,7 @@ BackwardGData AllocBackwardGData( // for dot operation: vector [dot] vector if (op == binary_op::kDot) { - //get size of vector + // get size of vector gdata.data_len = lhs_data->shape[lhs_data->ndim - 1]; } else { gdata.data_len = 1; diff --git a/src/kernel/cpu/backward_binary_reduce_impl.h b/src/kernel/cpu/backward_binary_reduce_impl.h index 1b4976a4a982..0f209dc24d6a 100644 --- a/src/kernel/cpu/backward_binary_reduce_impl.h +++ b/src/kernel/cpu/backward_binary_reduce_impl.h @@ -116,7 +116,7 @@ struct BackwardBinaryReduceBcast { len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); - DType* lhs_base = lhsoff + + DType* lhs_base = lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; diff --git a/src/kernel/cpu/binary_reduce_impl.h b/src/kernel/cpu/binary_reduce_impl.h index cff9e22c418f..7f73765db7c8 100644 --- a/src/kernel/cpu/binary_reduce_impl.h +++ b/src/kernel/cpu/binary_reduce_impl.h @@ -92,15 +92,15 @@ struct BinaryReduceBcast { if (gdata->out_mapping) { oid = Functors::GetId(oid, gdata->out_mapping); } - DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; //data with len size + DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; // data with len size DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; DType* outoff = gdata->out_data + oid * gdata->out_len; int64_t tmp[NDim]; // store unraveled idx. for (int64_t tx = 0; tx < gdata->out_len; ++tx) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); DType out = Functors::Op( - lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, - rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, len); Functors::Write(outoff + tx, out); From 9c00adc5fefaeb16ffe57f003cf970206252bc11 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 03:54:45 +0000 Subject: [PATCH 24/32] Now we can pass the tests/compute/test_kernel.py for add/sub/mul/div forward and backward --- python/dgl/backend/mxnet/tensor.py | 9 ++++++--- python/dgl/backend/pytorch/tensor.py | 9 ++++++--- python/dgl/function/message.py | 5 ++--- python/dgl/kernel.py | 1 + src/kernel/binary_reduce.cc | 4 +++- 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/python/dgl/backend/mxnet/tensor.py b/python/dgl/backend/mxnet/tensor.py index 39b8a3f1ae0d..4760b02acaba 100644 --- a/python/dgl/backend/mxnet/tensor.py +++ b/python/dgl/backend/mxnet/tensor.py @@ -374,7 +374,10 @@ def forward(self, lhs_data, rhs_data): lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) feat_shape = K.infer_binary_feature_shape(self.binary_op, lhs_data_nd, rhs_data_nd) - out_data = nd.empty((self.out_size,) + feat_shape[:-1], + out_shape = feat_shape + if binary_op == 'dot': + out_shape = feat_shape[:-1] + out_data = nd.empty((self.out_size,) + out_shape, ctx=lhs_data.context, dtype=lhs_data.dtype) out_data_nd = zerocopy_to_dgl_ndarray_for_write(out_data) K.binary_op_reduce( @@ -399,10 +402,10 @@ def forward(self, lhs_data, rhs_data): in_ones = nd.ones((n,), 
ctx=lhs_data.context, dtype=lhs_data.dtype) in_ones_nd = zerocopy_to_dgl_ndarray(in_ones) K.copy_reduce( - 'sum', self.graph, target, in_ones_nd, degs_nd, + 'sum', self.graph, target, in_ones_nd, degs_nd, in_map, self.out_map[0]) # reshape - degs = degs.reshape((out_data.shape[0],) + (1,) * (out_data.ndim - 1)).clip(1, float('inf')) + degs = degs.reshape((out_data.shape[0],) + (1,) * (out_data.ndim - 1)).clip(1, float('inf')) out_data = out_data / degs else: degs = None diff --git a/python/dgl/backend/pytorch/tensor.py b/python/dgl/backend/pytorch/tensor.py index a237d694153a..837155fe3869 100644 --- a/python/dgl/backend/pytorch/tensor.py +++ b/python/dgl/backend/pytorch/tensor.py @@ -286,10 +286,13 @@ def forward(ctx, reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data) rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) feat_shape = K.infer_binary_feature_shape(binary_op, lhs_data_nd, rhs_data_nd) - out_data = lhs_data.new_empty((out_size,) + feat_shape[:-1]) + out_shape = feat_shape + if binary_op == 'dot': + out_shape = feat_shape[:-1] + out_data = lhs_data.new_empty((out_size,) + out_shape) out_data_nd = zerocopy_to_dgl_ndarray(out_data) K.binary_op_reduce( - reducer if reducer != 'mean' else 'sum', + reducer if reducer != 'mean' else 'sum', binary_op, graph, lhs, rhs, lhs_data_nd, rhs_data_nd, out_data_nd, lhs_map[0], rhs_map[0], out_map[0]) # normalize if mean reducer @@ -308,7 +311,7 @@ def forward(ctx, reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, in_ones = lhs_data.new_ones((n,)) in_ones_nd = zerocopy_to_dgl_ndarray(in_ones) K.copy_reduce( - 'sum', graph, target, in_ones_nd, degs_nd, in_map, out_map[0]) + 'sum', graph, target, in_ones_nd, degs_nd, in_map, out_map[0]) # reshape degs = degs.reshape((out_data.shape[0],) + (1,) * (out_data.dim() - 1)).clamp(min=1) out_data = out_data / degs diff --git a/python/dgl/function/message.py b/python/dgl/function/message.py index 91c98485546b..a1785ca0411d 100644 --- a/python/dgl/function/message.py +++ b/python/dgl/function/message.py @@ -154,8 +154,8 @@ def copy_e(e, out): # v_add_e, v_sub_e, v_mul_e, v_div_e # e_add_u, e_sub_u, e_mul_u, e_div_u # e_add_v, e_sub_v, e_mul_v, e_div_v -# -# masked-mm message functions: +# +# dot message functions: # u_dot_v, u_dot_e, v_dot_e _TARGET_MAP = { @@ -209,7 +209,6 @@ def _register_builtin_message_func(): setattr(sys.modules[__name__], func.__name__, func) __all__.append(func.__name__) - """Register builtin masked-mm functions""" for lhs, rhs in product(["u", "v"], ["v", "e"]): if lhs != rhs: for binary_op in ["dot"]: diff --git a/python/dgl/kernel.py b/python/dgl/kernel.py index 7abe2d38bf4e..4de76490e8d5 100644 --- a/python/dgl/kernel.py +++ b/python/dgl/kernel.py @@ -4,6 +4,7 @@ from ._ffi.function import _init_api from .ndarray import empty +# pylint: disable=invalid-name def infer_binary_feature_shape(op, lhs, rhs): """Infer the output feature shape after a binary operation between lhs and rhs. 
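A rough usage sketch of the dot builtins wired up above (illustrative only, not part of the patch; the toy graph, feature names and sizes are invented, and a PyTorch backend is assumed):

    import torch
    import dgl
    import dgl.function as fn

    g = dgl.DGLGraph()
    g.add_nodes(3)
    g.add_edges([0, 0, 0, 1, 1, 2], [0, 1, 2, 1, 2, 2])

    # one length-4 feature vector per node
    g.ndata['h'] = torch.randn(3, 4)

    # u_dot_v emits, for every edge (u, v), the dot product <h_u, h_v>;
    # the sum reducer then aggregates the per-edge scores into each
    # destination node, exercising the binary_op_reduce path above.
    g.update_all(fn.u_dot_v('h', 'h', 'm'), fn.sum('m', 'score'))
    print(g.ndata['score'].shape)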
diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index fd04e1e84834..064f08fdad60 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -141,7 +141,9 @@ BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { accum = 0; } std::reverse(ret.real_out_shape.begin(), ret.real_out_shape.end()); - ret.real_out_shape.push_back(ret.data_len); + if (op == binary_op::kDot) { + ret.real_out_shape.push_back(ret.data_len); + } std::reverse(ret.lhs_shape.begin(), ret.lhs_shape.end()); std::reverse(ret.rhs_shape.begin(), ret.rhs_shape.end()); std::reverse(ret.out_shape.begin(), ret.out_shape.end()); From 037b1425bde51acfb45c78e8c2a3d7714237898e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 04:47:01 +0000 Subject: [PATCH 25/32] Fix mxnet test code --- python/dgl/backend/mxnet/tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dgl/backend/mxnet/tensor.py b/python/dgl/backend/mxnet/tensor.py index 4760b02acaba..6db926b3b0bc 100644 --- a/python/dgl/backend/mxnet/tensor.py +++ b/python/dgl/backend/mxnet/tensor.py @@ -375,7 +375,7 @@ def forward(self, lhs_data, rhs_data): rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data) feat_shape = K.infer_binary_feature_shape(self.binary_op, lhs_data_nd, rhs_data_nd) out_shape = feat_shape - if binary_op == 'dot': + if self.binary_op == 'dot': out_shape = feat_shape[:-1] out_data = nd.empty((self.out_size,) + out_shape, ctx=lhs_data.context, dtype=lhs_data.dtype) From c59de935e67dc490d3c9b68a1511e645485be6a0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 07:25:33 +0000 Subject: [PATCH 26/32] Add u_dot_v, u_dot_e, v_dot_e unitest. --- src/kernel/binary_reduce.cc | 14 ++-- .../cuda/backward_binary_reduce_impl.cuh | 4 +- src/kernel/cuda/binary_reduce_impl.cuh | 6 +- tests/backend/backend_unittest.py | 4 ++ tests/backend/mxnet/__init__.py | 3 + tests/backend/pytorch/__init__.py | 3 + tests/compute/test_kernel.py | 70 +++++++++++++------ 7 files changed, 72 insertions(+), 32 deletions(-) diff --git a/src/kernel/binary_reduce.cc b/src/kernel/binary_reduce.cc index 064f08fdad60..9a87093eec3e 100644 --- a/src/kernel/binary_reduce.cc +++ b/src/kernel/binary_reduce.cc @@ -95,20 +95,23 @@ bool HasBcast(NDArray lhs, NDArray rhs) { // See also: BcastInfo (kernel/binary_reduce.h) BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { BcastInfo ret; - int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1; + const int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1; + int64_t accum = 0; + int j = 0; // for dot operation: vector [dot] vector // lhs_shape[ndim-1] == rhs_shape[ndim-1] = sizeof(vector) // out_shape[ndim-1] = 1 if (op == binary_op::kDot) { // get size of vector ret.data_len = lhs->shape[lhs->ndim - 1]; - --max_ndim; + // skip vector size dim + ++j; + ret.real_out_shape.push_back(ret.data_len); } else { // op != binary_op::kDot ret.data_len = 1; } - int64_t accum = 0; - for (int j = 0; j < max_ndim; ++j) { + for (; j < max_ndim; ++j) { const int dl = (lhs->ndim - 1 - j < 1)? 1 : lhs->shape[lhs->ndim - 1 - j]; const int dr = (rhs->ndim - 1 - j < 1)? 
1 : rhs->shape[rhs->ndim - 1 - j]; if (dl != dr) { @@ -141,9 +144,6 @@ BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) { accum = 0; } std::reverse(ret.real_out_shape.begin(), ret.real_out_shape.end()); - if (op == binary_op::kDot) { - ret.real_out_shape.push_back(ret.data_len); - } std::reverse(ret.lhs_shape.begin(), ret.lhs_shape.end()); std::reverse(ret.rhs_shape.begin(), ret.rhs_shape.end()); std::reverse(ret.out_shape.begin(), ret.out_shape.end()); diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index 871e33dbb3bb..644171826503 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -59,7 +59,7 @@ struct BackwardBinaryReduce { if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); - DType rhs = Functors::Read(rhs_base + i); + DType rhs = Functors::Read(rhs_base + i); DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); } @@ -118,7 +118,7 @@ struct BackwardBinaryReduceBcast { len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); - DType* lhs_base = lhsoff + + DType* lhs_base = lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; diff --git a/src/kernel/cuda/binary_reduce_impl.cuh b/src/kernel/cuda/binary_reduce_impl.cuh index cee51710b960..3197ed3dd255 100644 --- a/src/kernel/cuda/binary_reduce_impl.cuh +++ b/src/kernel/cuda/binary_reduce_impl.cuh @@ -97,13 +97,13 @@ struct BinaryReduceBcast { } DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len; //data with len size DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len; - DType* outoff = gdata->out_data + oid * gdata->out_len * len; + DType* outoff = gdata->out_data + oid * gdata->out_len; int64_t tmp[NDim]; // store unraveled idx. 
while (tx < gdata->out_len) { Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); DType out = Functors::Op( - lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, - rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, + lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, + rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, len); Functors::Write(outoff + tx, out); diff --git a/tests/backend/backend_unittest.py b/tests/backend/backend_unittest.py index d80854bdcdf2..0caecb960548 100644 --- a/tests/backend/backend_unittest.py +++ b/tests/backend/backend_unittest.py @@ -115,6 +115,10 @@ def matmul(a, b): """Compute Matrix Multiplication between a and b""" pass +def dot(a, b): + """Compute Dot between a and b""" + pass + ############################################################################### # Tensor functions used *only* on index tensor # ---------------- diff --git a/tests/backend/mxnet/__init__.py b/tests/backend/mxnet/__init__.py index 4b2f6a068d9a..e36add88ddea 100644 --- a/tests/backend/mxnet/__init__.py +++ b/tests/backend/mxnet/__init__.py @@ -86,6 +86,9 @@ def prod(x, dim): def matmul(a, b): return nd.dot(a, b) +def dot(a, b): + return nd.sum(mul(a, b), axis=-1) + record_grad = autograd.record diff --git a/tests/backend/pytorch/__init__.py b/tests/backend/pytorch/__init__.py index 01cc2a2e3c3e..099e9efba57d 100644 --- a/tests/backend/pytorch/__init__.py +++ b/tests/backend/pytorch/__init__.py @@ -82,6 +82,9 @@ def prod(x, dim): def matmul(a, b): return a @ b +def dot(a, b): + return sum(mul(a, b), dim=-1) + class record_grad(object): def __init__(self): pass diff --git a/tests/compute/test_kernel.py b/tests/compute/test_kernel.py index 88d80502fad0..0d663b75cc52 100644 --- a/tests/compute/test_kernel.py +++ b/tests/compute/test_kernel.py @@ -26,33 +26,52 @@ def udf_max(nodes): D1 = 5 D2 = 3 D3 = 4 +D4 = 10 # NOTE(xiang): used to dot feature vector builtin = {'sum': fn.sum, 'max': fn.max, 'mean': fn.mean} udf_reduce = {'sum': udf_sum, 'max': udf_max, 'mean': udf_mean} fill_value = {'sum': 0, 'max': float("-inf")} -def generate_feature(g, broadcast='none'): +def generate_feature(g, broadcast='none', binary_op='none'): """Create graph with src, edge, dst feature. 
broadcast can be 'u', 'e', 'v', 'none' """ nv = g.number_of_nodes() ne = g.number_of_edges() - if broadcast == 'e': - u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) - e = F.tensor(np.random.uniform(-1, 1, (ne, D2, 1))) - v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) - elif broadcast == 'u': - u = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1))) - e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) - v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) - elif broadcast == 'v': - u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) - e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) - v = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1))) + if binary_op == 'dot': + if broadcast == 'e': + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D2, 1, D4))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) + elif broadcast == 'u': + u = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1, D4))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3, D4))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) + elif broadcast == 'v': + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3, D4))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1, D4))) + else: + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3, D4))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3, D4))) else: - u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) - e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) - v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + if broadcast == 'e': + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D2, 1))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + elif broadcast == 'u': + u = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + elif broadcast == 'v': + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D2, 1))) + else: + u = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) + e = F.tensor(np.random.uniform(-1, 1, (ne, D1, D2, D3))) + v = F.tensor(np.random.uniform(-1, 1, (nv, D1, D2, D3))) return u, v, e @@ -62,7 +81,7 @@ def _test(red, partial): # NOTE(zihao): add self-loop to avoid zero-degree nodes. # https://github.com/dmlc/dgl/issues/761 g.add_edges(g.nodes(), g.nodes()) - hu, hv, he = generate_feature(g, 'none') + hu, hv, he = generate_feature(g, 'none', 'none') if partial: nid = F.tensor(list(range(0, 100, 2))) @@ -123,7 +142,7 @@ def _test(red, partial): g = dgl.DGLGraph(nx.erdos_renyi_graph(100, 0.1)) # NOTE(zihao): add self-loop to avoid zero-degree nodes. 
g.add_edges(g.nodes(), g.nodes()) - hu, hv, he = generate_feature(g, 'none') + hu, hv, he = generate_feature(g, 'none', 'none') if partial: nid = F.tensor(list(range(0, 100, 2))) @@ -182,7 +201,7 @@ def _print_error(a, b): def test_all_binary_builtins(): def _test(g, lhs, rhs, binary_op, reducer, partial, nid, broadcast='none'): # initialize node/edge features with uniform(-1, 1) - hu, hv, he = generate_feature(g, broadcast) + hu, hv, he = generate_feature(g, broadcast, binary_op) if binary_op == 'div': # op = div # lhs range: [-1, 1] @@ -327,8 +346,19 @@ def _print_error(a, b): for partial in [False, True]: _test(g, lhs, rhs, binary_op, reducer, partial, nid, broadcast=broadcast) - + + for lhs, rhs in product(["u", "v"], ["v", "e"]): + if lhs == rhs: + continue + binary_op = "dot" + for reducer in ["sum", "max", "min", "mean"]: + for broadcast in ["none", lhs, rhs]: + for partial in [False, True]: + _test(g, lhs, rhs, binary_op, reducer, partial, nid, + broadcast=broadcast) + if __name__ == '__main__': #test_copy_src_reduce() #test_copy_edge_reduce() test_all_binary_builtins() + From bed5bcd153265f3da5de4a6bdab7988450047284 Mon Sep 17 00:00:00 2001 From: Song Date: Thu, 5 Sep 2019 15:51:25 +0800 Subject: [PATCH 27/32] Update doc --- docs/source/api/python/function.rst | 3 +++ docs/source/features/builtin.rst | 2 ++ 2 files changed, 5 insertions(+) diff --git a/docs/source/api/python/function.rst b/docs/source/api/python/function.rst index 869eb50db95e..756f471d7a6c 100644 --- a/docs/source/api/python/function.rst +++ b/docs/source/api/python/function.rst @@ -40,6 +40,9 @@ Message functions e_sub_v e_mul_v e_div_v + u_dot_v + u_dot_e + v_dot_e Reduce functions ---------------- diff --git a/docs/source/features/builtin.rst b/docs/source/features/builtin.rst index a6a36145a664..ce521de01f29 100644 --- a/docs/source/features/builtin.rst +++ b/docs/source/features/builtin.rst @@ -98,6 +98,8 @@ Here is a cheatsheet of all the DGL builtins. 
| +----------------------------------------------------+-----------------------+ | | ``e_add_v``, ``e_sub_v``, ``e_mul_v``, ``e_div_v`` | | | +----------------------------------------------------+-----------------------+ +| | ``u_dot_v``, ``u_dot_e``, ``v_dot_e`` | | +| +----------------------------------------------------+-----------------------+ | | ``src_mul_edge`` | alias of ``u_mul_e`` | +-------------------------+----------------------------------------------------+-----------------------+ | Reduce function | ``max`` | | From 056cbf520bdb71be3200b6da5344c2251828bfe1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 14:13:18 +0000 Subject: [PATCH 28/32] Now also support v_dot_u, e_dot_u, e_dot_v --- docs/source/api/python/function.rst | 3 +++ docs/source/features/builtin.rst | 2 ++ python/dgl/function/message.py | 10 ++-------- src/kernel/binary_reduce_common.h | 20 +++++++++++++++++++- tests/compute/test_kernel.py | 12 +----------- 5 files changed, 27 insertions(+), 20 deletions(-) diff --git a/docs/source/api/python/function.rst b/docs/source/api/python/function.rst index 756f471d7a6c..1187496616dc 100644 --- a/docs/source/api/python/function.rst +++ b/docs/source/api/python/function.rst @@ -43,6 +43,9 @@ Message functions u_dot_v u_dot_e v_dot_e + v_dot_u + e_dot_u + e_dot_v Reduce functions ---------------- diff --git a/docs/source/features/builtin.rst b/docs/source/features/builtin.rst index ce521de01f29..558d56a122ce 100644 --- a/docs/source/features/builtin.rst +++ b/docs/source/features/builtin.rst @@ -100,6 +100,8 @@ Here is a cheatsheet of all the DGL builtins. | +----------------------------------------------------+-----------------------+ | | ``u_dot_v``, ``u_dot_e``, ``v_dot_e`` | | | +----------------------------------------------------+-----------------------+ +| | ``v_dot_u``, ``e_dot_u``, ``e_dot_v`` | | +| +----------------------------------------------------+-----------------------+ | | ``src_mul_edge`` | alias of ``u_mul_e`` | +-------------------------+----------------------------------------------------+-----------------------+ | Reduce function | ``max`` | | diff --git a/python/dgl/function/message.py b/python/dgl/function/message.py index a1785ca0411d..206d2d643295 100644 --- a/python/dgl/function/message.py +++ b/python/dgl/function/message.py @@ -157,6 +157,7 @@ def copy_e(e, out): # # dot message functions: # u_dot_v, u_dot_e, v_dot_e +# v_dot_u, e_dot_u, e_dot_v _TARGET_MAP = { "u": TargetCode.SRC, @@ -204,14 +205,7 @@ def _register_builtin_message_func(): target = ["u", "v", "e"] for lhs, rhs in product(target, target): if lhs != rhs: - for binary_op in ["add", "sub", "mul", "div"]: - func = _gen_message_builtin(lhs, rhs, binary_op) - setattr(sys.modules[__name__], func.__name__, func) - __all__.append(func.__name__) - - for lhs, rhs in product(["u", "v"], ["v", "e"]): - if lhs != rhs: - for binary_op in ["dot"]: + for binary_op in ["add", "sub", "mul", "div", "dot"]: func = _gen_message_builtin(lhs, rhs, binary_op) setattr(sys.modules[__name__], func.__name__, func) __all__.append(func.__name__) diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index b01fdcbb8069..a79755ae0883 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -340,6 +340,21 @@ struct BinaryDot { typedef SelectDst LeftType; \ typedef SelectEdge RightType; \ {__VA_ARGS__} \ + } else if (op == kDot && lhs == kDst && rhs == kSrc) { \ + typedef BinaryDot OpType; \ + typedef SelectDst LeftType; \ + typedef 
SelectSrc RightType; \ + {__VA_ARGS__} \ + } else if (op == kDot && lhs == kEdge && rhs == kSrc) { \ + typedef BinaryDot OpType; \ + typedef SelectEdge LeftType; \ + typedef SelectSrc RightType; \ + {__VA_ARGS__} \ + } else if (op == kDot && lhs == kEdge && rhs == kDst) { \ + typedef BinaryDot OpType; \ + typedef SelectEdge LeftType; \ + typedef SelectDst RightType; \ + {__VA_ARGS__} \ } else { \ LOG(FATAL) << "Unsupported operation: op=" << op \ << " lhs=" << lhs << " rhs=" << rhs; \ @@ -370,7 +385,10 @@ struct BinaryDot { MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectNone, BinaryUseLhs)) \ MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryDot)) \ MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryDot)) \ - MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDot)) + MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDot)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectSrc, BinaryDot)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectSrc, BinaryDot)) \ + MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectDst, BinaryDot)) ////////////////////////////////////////////////////////////////////////// // Defines reducer category. Each category is an empty structure. diff --git a/tests/compute/test_kernel.py b/tests/compute/test_kernel.py index 0d663b75cc52..8d159e481f5b 100644 --- a/tests/compute/test_kernel.py +++ b/tests/compute/test_kernel.py @@ -340,23 +340,13 @@ def _print_error(a, b): for lhs, rhs in product(target, target): if lhs == rhs: continue - for binary_op in ["add", "sub", "mul", "div"]: + for binary_op in ["add", "sub", "mul", "div", "dot"]: for reducer in ["sum", "max", "min", "prod", "mean"]: for broadcast in ["none", lhs, rhs]: for partial in [False, True]: _test(g, lhs, rhs, binary_op, reducer, partial, nid, broadcast=broadcast) - for lhs, rhs in product(["u", "v"], ["v", "e"]): - if lhs == rhs: - continue - binary_op = "dot" - for reducer in ["sum", "max", "min", "mean"]: - for broadcast in ["none", lhs, rhs]: - for partial in [False, True]: - _test(g, lhs, rhs, binary_op, reducer, partial, nid, - broadcast=broadcast) - if __name__ == '__main__': #test_copy_src_reduce() #test_copy_edge_reduce() From 9e94b310ec0ecf41fedead8cf52dd6a58d630430 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2019 14:29:00 +0000 Subject: [PATCH 29/32] Add unroll for some loop --- src/kernel/binary_reduce_common.h | 3 +++ src/kernel/cpu/backward_binary_reduce_impl.h | 4 ++++ src/kernel/cuda/backward_binary_reduce_impl.cuh | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/src/kernel/binary_reduce_common.h b/src/kernel/binary_reduce_common.h index a79755ae0883..ee284e2cb9ef 100644 --- a/src/kernel/binary_reduce_common.h +++ b/src/kernel/binary_reduce_common.h @@ -198,6 +198,7 @@ struct BinaryDot { static DGLDEVICE DGLINLINE DType Call(DType *lhs, DType *rhs, int64_t len) { DType out = 0; // simple vector dot vector +#pragma unroll for (int i = 0; i < len; i ++) out += lhs[i] * rhs[i]; @@ -220,6 +221,8 @@ struct BinaryDot { // - Div(Src, Dst), Div(Src, Edge), Div(Dst, Edge) // Div(Dst, Src), Div(Edge, Src), Div(Edge, Dst) // - UseLhs(Src, None), UseLhs(Edge, None) +// - Dot(Src, Dst), Dot(Src, Edge), Dot(Dst, Edge) +// - Dot(Dst, Src), Dot(Edge, Src), Dot(Edge, Dst) // Note that for commutative operators (e.g. Add and Mul), we only generate // kernels for lhs code smaller than rhs code. #define OP_TARGET_SWITCH(op, lhs, rhs, DType, OpType, LeftType, RightType, ...) 
\ diff --git a/src/kernel/cpu/backward_binary_reduce_impl.h b/src/kernel/cpu/backward_binary_reduce_impl.h index 0f209dc24d6a..7e5ea940aa73 100644 --- a/src/kernel/cpu/backward_binary_reduce_impl.h +++ b/src/kernel/cpu/backward_binary_reduce_impl.h @@ -55,6 +55,7 @@ struct BackwardBinaryReduce { DType* lhs_base = lhsoff + tx * len; DType* rhs_base = rhsoff + tx * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -64,6 +65,7 @@ struct BackwardBinaryReduce { } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -121,6 +123,7 @@ struct BackwardBinaryReduceBcast { DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -130,6 +133,7 @@ struct BackwardBinaryReduceBcast { } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index 644171826503..1bc98e6041bc 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -57,6 +57,7 @@ struct BackwardBinaryReduce { DType* lhs_base = lhsoff + tx * len; DType* rhs_base = rhsoff + tx * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -65,6 +66,7 @@ struct BackwardBinaryReduce { } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -123,6 +125,7 @@ struct BackwardBinaryReduceBcast { DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -131,6 +134,7 @@ struct BackwardBinaryReduceBcast { } } if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { +#pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); From 89e61dd8a58b8cf3fc465b4ec32c4afeefdcb573 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 6 Sep 2019 08:48:06 +0000 Subject: [PATCH 30/32] Add some Opt for cuda backward of dot builtin. 
Backward is still slow for dot --- src/kernel/cpu/backward_binary_reduce_impl.h | 34 +++++++++++++------ .../cuda/backward_binary_reduce_impl.cuh | 30 ++++++++++++---- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/src/kernel/cpu/backward_binary_reduce_impl.h b/src/kernel/cpu/backward_binary_reduce_impl.h index 7e5ea940aa73..76a2bad062da 100644 --- a/src/kernel/cpu/backward_binary_reduce_impl.h +++ b/src/kernel/cpu/backward_binary_reduce_impl.h @@ -54,8 +54,17 @@ struct BackwardBinaryReduce { DType* lhs_base = lhsoff + tx * len; DType* rhs_base = rhsoff + tx * len; - if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { -#pragma unroll + if (Mode == binary_op::kGradBoth) { + for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); + DType grad = grad_lhs + grad_rhs; +#pragma omp atomic + gradlhsoff[tx * len + i] += grad; + } + } else if (Mode == binary_op::kGradLhs) { for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -63,9 +72,7 @@ struct BackwardBinaryReduce { #pragma omp atomic gradlhsoff[tx * len + i] += grad_lhs; } - } - if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { -#pragma unroll + } else if (Mode == binary_op::kGradRhs) { for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -122,8 +129,17 @@ struct BackwardBinaryReduceBcast { Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; - if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { -#pragma unroll + if (Mode == binary_op::kGradBoth) { + for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); + DType grad = grad_lhs + grad_rhs; +#pragma omp atomic + gradlhsoff[tx * len + i] += grad; + } + } else if (Mode == binary_op::kGradLhs) { for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); @@ -131,9 +147,7 @@ struct BackwardBinaryReduceBcast { #pragma omp atomic gradlhsoff[tx * len + i] += grad_lhs; } - } - if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { -#pragma unroll + } else if (Mode == binary_op::kGradRhs) { for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); DType rhs = Functors::Read(rhs_base + i); diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index 1bc98e6041bc..af26a143cf73 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -56,7 +56,17 @@ struct BackwardBinaryReduce { DType* lhs_base = lhsoff + tx * len; DType* rhs_base = rhsoff + tx * len; - if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { + if (Mode == binary_op::kGradBoth) { +#pragma unroll + for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + DType grad_rhs = grad_e * 
Functors::BackwardOpRhs(lhs, rhs, e); + DType grad = grad_lhs + grad_rhs; + AtomicAdd(gradlhsoff + tx * len + i, grad); + } + } else if (Mode == binary_op::kGradLhs) { #pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); @@ -64,8 +74,7 @@ struct BackwardBinaryReduce { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); } - } - if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { + } else if (Mode == binary_op::kGradRhs) { #pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); @@ -124,7 +133,17 @@ struct BackwardBinaryReduceBcast { Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; - if (Mode == binary_op::kGradLhs || Mode == binary_op::kGradBoth) { + if (Mode == binary_op::kGradBoth) { +#pragma unroll + for (int64_t i = 0; i < len; ++i) { + DType lhs = Functors::Read(lhs_base + i); + DType rhs = Functors::Read(rhs_base + i); + DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); + DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e); + DType grad = grad_lhs + grad_rhs; + AtomicAdd(gradlhsoff + tx * len + i, grad); + } + } else if (Mode == binary_op::kGradLhs) { #pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); @@ -132,8 +151,7 @@ struct BackwardBinaryReduceBcast { DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e); AtomicAdd(gradlhsoff + tx * len + i, grad_lhs); } - } - if (Mode == binary_op::kGradRhs || Mode == binary_op::kGradBoth) { + } else if (Mode == binary_op::kGradRhs) { #pragma unroll for (int64_t i = 0; i < len; ++i) { DType lhs = Functors::Read(lhs_base + i); From c96338a133ec4b4a1373fc970bb3f9bc8f32ca2f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 9 Sep 2019 02:50:55 +0000 Subject: [PATCH 31/32] Apply UnravelRavel opt for broadcast backward --- .../cuda/backward_binary_reduce_impl.cuh | 36 +++++-------------- src/kernel/cuda/binary_reduce_impl.cuh | 5 ++- 2 files changed, 10 insertions(+), 31 deletions(-) diff --git a/src/kernel/cuda/backward_binary_reduce_impl.cuh b/src/kernel/cuda/backward_binary_reduce_impl.cuh index de720b67c064..fddac5f2da14 100644 --- a/src/kernel/cuda/backward_binary_reduce_impl.cuh +++ b/src/kernel/cuda/backward_binary_reduce_impl.cuh @@ -88,24 +88,6 @@ struct BackwardBinaryReduce { } }; -// Convert flattened index to multi-dimension index (assume row-major). -__device__ __forceinline__ void Unravel( - int64_t idx, int ndim, const int64_t* shape, const int64_t* stride, int64_t* out) { - for (int d = 0; d < ndim; ++d) { - out[d] = (idx / stride[d]) % shape[d]; - } -} - -// Convert multi-dimension index to flattened index (assume row-major). -__device__ __forceinline__ int64_t Ravel( - const int64_t* idx, int ndim, const int64_t* shape, const int64_t* stride) { - int64_t out = 0; - for (int d = 0; d < ndim; ++d) { - out += min(idx[d], shape[d] - 1) * stride[d]; - } - return out; -} - // Minigun UDF to compute backward binary reduce with broadcasting. template struct BackwardBinaryReduceBcast { @@ -136,21 +118,19 @@ struct BackwardBinaryReduceBcast { DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len; DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len; DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len; - int64_t tmp[NDim]; // store unraveled idx. 
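    // UnravelRavel fuses the former Unravel + two Ravel calls: it decodes the
    // flat output index `tx` and accumulates the matching lhs/rhs offsets
    // (`lhs_add`, `rhs_add`) in a single pass, handling broadcast (size-1)
    // dimensions, so the unraveled index array no longer has to be stored.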
while (tx < gdata->out_len) { - Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp); + int64_t lhs_add = 0; + int64_t rhs_add = 0; + UnravelRavel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, + gdata->lhs_shape, gdata->lhs_stride, + gdata->rhs_shape, gdata->rhs_stride, &lhs_add, &rhs_add); DType out = Functors::Read(outoff + tx); DType grad_out = Functors::Read(gradoutoff + tx); - DType e = Functors::Op( - lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len, - rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len, - len); + DType e = Functors::Op(lhsoff + lhs_add * len, rhsoff + rhs_add * len, len); DType grad_e = grad_out * Functors::BackwardWrite(e, out); - DType* lhs_base = lhsoff + - Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len; - DType* rhs_base = rhsoff + - Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len; + DType* lhs_base = lhsoff + lhs_add * len; + DType* rhs_base = rhsoff + rhs_add * len; if (Mode == binary_op::kGradBoth) { #pragma unroll for (int64_t i = 0; i < len; ++i) { diff --git a/src/kernel/cuda/binary_reduce_impl.cuh b/src/kernel/cuda/binary_reduce_impl.cuh index ea942fa104ee..ec1bf2e1c8e8 100644 --- a/src/kernel/cuda/binary_reduce_impl.cuh +++ b/src/kernel/cuda/binary_reduce_impl.cuh @@ -71,13 +71,12 @@ __device__ __forceinline__ void UnravelRavel( int64_t o_st = out_stride[d]; int64_t rhs_sh = rhs_shape[d]; int64_t rhs_st = rhs_stride[d]; - int64_t i = (idx / o_st) % o_sh; /* * Simplfied for rhs_out += min(i, rhs_sh - 1) * rhs_st; * rhs_sh be o_sh or 1 */ - if (rhs_sh > i) { + if (rhs_sh > i) { *rhs_out += i * rhs_st; } } @@ -89,7 +88,7 @@ __device__ __forceinline__ void UnravelRavel( int64_t o_st = out_stride[d]; int64_t lhs_sh = lhs_shape[d]; int64_t lhs_st = lhs_stride[d]; - + int64_t i = (idx / o_st) % o_sh; /* * Simplfied for lhs_out += min(i, lhs_sh - 1) * lhs_st; From cb7d1acc99ce71c3f22450e77e98a02384689684 Mon Sep 17 00:00:00 2001 From: Song Date: Wed, 11 Sep 2019 23:02:27 +0800 Subject: [PATCH 32/32] update docstring --- python/dgl/kernel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/dgl/kernel.py b/python/dgl/kernel.py index 4de76490e8d5..0a9444c036cf 100644 --- a/python/dgl/kernel.py +++ b/python/dgl/kernel.py @@ -10,6 +10,8 @@ def infer_binary_feature_shape(op, lhs, rhs): Parameter --------- + op : string + The binary_op name. lhs : dgl.ndarray.NDArray The lhs tensor. rhs : dgl.ndarray.NDArray