Commit

[Relay][TOPI] Support of depthwise conv2d NHWC for Mali/Bifrost. (apache#8584)

* [Relay][TOPI] Support of depthwise conv2d NHWC for Mali/Bifrost.

Added initial tunable autotvm templates for depthwise conv2d with
NHWC layout for Mali and Bifrost.

* [Relay][TOPI] Misc fixes for depthwise conv2d Mali/Bifrost.

- Fix assert for Bifrost.
- Set reasonable default axis splits to avoid using tophub for NHWC.
- Fixed typo: arm cpu -> Mali.

* [Relay][TOPI] Fixed formatting in depthwise conv2d Mali/Bifrost.
AnastasiaStulova authored and ylc committed Sep 29, 2021
1 parent 94e2172 commit 3663e83
Showing 4 changed files with 156 additions and 71 deletions.
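
As a quick way to see the new template in action (a minimal sketch, not taken from the commit — the tensor shapes and the plain "opencl -device=mali" target string are illustrative assumptions), the NHWC compute and schedule can be driven directly through TOPI; with no tuning log available, the schedule falls back to the default axis splits introduced by this patch:

import tvm
from tvm import autotvm, te, topi

target = tvm.target.Target("opencl -device=mali")

# NHWC input and HWOI kernel, matching the layouts asserted by the new strategy.
data = te.placeholder((1, 56, 56, 32), name="data")    # (N, H, W, C)
kernel = te.placeholder((3, 3, 32, 1), name="kernel")  # (H, W, C, multiplier)

with autotvm.tophub.context(target):  # no tuned NHWC entry -> default fallback splits
    with target:
        out = topi.mali.depthwise_conv2d_nhwc(data, kernel, 1, 1, 1, "float32")
        s = topi.mali.schedule_depthwise_conv2d_nhwc([out])

print(tvm.lower(s, [data, kernel, out], simple_mode=True))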
8 changes: 8 additions & 0 deletions python/tvm/relay/op/strategy/bifrost.py
@@ -83,6 +83,14 @@ def conv2d_strategy_bifrost(attrs, inputs, out_type, target):
                 wrap_topi_schedule(topi.bifrost.schedule_depthwise_conv2d_nchw),
                 name="depthwise_conv2d_nchw.bifrost",
             )
+        elif layout == "NHWC":
+            assert kernel_layout == "HWOI"
+            # For now just reuse general Mali strategy.
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.mali.depthwise_conv2d_nhwc),
+                wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nhwc),
+                name="depthwise_conv2d_nchw.bifrost",
+            )
         else:
             raise RuntimeError(
                 "Unsupported depthwise_conv2d layout {} for Mali(Bifrost)".format(layout)
17 changes: 10 additions & 7 deletions python/tvm/relay/op/strategy/mali.py
@@ -120,14 +120,17 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target):
elif layout == "NHWC":
assert kernel_layout == "HWOI"
if not is_auto_scheduler_enabled():
raise RuntimeError(
"depthwise_conv2d NHWC layout is not enabled for mali without auto_scheduler."
strategy.add_implementation(
wrap_compute_conv2d(topi.mali.depthwise_conv2d_nhwc),
wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nhwc),
name="depthwise_conv2d_nhwc.mali",
)
else:
strategy.add_implementation(
wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
naive_schedule,
name="depthwise_conv2d_nhwc.mali",
)
strategy.add_implementation(
wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
naive_schedule,
name="depthwise_conv2d_nhwc.mali",
)
else:
raise RuntimeError("Unsupported depthwise_conv2d layout {} for mali".format(layout))
else: # group_conv2d
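
At the Relay level, an NHWC depthwise convolution now lowers through the tunable Mali template whenever the auto-scheduler is not enabled. A rough sketch (the shapes are made up for illustration and an OpenCL-enabled TVM build is assumed):

import tvm
from tvm import relay

data = relay.var("data", shape=(1, 56, 56, 32))
weight = relay.var("weight", shape=(3, 3, 32, 1))
# groups == channels makes this a depthwise conv2d with NHWC data and HWOI kernel.
conv = relay.nn.conv2d(
    data,
    weight,
    channels=32,
    groups=32,
    kernel_size=(3, 3),
    data_layout="NHWC",
    kernel_layout="HWOI",
)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="opencl -device=mali")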
200 changes: 136 additions & 64 deletions python/tvm/topi/mali/depthwise_conv2d.py
@@ -30,7 +30,7 @@ def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype):
     return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
 
 
-# register customized schedule for arm cpu.
+# register customized schedule for Mali.
 @autotvm.register_topi_schedule("depthwise_conv2d_nchw.mali")
 def schedule_depthwise_conv2d_nchw(cfg, outs):
     """Schedule depthwise conv2d
@@ -51,86 +51,158 @@ def schedule_depthwise_conv2d_nchw(cfg, outs):
     outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
     s = te.create_schedule([x.op for x in outs])
 
-    def _schedule(pad_data, kernel, conv):
-        """schedule depthwise_conv2d"""
-        max_unroll = 16
-        vec_size = [1, 2, 4, 8, 16]
+    def _callback(op):
+        """traverse to find op to schedule"""
+        # schedule depthwise_conv2d
+        if op.tag == "depthwise_conv2d_nchw":
+            pad_data = op.input_tensors[0]
+            kernel = op.input_tensors[1]
+            conv = op.output(0)
+            _schedule(cfg, s, pad_data, kernel, conv, "NCHW")
 
-        ##### space definition begin #####
-        n, c, y, x = s[conv].op.axis
-        bc, tc, ci = cfg.define_split("tile_c", c, num_outputs=3)
-        by, ty, yi = cfg.define_split("tile_y", y, num_outputs=3)
-        bx, tx, xi = cfg.define_split("tile_x", x, num_outputs=3)
-        cfg.define_annotate("ann_spatial", [ci, yi, xi], policy="try_unroll_vec")
+    traverse_inline(s, outs[0].op, _callback)
+    return s
 
-        # fallback support
-        if cfg.is_fallback:
-            ref_log = autotvm.tophub.load_reference_log(
-                "mali", "rk3399", "depthwise_conv2d_nchw.mali"
-            )
-            cfg.fallback_with_reference_log(ref_log)
-        ###### space definition end ######
 
-        # schedule padding
-        n, c, y, x = s[pad_data].op.axis
-        tile_and_bind3d(s, pad_data, c, y, x, cfg["tile_c"].size[1], 1, 1)
+# register original implementation of depthwise_conv2d_nhwc since we don't need to change this part
+@autotvm.register_topi_compute("depthwise_conv2d_nhwc.mali")
+def depthwise_conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype):
+    return nn.depthwise_conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
 
-        # schedule dilation
-        if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
-            s[kernel].compute_inline()
-
-        # schedule conv
-        if conv.op not in s.outputs:
-            s[conv].set_scope("local")
-            OL = conv
-            output = s.outputs[0].output(0)
-        else:
-            OL = s.cache_write(conv, "local")
-            output = conv
-
-        n, c, y, x = s[output].op.axis
-        bc, tc, ci = cfg["tile_c"].apply(s, output, c)
-        by, ty, yi = cfg["tile_y"].apply(s, output, y)
-        bx, tx, xi = cfg["tile_x"].apply(s, output, x)
-
-        bc = s[output].fuse(n, bc)
-        s[output].bind(bc, te.thread_axis("blockIdx.z"))
-        s[output].bind(tc, te.thread_axis("threadIdx.z"))
-        s[output].bind(by, te.thread_axis("blockIdx.y"))
-        s[output].bind(ty, te.thread_axis("threadIdx.y"))
-        s[output].bind(bx, te.thread_axis("blockIdx.x"))
-        s[output].bind(tx, te.thread_axis("threadIdx.x"))
-
-        di, dj = s[OL].op.reduce_axis
-        s[OL].unroll(di)
-        s[OL].unroll(dj)
-
-        s[OL].compute_at(s[output], tx)
-        n, ci, yi, xi = s[OL].op.axis
-
-        cfg["ann_spatial"].apply(
-            s,
-            OL,
-            [ci, yi, xi],
-            axis_lens=[cfg["tile_c"].size[2], cfg["tile_y"].size[2], cfg["tile_x"].size[2]],
-            max_unroll=max_unroll,
-            vec_size=vec_size,
-            cfg=cfg,
-        )
+# register customized schedule for Mali.
+@autotvm.register_topi_schedule("depthwise_conv2d_nhwc.mali")
+def schedule_depthwise_conv2d_nhwc(cfg, outs):
+    """Schedule depthwise conv2d
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The configuration of this template
+    outs: Array of Tensor
+        The computation graph description of depthwise convolution2d
+        in the format of an array of tensors.
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for depthwise_conv2d nchw.
+    """
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    s = te.create_schedule([x.op for x in outs])
 
     def _callback(op):
         """traverse to find op to schedule"""
         # schedule depthwise_conv2d
-        if op.tag == "depthwise_conv2d_nchw":
+        if op.tag == "depthwise_conv2d_nhwc":
             pad_data = op.input_tensors[0]
             kernel = op.input_tensors[1]
             conv = op.output(0)
-            _schedule(pad_data, kernel, conv)
+            _schedule(cfg, s, pad_data, kernel, conv, "NHWC")
 
     traverse_inline(s, outs[0].op, _callback)
     return s
 
 
+def _schedule(cfg, s, pad_data, kernel, conv, layout):
+    """schedule depthwise_conv2d"""
+    assert layout in ("NCHW", "NHWC")
+
+    max_unroll = 16
+    vec_size = [1, 2, 4, 8, 16]
+
+    ##### space definition begin #####
+    if layout == "NCHW":
+        n, c, h, w = s[conv].op.axis
+    else:
+        n, h, w, c = s[conv].op.axis
+
+    bc, tc, ci = cfg.define_split("tile_c", c, num_outputs=3)
+    bh, th, hi = cfg.define_split("tile_y", h, num_outputs=3)
+    bw, tw, wi = cfg.define_split("tile_x", w, num_outputs=3)
+    cfg.define_annotate("ann_spatial", [ci, hi, wi], policy="try_unroll_vec")
+
+    # fallback support
+    if cfg.is_fallback:
+        if layout == "NCHW":
+            ref_log = autotvm.tophub.load_reference_log(
+                "mali", "rk3399", "depthwise_conv2d_nchw.mali"
+            )
+            cfg.fallback_with_reference_log(ref_log)
+        else:
+            cfg.fallback_split("tile_c", [-1, 4, 2])
+            cfg.fallback_split("tile_y", [-1, 4, 2])
+            cfg.fallback_split("tile_x", [-1, 4, 2])
+    ###### space definition end ######
+
+    # schedule padding
+    if layout == "NCHW":
+        n, c, h, w = s[pad_data].op.axis
+        z, y, x = c, h, w
+        z_factor, y_factor, x_factor = cfg["tile_c"].size[1], 1, 1
+    else:
+        n, h, w, c = s[pad_data].op.axis
+        z, y, x = h, w, c
+        z_factor, y_factor, x_factor = 1, 1, cfg["tile_c"].size[1]
+    tile_and_bind3d(s, pad_data, z, y, x, z_factor, y_factor, x_factor)
+
+    # schedule dilation
+    if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
+        s[kernel].compute_inline()
+
+    # schedule conv
+    if conv.op not in s.outputs:
+        s[conv].set_scope("local")
+        OL = conv
+        output = s.outputs[0].output(0)
+    else:
+        OL = s.cache_write(conv, "local")
+        output = conv
+
+    if layout == "NCHW":
+        n, c, h, w = s[output].op.axis
+    else:
+        n, h, w, c = s[output].op.axis
+
+    bc, tc, ci = cfg["tile_c"].apply(s, output, c)
+    bh, th, hi = cfg["tile_y"].apply(s, output, h)
+    bw, tw, wi = cfg["tile_x"].apply(s, output, w)
+
+    if layout == "NCHW":
+        bz, tz, by, ty, bx, tx = bc, tc, bh, th, bw, tw
+    else:
+        bz, tz, by, ty, bx, tx = bh, th, bw, tw, bc, tc
+
+    bz = s[output].fuse(n, bz)
+    s[output].bind(bz, te.thread_axis("blockIdx.z"))
+    s[output].bind(tz, te.thread_axis("threadIdx.z"))
+    s[output].bind(by, te.thread_axis("blockIdx.y"))
+    s[output].bind(ty, te.thread_axis("threadIdx.y"))
+    s[output].bind(bx, te.thread_axis("blockIdx.x"))
+    s[output].bind(tx, te.thread_axis("threadIdx.x"))
+
+    di, dj = s[OL].op.reduce_axis
+    s[OL].unroll(di)
+    s[OL].unroll(dj)
+
+    s[OL].compute_at(s[output], tx)
+
+    if layout == "NCHW":
+        n, ci, hi, wi = s[OL].op.axis
+    else:
+        n, hi, wi, ci = s[OL].op.axis
+
+    cfg["ann_spatial"].apply(
+        s,
+        OL,
+        [ci, hi, wi],
+        axis_lens=[cfg["tile_c"].size[2], cfg["tile_y"].size[2], cfg["tile_x"].size[2]],
+        max_unroll=max_unroll,
+        vec_size=vec_size,
+        cfg=cfg,
+    )
+
+
 def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
     """tile and bind 3d"""
     y_factor = y_factor or z_factor
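
tile_and_bind3d is cut off by the diff view here. For orientation, a sketch of what a helper of this shape conventionally does in the Mali TOPI code (an assumption about the unchanged body, not part of this change): each of the three axes is split by its factor and the resulting outer/inner pieces are bound to OpenCL block and thread indices, which is how _schedule parallelizes the padded input above.

def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
    """Split z/y/x by their factors and bind them to block/thread indices (sketch)."""
    y_factor = y_factor or z_factor
    x_factor = x_factor or y_factor
    zo, zi = s[tensor].split(z, z_factor)
    yo, yi = s[tensor].split(y, y_factor)
    xo, xi = s[tensor].split(x, x_factor)
    s[tensor].bind(zo, te.thread_axis("blockIdx.z"))
    s[tensor].bind(zi, te.thread_axis("threadIdx.z"))
    s[tensor].bind(yo, te.thread_axis("blockIdx.y"))
    s[tensor].bind(yi, te.thread_axis("threadIdx.y"))
    s[tensor].bind(xo, te.thread_axis("blockIdx.x"))
    s[tensor].bind(xi, te.thread_axis("threadIdx.x"))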
2 changes: 2 additions & 0 deletions tests/python/topi/python/test_topi_depthwise_conv2d.py
@@ -61,6 +61,8 @@
             )
         ],
         "gpu": [(topi.nn.depthwise_conv2d_nhwc, topi.cuda.schedule_depthwise_conv2d_nhwc)],
+        "mali": [(topi.mali.depthwise_conv2d_nhwc, topi.mali.schedule_depthwise_conv2d_nhwc)],
+        "bifrost": [(topi.mali.depthwise_conv2d_nhwc, topi.mali.schedule_depthwise_conv2d_nhwc)],
     },
     "NCHWc": {
         "generic": [(topi.x86.depthwise_conv2d_NCHWc, topi.x86.schedule_depthwise_conv2d_NCHWc)],
