Skip to content

Commit

Permalink
[AUTOTVM] TOPI integration for ARM CPU (apache#1487)
Browse files Browse the repository at this point in the history
  • Loading branch information
merrymercy authored and sergei-mironov committed Aug 8, 2018
1 parent 21b3c07 commit 0354c31
Show file tree
Hide file tree
Showing 78 changed files with 3,504 additions and 2,306 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -188,3 +188,6 @@ build*

# Jetbrain
.idea

# tmp file
.nfs*
70 changes: 70 additions & 0 deletions apps/benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Performance Benchmark

## Results

See results on the wiki page https://github.com/dmlc/tvm/wiki/Benchmark

## How to Reproduce

### ARM CPU
We use RPC infrastructure in TVM to make device management easy. So you need to use it for reproducing benchmark results.

1. Start an RPC Tracker on the host machine
```bash
python3 -m tvm.exec.rpc_tracker
```

2. Register devices to the tracker
* For Linux device
* Build tvm runtime on your device [Help](https://docs.tvm.ai/tutorials/nnvm/deploy_model_on_rasp.html#build-tvm-runtime-on-device)
* Register your device to tracker by
```bash
python3 -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=[DEVICE_KEY]
```
replace `[HOST_IP]` with the IP address of the host machine, `[DEVICE_KEY]` with the name of device.

E.g. Here is an example command for RK3399,
`python3 -m tvm.exec.rpc_server --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker.

* For Android device
* Build and install the tvm RPC apk on your device [Help](https://github.com/dmlc/tvm/tree/master/apps/android_rpc).
Make sure you can pass the android rpc test. Then you already know how to register.

3. Verify the device registration
We can query all registered devices by
```bash
python3 -m tvm.exec.query_rpc_tracker
```
You should be able to find your devices in `Queue Status`. Make sure the registration is correct before going ahead.

For our test environment, one sample output can be
```bash
Queue Status
------------------------------
key free pending
------------------------------
mate10pro 1 0
p20pro 2 0
pixel2 2 0
rk3399 2 0
rasp3b 8 0
```

4. Run benchmark
We did auto-tuning for Huawei P20/Mate10 Pro, Google Pixel2, Raspberry Pi3 and Firefly-RK3399,
and release pre-tuned parameters in [this repo](https://github.com/uwsaml/tvm-distro).
During compilation, TVM will download these operator parameters automatically.

```bash
python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key rasp3b
python3 arm_cpu_imagenet_bench.py --device rk3399 --rpc-key rk3399
python3 arm_cpu_imagenet_bench.py --device pixel2 --rpc-key pixel2
python3 arm_cpu_imagenet_bench.py --device p20pro --rpc-key p20pro
python3 arm_cpu_imagenet_bench.py --device mate10pro --rpc-key mate10pro
```

If your device has the same SoC as one of the devices above, you can reuse these parameters
(e.g. use `llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu` as target).
Otherwise, you need to tune for your own device, please follow this
[tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html).

96 changes: 96 additions & 0 deletions apps/benchmark/arm_cpu_imagenet_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""Benchmark script for performance on ARM CPU.
see README.md for the usage and results of this script.
"""

import argparse
import time

import numpy as np

import nnvm.testing
import nnvm.compiler
import tvm
from tvm import autotvm
from tvm.contrib.util import tempdir
import tvm.contrib.graph_runtime as runtime

def get_network(name, batch_size):
    """Return (net, params, input_shape, output_shape) for a named network.

    Parameters
    ----------
    name : str
        One of 'resnet-18', 'mobilenet', 'squeezenet v1.1', 'vgg-16'.
    batch_size : int
        Batch dimension of the input/output shapes.

    Raises
    ------
    RuntimeError
        If `name` is not a supported network.
    """
    input_shape = (batch_size, 3, 224, 224)
    output_shape = (batch_size, 1000)

    # Dispatch table: network name -> workload loader.
    loaders = {
        'resnet-18': lambda: nnvm.testing.resnet.get_workload(
            num_layers=18, batch_size=batch_size, image_shape=(3, 224, 224)),
        'mobilenet': lambda: nnvm.testing.mobilenet.get_workload(
            batch_size=batch_size),
        'squeezenet v1.1': lambda: nnvm.testing.squeezenet.get_workload(
            batch_size=batch_size, version='1.1'),
        'vgg-16': lambda: nnvm.testing.vgg.get_workload(
            batch_size=batch_size, num_layers=16),
    }
    if name not in loaders:
        raise RuntimeError("Unsupported network: " + name)

    net, params = loaders[name]()
    return net, params, input_shape, output_shape


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--network", type=str,
                        choices=['resnet-18', 'mobilenet', 'squeezenet v1.1', 'vgg-16'])
    parser.add_argument("--device", type=str, required=True,
                        choices=['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro',
                                 'pixel2', 'rasp3b', 'pynq'])
    parser.add_argument("--host", type=str, default='localhost')
    parser.add_argument("--port", type=int, default=9190)
    parser.add_argument("--rpc-key", type=str, required=True)
    parser.add_argument("--number", type=int, default=6)
    args = parser.parse_args()

    dtype = 'float32'

    # Benchmark the whole suite unless a single network was requested.
    if args.network is None:
        networks = ['squeezenet v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
    else:
        networks = [args.network]

    target = tvm.target.arm_cpu(model=args.device)

    # Request a session on a remote device through the RPC tracker.
    tracker = tvm.rpc.connect_tracker(args.host, args.port)
    remote = tracker.request(args.rpc_key)

    print("--------------------------------------------------")
    print(f"{'Network Name':<20} {'Mean Inference Time (std dev)':<20}")
    print("--------------------------------------------------")
    for network in networks:
        net, params, input_shape, output_shape = get_network(network, batch_size=1)

        with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, shape={'data': input_shape},
                params=params, dtype=dtype)

        # Export the compiled library in the format the remote runtime expects:
        # an NDK-linked shared object for Android, a plain tar otherwise.
        tmp = tempdir()
        if 'android' in str(target):
            from tvm.contrib import ndk
            filename = "%s.so" % network
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "%s.tar" % network
            lib.export_library(tmp.relpath(filename))

        # Upload the library and copy the parameters onto the device.
        ctx = remote.context(str(target), 0)
        remote.upload(tmp.relpath(filename))
        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

        rlib = remote.load_module(filename)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array(np.random.uniform(size=input_shape).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**rparams)

        # Evaluate: time_evaluator returns seconds; report milliseconds.
        ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
        prof_res = np.array(ftimer().results) * 1000
        print(f"{network:<20} {'%.2f ms' % np.mean(prof_res):<19} "
              f"({np.std(prof_res):.2f} ms)")

76 changes: 0 additions & 76 deletions apps/benchmark/rasp_imagenet_bench.py

This file was deleted.

12 changes: 12 additions & 0 deletions docs/api/python/autotvm.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ tvm.autotvm.tuner
.. automodule:: tvm.autotvm.tuner.callback
:members:

.. automodule:: tvm.autotvm.tuner.graph_tuning
:members:

tvm.autotvm.task
~~~~~~~~~~~~~~~~
.. automodule:: tvm.autotvm.task
Expand All @@ -55,6 +58,15 @@ tvm.autotvm.task
.. automodule:: tvm.autotvm.task.space
:members:

.. automodule:: tvm.autotvm.task.dispatcher
:members:

.. automodule:: tvm.autotvm.task.topi_integration
:members:

.. automodule:: tvm.autotvm.task.nnvm_integration
:members:

tvm.autotvm.record
~~~~~~~~~~~~~~~~~~
.. automodule:: tvm.autotvm.record
Expand Down
4 changes: 3 additions & 1 deletion docs/install/from_source.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ The configuration of tvm can be modified by `config.cmake`.
- Edit ``build/config.cmake`` to customize the compilation options

- On macOS, for some versions of XCode, you need to add ``-lc++abi`` in the LDFLAGS or you'll get link errors.
- Change ``set(USE_CUDA OFF)`` to ``set(USE_CUDA ON)`` to enable CUDA backend. So do other backends and libraries
(OpenCL, RCOM, METAL, VULKAN, ...).

- TVM optionally depends on LLVM. LLVM is required for CPU codegen that needs LLVM.

Expand All @@ -84,7 +86,7 @@ The configuration of tvm can be modified by `config.cmake`.
cmake ..
make -j4
If everything goes well, we can go to :ref:`python-package-installation`_
If everything goes well, we can go to :ref:`python-package-installation`

Building on Windows
~~~~~~~~~~~~~~~~~~~
Expand Down
71 changes: 71 additions & 0 deletions nnvm/include/nnvm/top/nn.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,77 @@ struct Conv2DParam : public dmlc::Parameter<Conv2DParam> {
static const constexpr int kBias = 2;
};

// Parameters for the winograd weight-transform operator, which pre-computes
// the winograd-domain form of a convolution weight tensor.
struct WinogradWeightTransformParam : public dmlc::Parameter<WinogradWeightTransformParam> {
  int tile_size;  // winograd output tile size (see describe() below)

  DMLC_DECLARE_PARAMETER(WinogradWeightTransformParam) {
    DMLC_DECLARE_FIELD(tile_size)
      .describe("Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)");
  }

  // Input index of the weight tensor.
  static const constexpr int kWeight = 0;
};

// Parameters for 2D convolution computed via the winograd algorithm.
// Mirrors Conv2DParam (see above) with an extra `tile_size` field matching
// WinogradWeightTransformParam, so the conv op can consume pre-transformed
// weights produced by the weight-transform operator.
struct WinogradConv2DParam : public dmlc::Parameter<WinogradConv2DParam> {
  int channels;               // number of output channels
  TShape kernel_size;         // convolution window dimensions
  TShape strides;             // convolution strides
  TShape padding;             // implicit zero padding on both sides
  TShape dilation;            // dilation rate for dilated convolution
  int groups;                 // grouped-convolution group count
  std::string layout;         // input data layout, e.g. "NCHW"
  std::string kernel_layout;  // weight layout, e.g. "OIHW"
  std::string out_layout;     // output layout; "__undef__" means same as input
  int out_dtype;              // output dtype; -1 means same as input
  bool use_bias;              // whether a bias vector is added
  int tile_size;              // winograd output tile size

  DMLC_DECLARE_PARAMETER(WinogradConv2DParam) {
    DMLC_DECLARE_FIELD(channels)
      .describe("The dimensionality of the output space"
                "i.e. the number of output channels in the convolution.");
    DMLC_DECLARE_FIELD(kernel_size)
      .describe("Specifies the dimensions of the convolution window.");
    DMLC_DECLARE_FIELD(strides).set_default(TShape({1, 1}))
      .describe("Specifies the strides of the convolution.");
    DMLC_DECLARE_FIELD(padding).set_default(TShape({0, 0}))
      .describe("If padding is non-zero, then the input is implicitly zero-padded"
                "on both sides for padding number of points");
    DMLC_DECLARE_FIELD(dilation).set_default(TShape({1, 1}))
      .describe("Specifies the dilation rate to use for dilated convolution.");
    DMLC_DECLARE_FIELD(groups).set_default(1)
      .describe("Controls the connections between inputs and outputs."
                "At groups=1, all inputs are convolved to all outputs."
                "At groups=2, the operation becomes equivalent to having two convolution"
                "layers side by side, each seeing half the input channels, and producing"
                "half the output channels, and both subsequently concatenated.");
    DMLC_DECLARE_FIELD(layout).set_default("NCHW")
      .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
                "dimensions respectively. Convolution is applied on the 'H' and"
                "'W' dimensions.");
    DMLC_DECLARE_FIELD(out_layout).set_default("__undef__")
      .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
                "dimensions respectively. Default to be same as input layout.");
    DMLC_DECLARE_FIELD(kernel_layout).set_default("OIHW")
      .describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc."
                "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
                "dimensions respectively.");
    DMLC_DECLARE_DTYPE_FIELD(out_dtype)
      .add_enum("same", -1)
      .set_default(-1)
      .describe("Output data type, set to explicit type under mixed precision setting");
    DMLC_DECLARE_FIELD(use_bias).set_default(true)
      .describe("Whether the layer uses a bias vector.");
    DMLC_DECLARE_FIELD(tile_size)
      .describe("Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)");
  }
  // constants: input indices of the operator's arguments
  static const constexpr int kData = 0;
  static const constexpr int kWeight = 1;
  static const constexpr int kBias = 2;
};

struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
int channels;
Expand Down
Loading

0 comments on commit 0354c31

Please sign in to comment.