Skip to content

Commit

Permalink
[NPU] support npu profiler (PaddlePaddle#31684)
Browse files Browse the repository at this point in the history
* support npu profiler

* add python api

* fix bugs

* add wrapper for incomplete type

* update profile proto

* record npu wait

* add xpu placeholder
  • Loading branch information
zhiqiu committed Apr 15, 2021
1 parent 4668f1e commit 2b4d669
Show file tree
Hide file tree
Showing 9 changed files with 204 additions and 10 deletions.
8 changes: 5 additions & 3 deletions cmake/external/ascend.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,13 @@ if(WITH_ASCEND_CL)

set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)

message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})

ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/operators/expand_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,11 @@ class ExpandNPUKernel : public framework::OpKernel<T> {
expand_times.size(), static_cast<size_t>(in_dims.size())));
auto* out0 = context.Output<framework::LoDTensor>("Out");
framework::DDim out_dims(in_dims);

for (size_t i = 0; i < expand_times.size(); ++i) {
out_dims[i] *= expand_times[i];
}

out0->Resize(out_dims);
out0->mutable_data<T>(context.device_context().GetPlace());
auto runner =
Expand Down
3 changes: 2 additions & 1 deletion paddle/fluid/platform/device_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif

#include "glog/logging.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace memory {
Expand Down Expand Up @@ -254,6 +254,7 @@ NPUDeviceContext::~NPUDeviceContext() {
}

// Blocks the host until all work queued on this context's NPU device has
// completed, and records the wait in the host-side profiler timeline.
void NPUDeviceContext::Wait() const {
// Emit a profiler event so device-sync stalls are visible in traces.
platform::RecordEvent record_event("NPUDeviceContext/wait");
// Make `place_.device` the current device for the duration of the sync.
NPUDeviceGuard guard(place_.device);
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
}
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/platform/device_tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,8 @@ class DeviceTracerImpl : public DeviceTracer {
BOOST_GET_CONST(platform::CUDAPlace, r.place).GetDeviceId());
} else if (platform::is_cuda_pinned_place(r.place)) {
event->set_place(proto::MemEvent::CUDAPinnedPlace);
} else if (platform::is_npu_place(r.place)) {
event->set_place(proto::MemEvent::NPUPlace);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"The current place is not supported."));
Expand Down
91 changes: 91 additions & 0 deletions paddle/fluid/platform/npu_profiler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>
#include <vector>

#include "acl/acl_prof.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace platform {

// For ACL 20.1
// ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats
// ACL_AICORE_PIPELINE = 1, record pipeline
// ACL_AICORE_SYNCHRONIZATION = 2, record sync
// ACL_AICORE_MEMORY = 3, record memory
// ACL_AICORE_INTERNAL_MEMORY = 4, record internal memory
// ACL_AICORE_STALL = 5, record pipeline ratio
constexpr aclprofAicoreMetrics default_metrics =
    ACL_AICORE_ARITHMATIC_THROUGHPUT;

// ACL_PROF_ACL_API, record ACL API stats
// ACL_PROF_TASK_TIME, record AI core stats
// ACL_PROF_AICORE_METRICS, must include
// ACL_PROF_AICPU_TRACE, record AICPU, not supported yet
constexpr uint64_t default_type =
    ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS | ACL_PROF_TASK_TIME;

// NOTE: the functions below live in a header that is included from multiple
// translation units (e.g. paddle/fluid/pybind/pybind.cc), so they must be
// `inline` — non-inline definitions here violate the One Definition Rule and
// produce duplicate-symbol link errors.

// Creates an aclprofConfig for the given devices. When `devices` is empty,
// the current NPU device is profiled. The returned config is owned by the
// caller and must be released with NPUProfilerDestroyConfig() (or via
// NPUProfilerStop(), which destroys it).
inline aclprofConfig *NPUProfilerCreateConfig(
    std::vector<uint32_t> devices = {},
    aclprofAicoreMetrics metrics = default_metrics, uint64_t c = default_type,
    aclprofAicoreEvents *events = nullptr) {
  if (devices.size() == 0) {
    int device_id = GetCurrentNPUDeviceId();
    devices.emplace_back(device_id);
  }
  aclprofConfig *config =
      aclprofCreateConfig(devices.data(), devices.size(), metrics, events, c);
  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::External(
                                      "Failed to create prof config for NPU"));
  return config;
}

// Releases a config created by NPUProfilerCreateConfig().
inline void NPUProfilerDestroyConfig(const aclprofConfig *config) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofDestroyConfig(config));
}

// Initializes the ACL profiler; profiling results are written under
// `output_path` (should be an absolute path).
inline void NPUProfilerInit(std::string output_path) {
  PADDLE_ENFORCE_NPU_SUCCESS(
      aclprofInit(output_path.c_str(), output_path.size()));
}

// Starts profiling collection with `config`. If `config` is nullptr, a
// config for the current device is created internally.
// NOTE(review): an internally created config pointer is never handed back to
// the caller, so it can only be freed if the same pointer later reaches
// NPUProfilerStop — confirm callers always pass an explicit config (the
// pybind wrappers do).
inline void NPUProfilerStart(const aclprofConfig *config) {
  if (config == nullptr) {
    // NOTE(zhiqiu): support single device by default.
    int device_id = GetCurrentNPUDeviceId();
    std::vector<uint32_t> devices = {static_cast<uint32_t>(device_id)};
    config = NPUProfilerCreateConfig(devices);
  }
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStart(config));
}

// Stops profiling collection and destroys `config`. After this call the
// config pointer must not be used again.
inline void NPUProfilerStop(const aclprofConfig *config) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStop(config));
  NPUProfilerDestroyConfig(config);
}

// Finalizes the ACL profiler; counterpart of NPUProfilerInit().
inline void NPUProfilerFinalize() {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofFinalize());
}

// aclprofConfig is an incomplete type from ACL headers, so pybind11 cannot
// hold it by value; this wrapper carries the raw pointer across the Python
// boundary. It does NOT own the pointer.
struct NPUProfConfigWrapper {
  aclprofConfig *p_;
  explicit NPUProfConfigWrapper(aclprofConfig *p) : p_(p) {}
  aclprofConfig *ptr() { return p_; }
};

}  // namespace platform
}  // namespace paddle
3 changes: 3 additions & 0 deletions paddle/fluid/platform/profiler.proto
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ message Event {
enum EventType {
CPU = 0;
GPUKernel = 1;
NPUKernel = 2;
}
optional EventType type = 8;
optional string name = 1;
Expand All @@ -39,6 +40,8 @@ message MemEvent {
CUDAPlace = 0;
CPUPlace = 1;
CUDAPinnedPlace = 2;
XPUPlace = 3;
NPUPlace = 4;
}
optional uint64 start_ns = 1;
optional uint64 end_ns = 2;
Expand Down
31 changes: 26 additions & 5 deletions paddle/fluid/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/npu_profiler.h"
#endif

#ifdef PADDLE_WITH_XPU
Expand Down Expand Up @@ -581,11 +582,6 @@ PYBIND11_MODULE(core_noavx, m) {
make_ddim(x_dim), make_ddim(y_dim), -1));
});

#ifdef PADDLE_WITH_ASCEND_CL
m.def("_npu_finalize",
[]() { platform::AclInstance::Instance().Finalize(); });
#endif

m.def(
"_append_python_callable_object_and_return_id",
[](py::object py_obj) -> size_t {
Expand Down Expand Up @@ -2180,6 +2176,31 @@ All parameter, weight, gradient are variables in Paddle.
#endif
#endif

#ifdef PADDLE_WITH_ASCEND_CL
m.def("get_npu_device_count", platform::GetNPUDeviceCount);
m.def("_npu_finalize", []() {
platform::AclInstance::Instance().Finalize();
}); // private interface

py::class_<platform::NPUProfConfigWrapper>(m, "NPUProfConfigWrapper");

m.def("npu_prof_init", platform::NPUProfilerInit);
m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) {
platform::NPUProfilerStart(c.ptr());
});
m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) {
platform::NPUProfilerStop(c.ptr());
});
m.def("npu_prof_finalize", platform::NPUProfilerFinalize);
m.def("npu_prof_create_config", []() {
return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig());
});

m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) {
platform::NPUProfilerDestroyConfig(c.ptr());
});
#endif

py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
.value("kDefault", platform::TracerOption::kDefault)
.value("kOpDetail", platform::TracerOption::kOpDetail)
Expand Down
59 changes: 59 additions & 0 deletions python/paddle/fluid/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,65 @@ def cuda_profiler(output_file, output_mode=None, config=None):
os.remove(config_file)


@signature_safe_contextmanager
def npu_profiler(output_file, config=None):
    """
    The NPU profiler.

    This function is used to profile NPU program by the NPU runtime application
    programming interface. The profiling result will be written into
    `output_file`. The users can set the NPU profiling config by the `config`
    argument. After getting the profiling result file, users can use
    `tools provided by Ascend <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_
    to load this output file to visualize results.

    Args:
        output_file (str) : The output file name, the result will be
            written into this file. It should be absolute path.
        config (list<str>, optional) : NPU profile config. For more details, please
            refer to `User Guide <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_ .

    Examples:

        .. code-block:: python

            import paddle.fluid as fluid
            import paddle.fluid.profiler as profiler
            import numpy as np

            epoc = 8
            dshape = [4, 3, 28, 28]
            data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])

            place = fluid.NPUPlace(0)
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

            output_file = 'npu.txt'
            with profiler.npu_profiler(output_file) as npu_prof:
                for i in range(epoc):
                    input = np.random.random(dshape).astype('float32')
                    exe.run(fluid.default_main_program(), feed={'data': input})
            # then use NPU profiler tools to load this output file
            # to visualize results.
    """
    # TODO: support config in python.
    # Fall back to the default single-device config when none is supplied.
    if not config:
        config = core.npu_prof_create_config()

    core.npu_prof_init(output_file)
    # Enables profiler collection by the active NPU profiling tool.
    core.npu_prof_start(config)
    try:
        yield
    # Disables profiler collection.
    finally:
        # Stop/finalize even if the profiled body raised, so the ACL
        # profiler is always torn down and the result file is flushed.
        core.npu_prof_stop(config)
        core.npu_prof_finalize()


def reset_profiler():
"""
Clear the previous time record. This interface does not work for
Expand Down
15 changes: 14 additions & 1 deletion tools/timeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,13 @@ def _allocate_pids(self):
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" %
(k, mevent.device_id), pid)
elif mevent.place == profiler_pb2.MemEvent.NPUPlace:
if (k, mevent.device_id, "NPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "NPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:npu:%d" % (k, mevent.device_id),
pid)
if (k, 0, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CPU")] = pid
Expand All @@ -201,6 +208,11 @@ def _allocate_pids(self):
self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
if (k, 0, "NPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "NPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:npu:%d" %
(k, 0), pid)

def _allocate_events(self):
for k, profile_pb in six.iteritems(self._profile_dict):
Expand All @@ -227,7 +239,8 @@ def _allocate_memory_event(self):
place_to_str = {
profiler_pb2.MemEvent.CPUPlace: "CPU",
profiler_pb2.MemEvent.CUDAPlace: "GPU",
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace",
profiler_pb2.MemEvent.NPUPlace: "NPU"
}
for k, profile_pb in six.iteritems(self._profile_dict):
mem_list = []
Expand Down

0 comments on commit 2b4d669

Please sign in to comment.