Skip to content

Commit

Permalink
[Refactor] Replace the MLU ops implementation with mlu-ops (#2750)
Browse files Browse the repository at this point in the history
  • Loading branch information
defei-coder committed May 18, 2023
1 parent e197eff commit 8725e68
Show file tree
Hide file tree
Showing 15 changed files with 301 additions and 6,398 deletions.
431 changes: 0 additions & 431 deletions mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu

This file was deleted.

695 changes: 0 additions & 695 deletions mmcv/ops/csrc/common/mlu/iou3d_utils.hpp

This file was deleted.

2,094 changes: 0 additions & 2,094 deletions mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu

This file was deleted.

483 changes: 0 additions & 483 deletions mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu

This file was deleted.

553 changes: 0 additions & 553 deletions mmcv/ops/csrc/common/mlu/nms_utils.hpp

This file was deleted.

493 changes: 0 additions & 493 deletions mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu

This file was deleted.

649 changes: 0 additions & 649 deletions mmcv/ops/csrc/common/mlu/voxelization_mlu_kernel.mlu

This file was deleted.

136 changes: 35 additions & 101 deletions mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,114 +10,30 @@
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/

#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

// Forward declaration of the IoU3D NMS device-kernel entry point. Only the
// prototype is visible here; the definition lives in another translation unit
// (presumably iou3d_mlu_kernel.mlu -- TODO confirm).
// The launch geometry (k_dim, k_type) and the queue are supplied by the
// caller; the remaining arguments are the input boxes, the box count, the IoU
// threshold and the caller-allocated workspace / output buffers.
void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_dram,
const int input_box_num, const float iou_threshold,
void *workspace, void *output_size, void *output);

// Shrinks the job count until each core has enough boxes to work on.
//
// While a core would receive fewer than 256 boxes (below that the real IO
// bandwidth would be very low) and at least 4 jobs remain, the job count is
// halved and the per-core box count doubled.
//
// @param use_job           initial number of jobs.
// @param box_num_per_core  initial number of boxes assigned to each core.
// @return the adjusted job count.
int selectType(uint32_t use_job, int box_num_per_core) {
  constexpr int kMinBoxesPerCore = 256;
  constexpr uint32_t kMinJobsToHalve = 4;

  for (;;) {
    // Enough work per core already: keep the current job count.
    if (box_num_per_core >= kMinBoxesPerCore) {
      break;
    }
    // Too few jobs left to halve any further.
    if (use_job < kMinJobsToHalve) {
      break;
    }
    box_num_per_core *= 2;
    use_job /= 2;
  }
  return use_job;
}
// Chooses the kernel launch dimensions and task type for the IoU3D kernel.
//
// The launch defaults to a UNION1 task spanning one cluster
// (x = cores per cluster, y = z = 1). When the device job limit is one of the
// recognised CN_KERNEL_CLASS_* values, the job count returned by selectType()
// refines that choice:
//   - fewer than 4 jobs : single-core BLOCK task,
//   - exactly 4 jobs    : the UNION1 default is kept,
//   - more than 4 jobs  : x and the task type are both set to the job count.
// For any other job limit a warning is logged and the default is kept.
//
// @param k_dim               [out] launch dimensions.
// @param k_type              [out] launch task type.
// @param core_num_per_class  neither read nor written by this function.
// @param input_box_num       number of input boxes.
// @return always CNNL_STATUS_SUCCESS.
static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
                               int &core_num_per_class,
                               const int input_box_num) {
  const uint32_t cores_per_cluster =
      torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  const uint32_t job_limit = getJobLimitCapability();

  // Ceiling division of the boxes over the job limit (evaluated in unsigned
  // arithmetic because job_limit is uint32_t).
  const int boxes_per_core = (input_box_num + job_limit - 1) / job_limit;
  const int job_count = selectType(job_limit, boxes_per_core);

  // Default launch configuration: one UNION1 task.
  k_dim->x = cores_per_cluster;
  k_dim->y = 1;
  k_dim->z = 1;
  *k_type = CNRT_FUNC_TYPE_UNION1;

  bool limit_recognised = false;
  switch (job_limit) {
    case CN_KERNEL_CLASS_BLOCK:
    case CN_KERNEL_CLASS_UNION:
    case CN_KERNEL_CLASS_UNION2:
    case CN_KERNEL_CLASS_UNION4:
    case CN_KERNEL_CLASS_UNION8:
    case CN_KERNEL_CLASS_UNION16:
      limit_recognised = true;
      break;
    default:
      break;
  }

  if (!limit_recognised) {
    LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number."
                 << " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task.";
    return CNNL_STATUS_SUCCESS;
  }

  if (job_count < 4) {
    k_dim->x = 1;
    *k_type = CNRT_FUNC_TYPE_BLOCK;
  } else if (job_count > 4) {
    k_dim->x = job_count;
    *k_type = static_cast<cnrtFunctionType_t>(job_count);
  }
  // job_count == 4: the UNION1 default assigned above already applies.

  return CNNL_STATUS_SUCCESS;
}
#include "mlu_common_helper.h"

void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
float iou_threshold) {
// dimension parameters check
TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
boxes.dim(), "D");
TORCH_CHECK(boxes.size(1) == 7,
"boxes should have 7 elements in dimension 1, got ",
boxes.size(1));

// data type check
TORCH_CHECK(
boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
"data type of boxes should be Float or Half, got ", boxes.scalar_type());

if (boxes.numel() == 0) {
return;
}
const size_t max_input_num = 2147483648; // 2^31, 2G num
TORCH_CHECK(boxes.numel() < max_input_num,
"boxes.numel() should be less than 2147483648, got ",
boxes.numel());
int input_box_num = boxes.size(0);

cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
cnrtDim3_t k_dim;
cnrtJobType_t k_type;

int core_num_per_class;
policyFunc(&k_dim, &k_type, core_num_per_class, input_box_num);

// transpose boxes (n, 7) to (7, n) for better performance
auto boxes_t = boxes.transpose(0, 1);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);

auto output = at::empty({input_box_num}, boxes.options().dtype(at::kLong));
int input_box_num = boxes.size(0);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
auto output = keep.to(boxes.options().dtype(at::kInt));
auto output_size = at::empty({1}, boxes.options().dtype(at::kInt));

// workspace
const int info_num = 7; // x, y,z, dx, dy, dz,angle
size_t space_size = 0;
if (boxes.scalar_type() == at::kHalf) {
space_size = input_box_num * sizeof(int16_t) * info_num +
input_box_num * sizeof(float) + sizeof(float);
} else {
space_size = input_box_num * sizeof(float) * (info_num + 1) + sizeof(float);
}
MluOpTensorDescriptor boxes_desc, output_desc;
boxes_desc.set(boxes_);
output_desc.set(output);

auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
// workspace
size_t workspace_size = 0;
auto handle = mluOpGetCurrentHandle();
mluOpGetNmsWorkspaceSize(handle, boxes_desc.desc(), NULL, &workspace_size);
auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));

// get compute queue
auto queue = torch_mlu::getCurQueue();

auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
auto boxes_ptr = boxes_impl->cnnlMalloc();
auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
Expand All @@ -127,11 +43,29 @@ void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
auto output_size_impl = torch_mlu::getMluTensorImpl(keep_num);
auto output_size_ptr = output_size_impl->cnnlMalloc();

uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
CNLOG(INFO) << "Launch Kernel KernelIou3d<<<Union" << k_type / core_dim
<< ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelIou3d(k_dim, k_type, queue, data_type_input, boxes_ptr, input_box_num,
iou_threshold, workspace_ptr, output_size_ptr, output_ptr);
// nms desc
mluOpNmsDescriptor_t nms_desc;
const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;
const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;
const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;
const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;
const float soft_nms_sigma = 0.0;
const float confidence_threshold = 0.0;
const int input_layout = 0;
const bool pad_to_max_output_size = false;
const int max_output_size = input_box_num;
const float offset = 0.0;

mluOpCreateNmsDescriptor(&nms_desc);
mluOpSetNmsDescriptor(nms_desc, box_mode, output_mode, algo, method_mode,
iou_threshold, soft_nms_sigma, max_output_size,
confidence_threshold, offset, input_layout,
pad_to_max_output_size);

mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr, NULL, NULL,
workspace_ptr, workspace_size, output_desc.desc(), output_ptr,
output_size_ptr);
mluOpDestroyNmsDescriptor(nms_desc);
}

void iou3d_nms3d_forward_mlu(const Tensor boxes, Tensor &keep, Tensor &keep_num,
Expand Down
4 changes: 2 additions & 2 deletions mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
#include "pytorch_device_registry.hpp"

#define MLUOP_MAJOR 0
#define MLUOP_MINOR 5
#define MLUOP_PATCHLEVEL 302
#define MLUOP_MINOR 6
#define MLUOP_PATCHLEVEL 0

mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);
mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);
Expand Down
Loading

0 comments on commit 8725e68

Please sign in to comment.