[Refactor] Replace the MLU ops implementation with mlu-ops (#2750)

open-mmlab · May 18, 2023 · 8725e68 · 8725e68
1 parent e197eff
commit 8725e68
Show file tree

Hide file tree

Showing 15 changed files with 301 additions and 6,398 deletions.
diff --git a/mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu
diff --git a/mmcv/ops/csrc/common/mlu/iou3d_utils.hpp b/mmcv/ops/csrc/common/mlu/iou3d_utils.hpp
diff --git a/mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu
diff --git a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
diff --git a/mmcv/ops/csrc/common/mlu/nms_utils.hpp b/mmcv/ops/csrc/common/mlu/nms_utils.hpp
diff --git a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
diff --git a/mmcv/ops/csrc/common/mlu/voxelization_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/voxelization_mlu_kernel.mlu
diff --git a/mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp
@@ -10,114 +10,30 @@
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *************************************************************************/
 
-#include "pytorch_device_registry.hpp"
-#include "pytorch_mlu_helper.hpp"
-
-void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-                 const cnrtDataType_t data_type_input, const void *boxes_dram,
-                 const int input_box_num, const float iou_threshold,
-                 void *workspace, void *output_size, void *output);
-
-int selectType(uint32_t use_job, int box_num_per_core) {
-  // the box_num_per_core should be at least 256, otherwise the real IO
-  // bandwidth would be very low
-  while (box_num_per_core < 256 && use_job >= 4) {
-    box_num_per_core *= 2;
-    use_job /= 2;
-  }
-  return use_job;
-}
-static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
-                               int &core_num_per_class,
-                               const int input_box_num) {
-  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
-  uint32_t job_limit = getJobLimitCapability();
-  uint32_t core_number = job_limit;
-
-  int box_num_per_core = (input_box_num + core_number - 1) / core_number;
-  int use_job = selectType(job_limit, box_num_per_core);
-  // initiate k_type as Union1
-  k_dim->x = core_dim;
-  k_dim->y = 1;
-  k_dim->z = 1;
-  *k_type = CNRT_FUNC_TYPE_UNION1;
-  switch (job_limit) {
-    case CN_KERNEL_CLASS_BLOCK:
-    case CN_KERNEL_CLASS_UNION:
-    case CN_KERNEL_CLASS_UNION2:
-    case CN_KERNEL_CLASS_UNION4:
-    case CN_KERNEL_CLASS_UNION8:
-    case CN_KERNEL_CLASS_UNION16: {
-      if (use_job < 4) {
-        k_dim->x = 1;
-        *k_type = CNRT_FUNC_TYPE_BLOCK;
-      } else if (use_job == 4) {
-        k_dim->x = core_dim;
-        *k_type = CNRT_FUNC_TYPE_UNION1;
-      } else {
-        k_dim->x = use_job;
-        *k_type = (cnrtFunctionType_t)use_job;
-      }
-    }; break;
-    default:
-      LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number."
-                   << " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task.";
-  }
-  return CNNL_STATUS_SUCCESS;
-}
+#include "mlu_common_helper.h"
 
 void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
                                  float iou_threshold) {
-  // dimension parameters check
-  TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
-              boxes.dim(), "D");
-  TORCH_CHECK(boxes.size(1) == 7,
-              "boxes should have 7 elements in dimension 1, got ",
-              boxes.size(1));
-
-  // data type check
-  TORCH_CHECK(
-      boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
-      "data type of boxes should be Float or Half, got ", boxes.scalar_type());
-
   if (boxes.numel() == 0) {
     return;
   }
-  const size_t max_input_num = 2147483648;  // 2^31, 2G num
-  TORCH_CHECK(boxes.numel() < max_input_num,
-              "boxes.numel() should be less than 2147483648, got ",
-              boxes.numel());
-  int input_box_num = boxes.size(0);
-
-  cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
-  cnrtDim3_t k_dim;
-  cnrtJobType_t k_type;
-
-  int core_num_per_class;
-  policyFunc(&k_dim, &k_type, core_num_per_class, input_box_num);
 
-  // transpose boxes (n, 7) to (7, n) for better performance
-  auto boxes_t = boxes.transpose(0, 1);
-  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);
-
-  auto output = at::empty({input_box_num}, boxes.options().dtype(at::kLong));
+  int input_box_num = boxes.size(0);
+  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
+  auto output = keep.to(boxes.options().dtype(at::kInt));
   auto output_size = at::empty({1}, boxes.options().dtype(at::kInt));
 
-  // workspace
-  const int info_num = 7;  // x, y,z, dx, dy, dz,angle
-  size_t space_size = 0;
-  if (boxes.scalar_type() == at::kHalf) {
-    space_size = input_box_num * sizeof(int16_t) * info_num +
-                 input_box_num * sizeof(float) + sizeof(float);
-  } else {
-    space_size = input_box_num * sizeof(float) * (info_num + 1) + sizeof(float);
-  }
+  MluOpTensorDescriptor boxes_desc, output_desc;
+  boxes_desc.set(boxes_);
+  output_desc.set(output);
 
-  auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
+  // workspace
+  size_t workspace_size = 0;
+  auto handle = mluOpGetCurrentHandle();
+  mluOpGetNmsWorkspaceSize(handle, boxes_desc.desc(), NULL, &workspace_size);
+  auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));
 
   // get compute queue
-  auto queue = torch_mlu::getCurQueue();
-
   auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
   auto boxes_ptr = boxes_impl->cnnlMalloc();
   auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
@@ -127,11 +43,29 @@ void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
   auto output_size_impl = torch_mlu::getMluTensorImpl(keep_num);
   auto output_size_ptr = output_size_impl->cnnlMalloc();
 
-  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
-  CNLOG(INFO) << "Launch Kernel KernelIou3d<<<Union" << k_type / core_dim
-              << ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
-  KernelIou3d(k_dim, k_type, queue, data_type_input, boxes_ptr, input_box_num,
-              iou_threshold, workspace_ptr, output_size_ptr, output_ptr);
+  // nms desc
+  mluOpNmsDescriptor_t nms_desc;
+  const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;
+  const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;
+  const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;
+  const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;
+  const float soft_nms_sigma = 0.0;
+  const float confidence_threshold = 0.0;
+  const int input_layout = 0;
+  const bool pad_to_max_output_size = false;
+  const int max_output_size = input_box_num;
+  const float offset = 0.0;
+
+  mluOpCreateNmsDescriptor(&nms_desc);
+  mluOpSetNmsDescriptor(nms_desc, box_mode, output_mode, algo, method_mode,
+                        iou_threshold, soft_nms_sigma, max_output_size,
+                        confidence_threshold, offset, input_layout,
+                        pad_to_max_output_size);
+
+  mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr, NULL, NULL,
+           workspace_ptr, workspace_size, output_desc.desc(), output_ptr,
+           output_size_ptr);
+  mluOpDestroyNmsDescriptor(nms_desc);
 }
 
 void iou3d_nms3d_forward_mlu(const Tensor boxes, Tensor &keep, Tensor &keep_num,

diff --git a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
@@ -18,8 +18,8 @@
 #include "pytorch_device_registry.hpp"
 
 #define MLUOP_MAJOR 0
-#define MLUOP_MINOR 5
-#define MLUOP_PATCHLEVEL 302
+#define MLUOP_MINOR 6
+#define MLUOP_PATCHLEVEL 0
 
 mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);
 mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);