npu knn/tnn bugfix
lizekai committed Jun 13, 2024
1 parent a65aa0f commit b5fa6db
Showing 4 changed files with 92 additions and 4 deletions.
mmcv/ops/csrc/common/pytorch_npu_util.hpp (68 additions, 0 deletions)

@@ -583,4 +583,72 @@ typedef void (*ReleaseHugeMem)(void *, bool);
} \
} while (false)

#define EXEC_NPU_CMD_SYNC(aclnn_api, ...) \
do { \
static const auto getWorkspaceSizeFuncAddr = \
GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize"); \
static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api); \
static const auto initMemAddr = \
GetOpApiFuncAddr("InitHugeMemThreadLocal"); \
static const auto unInitMemAddr = \
GetOpApiFuncAddr("UnInitHugeMemThreadLocal"); \
static const auto releaseMemAddr = GetOpApiFuncAddr("ReleaseHugeMem"); \
TORCH_CHECK( \
getWorkspaceSizeFuncAddr != nullptr && opApiFuncAddr != nullptr, \
#aclnn_api, " or ", #aclnn_api "GetWorkspaceSize", " not in ", \
GetOpApiLibName(), ", or ", GetOpApiLibName(), "not found."); \
auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \
uint64_t workspace_size = 0; \
uint64_t *workspace_size_addr = &workspace_size; \
aclOpExecutor *executor = nullptr; \
aclOpExecutor **executor_addr = &executor; \
InitHugeMemThreadLocal initMemFunc = \
reinterpret_cast<InitHugeMemThreadLocal>(initMemAddr); \
UnInitHugeMemThreadLocal unInitMemFunc = \
reinterpret_cast<UnInitHugeMemThreadLocal>(unInitMemAddr); \
if (initMemFunc) { \
initMemFunc(nullptr, false); \
} \
auto converted_params = \
ConvertTypes(__VA_ARGS__, workspace_size_addr, executor_addr); \
static auto getWorkspaceSizeFunc = \
ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr); \
auto workspace_status = call(getWorkspaceSizeFunc, converted_params); \
TORCH_CHECK(workspace_status == 0, \
"call " #aclnn_api " failed, detail:", aclGetRecentErrMsg()); \
void *workspace_addr = nullptr; \
if (workspace_size != 0) { \
at::TensorOptions options = \
at::TensorOptions(torch_npu::utils::get_npu_device_type()); \
auto workspace_tensor = \
at::empty({workspace_size}, options.dtype(kByte)); \
workspace_addr = const_cast<void *>(workspace_tensor.storage().data()); \
} \
auto acl_call = [converted_params, workspace_addr, workspace_size, \
acl_stream, executor]() -> int { \
typedef int (*OpApiFunc)(void *, uint64_t, aclOpExecutor *, \
const aclrtStream); \
OpApiFunc opApiFunc = reinterpret_cast<OpApiFunc>(opApiFuncAddr); \
auto api_ret = \
opApiFunc(workspace_addr, workspace_size, executor, acl_stream); \
TORCH_CHECK(api_ret == 0, "call " #aclnn_api " failed, detail:", \
aclGetRecentErrMsg()); \
ReleaseConvertTypes(converted_params); \
ReleaseHugeMem releaseMemFunc = \
reinterpret_cast<ReleaseHugeMem>(releaseMemAddr); \
if (releaseMemFunc) { \
releaseMemFunc(nullptr, false); \
} \
return api_ret; \
}; \
at_npu::native::OpCommand cmd; \
cmd.Name(#aclnn_api); \
cmd.SetCustomHandler(acl_call); \
cmd.Run(); \
cmd.Sync(); \
if (unInitMemFunc) { \
unInitMemFunc(nullptr, false); \
} \
} while (false)

#endif // MMCV_OPS_CSRC_COMMON_PYTORCH_NPU_UTIL_HPP_
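The new EXEC_NPU_CMD_SYNC macro follows the same two-phase aclnn protocol as the EXEC_NPU_CMD helper defined earlier in this header (resolve the *GetWorkspaceSize and execute symbols, query the workspace size, allocate a byte tensor as scratch space, then launch), but it routes the launch through at_npu::native::OpCommand and calls cmd.Sync(), so the kernel has completed before the macro returns. A minimal usage sketch, assuming a torch_npu build with the CANN aclnnKnn operator available; the helper function, include, and shapes are illustrative and not part of this commit:

```cpp
// Hedged usage sketch: run aclnnKnn through the blocking macro and consume
// its output right away, mirroring the knn_npu.cpp change below.
#include "pytorch_npu_helper.hpp"  // assumed include that pulls in the macro

at::Tensor knn_distances(const at::Tensor &xyz, const at::Tensor &new_xyz) {
  // [B, N, 3] -> [B, 3, N], the layout the distance kernel expects.
  at::Tensor source = xyz.transpose(2, 1).contiguous();
  at::Tensor target = new_xyz.contiguous();
  // Full pairwise squared-distance matrix, [B, npoint, N].
  at::Tensor dist = at::zeros(
      {target.size(0), target.size(1), source.size(2)}, target.options());
  bool is_from_knn = true;
  // Unlike EXEC_NPU_CMD, this waits for the kernel to finish, so `dist` is
  // safe to read or feed into a follow-up op as soon as the macro returns.
  EXEC_NPU_CMD_SYNC(aclnnKnn, source, target, is_from_knn, dist);
  return dist;
}
```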
mmcv/ops/csrc/pytorch/npu/knn_npu.cpp (10 additions, 2 deletions)

@@ -8,11 +8,19 @@ using namespace std;
void knn_forward_npu(int b, int n, int m, int nsample, const Tensor xyz,
const Tensor new_xyz, Tensor idx, Tensor dist2) {
// transpose known from [B, N, 3] to [B, 3, N]
- at::Tensor source = xyz.transpose(1, 2).contiguous();
+ at::Tensor source = xyz.transpose(2, 1).contiguous();
at::Tensor target = new_xyz.contiguous();

+ at::Tensor dist = at::zeros({target.sizes()[0], target.sizes()[1], source.sizes()[2]}, target.options());
bool is_from_knn = true;
- EXEC_NPU_CMD(aclnnKnn, source, target, nsample, is_from_knn, idx, dist2);
+ EXEC_NPU_CMD_SYNC(aclnnKnn, source, target, is_from_knn, dist);
+
+ idx = idx.to(at::kLong);
+ int64_t dim = 2;
+ bool largest = false;
+ bool sorted = true;
+ EXEC_NPU_CMD_SYNC(aclnnTopk, dist, nsample, dim, largest, sorted, dist2, idx);
+ idx = idx.to(at::kInt);
}

void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
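In effect, knn_forward_npu no longer asks aclnnKnn to return the nsample nearest neighbours directly: the kernel now fills a full [B, npoint, N] squared-distance matrix, and aclnnTopk (largest=false, sorted=true along dim 2) selects the nsample smallest values per query into dist2/idx, with idx round-tripped through int64, presumably because the top-k kernel emits 64-bit indices while the extension stores int32. For reference, a plain libtorch sketch of the same computation, with illustrative names and no NPU-specific calls:

```cpp
// Hedged reference sketch (plain libtorch): what the aclnnKnn + aclnnTopk
// pair above computes together. This is not the NPU dispatch path.
#include <torch/torch.h>
#include <tuple>

std::tuple<torch::Tensor, torch::Tensor> knn_reference(
    const torch::Tensor &xyz,      // [B, N, 3] candidate points
    const torch::Tensor &new_xyz,  // [B, M, 3] query points
    int64_t nsample) {
  // Pairwise squared distances between queries and candidates: [B, M, N].
  auto dist = torch::cdist(new_xyz, xyz).pow(2);
  // The nsample smallest distances per query, sorted ascending.
  auto [dist2, idx] = torch::topk(dist, nsample, /*dim=*/2,
                                  /*largest=*/false, /*sorted=*/true);
  // The op interface keeps indices as int32, hence the cast back that the
  // NPU path performs after aclnnTopk.
  return {dist2, idx.to(torch::kInt)};
}
```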
mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp (11 additions, 2 deletions)

@@ -8,17 +8,26 @@ using namespace std;
void three_nn_forward_npu(int b, int n, int m, const Tensor unknown,
const Tensor known, Tensor dist2, Tensor idx) {
// transpose known [B, N, 3] -> [B, 3, N]
- at::Tensor source = known.transpose(1, 2).contiguous();
+ at::Tensor source = known.transpose(2, 1).contiguous();
at::Tensor target = unknown.contiguous();
auto originDtype = source.scalar_type();
if (originDtype == at::kHalf) {
source = source.to(at::kFloat);
target = target.to(at::kFloat);
}

+ at::Tensor dist = at::zeros({target.sizes()[0], target.sizes()[1], source.sizes()[2]}, target.options().dtype(at::kFloat));
bool is_from_knn = false;
+ EXEC_NPU_CMD_SYNC(aclnnKnn, source, target, is_from_knn, dist);
+
+ idx = idx.to(at::kLong);
+ int64_t dim = 2;
+ bool largest = false;
+ bool sorted = true;
uint32_t nsample = 3;
- EXEC_NPU_CMD(aclnnKnn, source, target, nsample, is_from_knn, idx, dist2);
+ EXEC_NPU_CMD_SYNC(aclnnTopk, dist, nsample, dim, largest, sorted, dist2, idx);
+ idx = idx.to(at::kInt);

if (originDtype == at::kHalf) {
dist2 = dist2.to(at::kHalf);
}
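three_nn_forward_npu gets the same distance-then-topk treatment with nsample fixed at 3; the extra wrinkle is half precision, where the inputs are upcast to float32 for the distance kernel and only dist2 is cast back at the end. A small libtorch sketch of that dtype round trip, illustrative only and not the NPU dispatch path:

```cpp
// Hedged sketch of the fp16 handling used above: compute in float32,
// return the three nearest squared distances in the caller's dtype.
#include <torch/torch.h>
#include <tuple>

torch::Tensor three_nn_dist_reference(torch::Tensor unknown,   // [B, n, 3]
                                      torch::Tensor known) {   // [B, m, 3]
  const auto orig_dtype = unknown.scalar_type();
  if (orig_dtype == torch::kHalf) {
    // Mirrors the NPU path: the distance computation runs in float32.
    unknown = unknown.to(torch::kFloat);
    known = known.to(torch::kFloat);
  }
  auto dist = torch::cdist(unknown, known).pow(2);             // [B, n, m]
  auto dist2 = std::get<0>(torch::topk(dist, /*k=*/3, /*dim=*/2,
                                       /*largest=*/false, /*sorted=*/true));
  // Cast back so half-precision callers get half-precision results.
  return orig_dtype == torch::kHalf ? dist2.to(torch::kHalf) : dist2;
}
```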
mmcv/ops/knn.py (3 additions, 0 deletions)

@@ -67,6 +67,9 @@ def forward(ctx,

ext_module.knn_forward(
xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k)
+ if xyz.device.type != 'npu':
+     zeros_idx = torch.zeros(B, npoint, k, dtype=torch.int32).npu()
+     idx.where(dist2 >= 1e10, zeros_idx)
# idx shape to [B, k, npoint]
idx = idx.transpose(2, 1).contiguous()
if torch.__version__ != 'parrots':
