diff --git a/mmcv/ops/csrc/pytorch/npu/ball_query_npu.cpp b/mmcv/ops/csrc/pytorch/npu/ball_query_npu.cpp
new file mode 100644
index 0000000000..9167875376
--- /dev/null
+++ b/mmcv/ops/csrc/pytorch/npu/ball_query_npu.cpp
@@ -0,0 +1,39 @@
+#include "pytorch_npu_helper.hpp"
+
+using namespace NPU_NAME_SPACE;
+using namespace std;
+
+// NPU ball query: runs the "BallQuery" operator and stores the result in idx.
+// The arguments b, n and m are not read inside this function.
+void ball_query_forward_npu(int b, int n, int m, float min_radius,
+                            float max_radius, int nsample, const Tensor new_xyz,
+                            const Tensor xyz, Tensor idx) {
+  // Widen nsample to int64_t before passing it as the sample_num attribute.
+  const int64_t sample_num = nsample;
+  // Operator inputs are axis-swapped views (shapes as noted by the author):
+  //   xyz [B, N, 3] -> [B, 3, N], new_xyz [B, M, 3] -> [M, B, 3].
+  at::Tensor points_view = xyz.transpose(1, 2);
+  at::Tensor centers_view = new_xyz.transpose(0, 1);
+
+  // Output buffer: contiguous form of idx with axes 0 and 1 swapped.
+  at::Tensor out_buf = NpuUtils::format_contiguous(idx.transpose(0, 1));
+
+  OpCommand op;
+  op.Name("BallQuery");
+  op.Input(points_view).Input(centers_view);
+  op.Output(out_buf);
+  op.Attr("min_radius", min_radius);
+  op.Attr("max_radius", max_radius);
+  op.Attr("sample_num", sample_num);
+  op.Run();
+
+  // Swap the axes back and write the result into the caller's tensor.
+  idx.copy_(NpuUtils::format_contiguous(out_buf.transpose(0, 1)));
+}
+
+void ball_query_forward_impl(int b, int n, int m, float min_radius,
+                             float max_radius, int nsample,
+                             const Tensor new_xyz, const Tensor xyz,
+                             Tensor idx);
+// Pair ball_query_forward_npu with the entry point declared just above.
+REGISTER_NPU_IMPL(ball_query_forward_impl, ball_query_forward_npu);
diff --git a/tests/test_ops/test_ball_query.py b/tests/test_ops/test_ball_query.py
index a3f6518197..25899f2e1f 100644
--- a/tests/test_ops/test_ball_query.py
+++ b/tests/test_ops/test_ball_query.py
@@ -3,7 +3,7 @@
 import torch
 
 from mmcv.ops import ball_query
-from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_NPU_AVAILABLE
 
 
 @pytest.mark.parametrize('device', [
@@ -14,7 +14,11 @@
     pytest.param(
         'mlu',
         marks=pytest.mark.skipif(
-            not IS_MLU_AVAILABLE, reason='requires MLU support'))
+            not IS_MLU_AVAILABLE, reason='requires MLU support')),
+    pytest.param(
+        'npu',
+        marks=pytest.mark.skipif(
+            not IS_NPU_AVAILABLE, reason='requires NPU support'))
 ])
 def test_ball_query(device):
     new_xyz = torch.tensor(