diff --git a/mmcv/ops/box_iou_rotated.py b/mmcv/ops/box_iou_rotated.py
index a811531d42..0f8a8e298b 100644
--- a/mmcv/ops/box_iou_rotated.py
+++ b/mmcv/ops/box_iou_rotated.py
@@ -143,10 +143,11 @@ def box_iou_rotated(bboxes1: torch.Tensor,
         bboxes1 = bboxes1 * flip_mat
         bboxes2 = bboxes2 * flip_mat
     if bboxes1.device.type == 'npu':
-        scale_mat = bboxes1.new_ones(bboxes1.shape[-1])
-        scale_mat[-1] = 1.0 / 0.01745329252
-        bboxes1 = bboxes1 * scale_mat
-        bboxes2 = bboxes2 * scale_mat
+        if (mode_flag == 1 or aligned or not clockwise):
+            scale_mat = bboxes1.new_ones(bboxes1.shape[-1])
+            scale_mat[-1] = 1.0 / 0.01745329252
+            bboxes1 = bboxes1 * scale_mat
+            bboxes2 = bboxes2 * scale_mat
     bboxes1 = bboxes1.contiguous()
     bboxes2 = bboxes2.contiguous()
     ext_module.box_iou_rotated(
diff --git a/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp b/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp
index c6e6b66478..14df358080 100644
--- a/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp
+++ b/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp
@@ -8,6 +8,14 @@ void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
 
 void box_iou_rotated_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned) {
+  if (mode_flag == 0 && aligned == false) {
+    auto trans = false;
+    auto is_clockwise = false;
+    auto need_iou = true;
+    EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes1, boxes2, trans, is_clockwise,
+                 need_iou, ious);
+    return;
+  }
   at::Tensor boxes = at::ones_like(boxes1);
   at::Tensor query_boxes = at::ones_like(boxes2);
   boxes = boxes1.transpose(0, 1).unsqueeze(0);
diff --git a/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp b/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp
new file mode 100644
index 0000000000..286f91ac8b
--- /dev/null
+++ b/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp
@@ -0,0 +1,27 @@
+#include "pytorch_npu_helper.hpp"
+
+using namespace NPU_NAME_SPACE;
+using namespace std;
+
+constexpr int32_t N_IDX = 0;
+constexpr int32_t BOX_DIM = 7;
+
+void iou3d_boxes_overlap_bev_forward_npu(const int num_a, const Tensor boxes_a,
+                                         const int num_b, const Tensor boxes_b,
+                                         Tensor ans_overlap) {
+  TORCH_CHECK((boxes_a.sizes()[1] == BOX_DIM),
+              "Input boxes shape should be (N, 7)");
+  TORCH_CHECK((boxes_b.sizes()[1] == BOX_DIM),
+              "Input boxes shape should be (N, 7)");
+
+  auto trans = false;
+  auto is_clockwise = false;
+  auto need_iou = false;
+  EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes_a, boxes_b, trans, is_clockwise, need_iou, area_overlap);
+}
+
+void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
+                                          const int num_b, const Tensor boxes_b,
+                                          Tensor ans_overlap);
+
+REGISTER_NPU_IMPL(iou3d_boxes_overlap_bev_forward_impl, iou3d_boxes_overlap_bev_forward_npu);
diff --git a/mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp b/mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp
index ffd9b4c43b..2abe7c8f95 100644
--- a/mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp
+++ b/mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp
@@ -11,6 +11,11 @@ int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
                                const int max_points, const int max_voxels,
                                const int NDim = 3);
 
+void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
+                                   const std::vector<float> voxel_size,
+                                   const std::vector<float> coors_range,
+                                   const int NDim = 3);
+
 int hard_voxelize_forward_npu(const at::Tensor &points, at::Tensor &voxels,
                               at::Tensor &coors,
                               at::Tensor &num_points_per_voxel,
@@ -53,4 +58,34 @@ int hard_voxelize_forward_npu(const at::Tensor &points, at::Tensor &voxels,
   return voxel_num_int;
 }
 
+void dynamic_voxelize_forward_npu(const at::Tensor &points, at::Tensor &coors,
+                                  const std::vector<float> voxel_size,
+                                  const std::vector<float> coors_range,
+                                  const int NDim = 3) {
+  uint32_t ptsNum = points.size(0);
+  uint32_t ptsFeature = points.size(1);
+  at::Tensor ptsTrans = at::transpose(points, 0, 1);
+  double coors_min_x = coors_range[0];
+  double coors_min_y = coors_range[1];
+  double coors_min_z = coors_range[2];
+  double coors_max_x = coors_range[3];
+  double coors_max_y = coors_range[4];
+  double coors_max_z = coors_range[5];
+  double voxel_x = voxel_size[0];
+  double voxel_y = voxel_size[1];
+  double voxel_z = voxel_size[2];
+  int grid_x = std::round((coors_max_x - coors_min_x) / voxel_x);
+  int grid_y = std::round((coors_max_y - coors_min_y) / voxel_y);
+  int grid_z = std::round((coors_max_z - coors_min_z) / voxel_z);
+
+  at::Tensor tmp_coors =
+      at::zeros({3, ptsNum}, points.options().dtype(at::kInt));
+  EXEC_NPU_CMD(aclnnDynamicVoxelization, ptsTrans, coors_min_x, coors_min_y,
+               coors_min_z, voxel_x, voxel_y, voxel_z, grid_x, grid_y, grid_z,
+               tmp_coors);
+  tmp_coors.transpose_(0, 1);
+  coors.copy_(tmp_coors);
+}
+
 REGISTER_NPU_IMPL(hard_voxelize_forward_impl, hard_voxelize_forward_npu);
+REGISTER_NPU_IMPL(dynamic_voxelize_forward_impl, dynamic_voxelize_forward_npu);
diff --git a/tests/test_ops/test_voxelization.py b/tests/test_ops/test_voxelization.py
index cd01eb46e6..f62fcfb9e3 100644
--- a/tests/test_ops/test_voxelization.py
+++ b/tests/test_ops/test_voxelization.py
@@ -192,6 +192,9 @@ def test_voxelization_npu(device_type):
     points = voxel_dict['points']
 
     points = torch.tensor(points)
+    max_num_points = -1
+    dynamic_voxelization = Voxelization(voxel_size, point_cloud_range,
+                                        max_num_points)
     max_num_points = 1000
     hard_voxelization = Voxelization(voxel_size, point_cloud_range,
                                      max_num_points)
@@ -207,3 +210,15 @@ def test_voxelization_npu(device_type):
     assert np.all(coors == expected_coors)
     assert np.all(voxels == expected_voxels)
     assert np.all(num_points_per_voxel == expected_num_points_per_voxel)
+
+    # test dynamic_voxelization on npu
+    coors = dynamic_voxelization.forward(points)
+    coors = coors.cpu().detach().numpy()
+    points = points.cpu().detach().numpy()
+    for i in range(expected_voxels.shape[0]):
+        indices = _get_voxel_points_indices(points, coors, expected_voxels[i])
+        num_points_current_voxel = points[indices].shape[0]
+        assert num_points_current_voxel > 0
+        assert np.all(
+            points[indices] == expected_coors[i][:num_points_current_voxel])
+        assert num_points_current_voxel == expected_num_points_per_voxel[i]