diff --git a/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu index 58e695a015..0f273d2508 100644 --- a/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu @@ -88,14 +88,14 @@ __mlu_func__ void bboxOverlapsWorkflow( // right - left + offset ---> left __bang_sub(vec_left, vec_right, vec_left, batches_stride); - __bang_add_const(vec_left, vec_left, (T)offset, batches_stride); + __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride); // bottom - top + offset ---> right __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); - __bang_add_const(vec_right, vec_right, (T)offset, batches_stride); + __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride); // zero vector ---> bottom - __nramset(vec_bottom, batches_stride, 0.f); + __bang_write_value(vec_bottom, batches_stride, 0.f); // width --> vec_left __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); @@ -107,11 +107,11 @@ __mlu_func__ void bboxOverlapsWorkflow( // get the b1_area // (b1_x2 - b1_x1 + offset) ---> vec_top __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); - __bang_add_const(vec_top, vec_top, (T)offset, batches_stride); + __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride); // (b1_y2 - b1_y1 + offset) ---> vec_bottom __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); - __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); + __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride); // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) // ---> vec_top; @@ -121,11 +121,11 @@ __mlu_func__ void bboxOverlapsWorkflow( // get the b2_area // (b2_x2 - b2_x1 + offset) ---> b2_x1 __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); - __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); + __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); // (b2_y2 - b2_y1 
+ offset) ---> b2_y1 __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); - __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); + __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) // ---> b2_x1; @@ -137,7 +137,7 @@ __mlu_func__ void bboxOverlapsWorkflow( T *inter_s = height; // offset vector ---> vec_b2_y1 - __nramset(vec_b2_y1, batches_stride, T(offset)); + __bang_write_value(vec_b2_y1, batches_stride, T(offset)); T *vec_offset = vec_b2_y1; if (mode == 0) { @@ -164,10 +164,10 @@ __mlu_func__ void bboxOverlapsWorkflow( int32_t base1 = b1 * COORD_NUM; // set bbox1 and bbox2 to nram - __nramset(vec_b1_x1, batches_stride, bbox1[base1]); - __nramset(vec_b1_y1, batches_stride, bbox1[base1 + 1]); - __nramset(vec_b1_x2, batches_stride, bbox1[base1 + 2]); - __nramset(vec_b1_y2, batches_stride, bbox1[base1 + 3]); + __bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]); + __bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]); + __bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]); + __bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]); for (int32_t j = 0; j < num_loop_cpy; j++) { int32_t index2 = j * batches_stride; @@ -195,13 +195,13 @@ __mlu_func__ void bboxOverlapsWorkflow( // right - left + offset ---> left __bang_sub(vec_left, vec_right, vec_left, batches_stride); - __bang_add_const(vec_left, vec_left, (T)offset, batches_stride); + __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride); // bottom - top + offset ---> right __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); - __bang_add_const(vec_right, vec_right, (T)offset, batches_stride); + __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride); // zero vector ---> bottom - __nramset(vec_bottom, batches_stride, (T)0); + __bang_write_value(vec_bottom, batches_stride, (T)0); // width --> vec_left __bang_maxequal(vec_left, vec_bottom, vec_left, 
batches_stride); @@ -213,10 +213,10 @@ __mlu_func__ void bboxOverlapsWorkflow( // get the b1_area // (b1_x2 - b1_x1 + offset) ---> vec_top __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); - __bang_add_const(vec_top, vec_top, (T)offset, batches_stride); + __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride); // (b1_y2 - b1_y1 + offset) ---> vec_bottom __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); - __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); + __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride); // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) // ---> vec_top; __bang_mul(vec_top, vec_top, vec_bottom, batches_stride); @@ -225,10 +225,10 @@ __mlu_func__ void bboxOverlapsWorkflow( // get the b2_area // (b2_x2 - b2_x1 + offset) ---> b2_x1 __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); - __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); + __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); // (b2_y2 - b2_y1 + offset) ---> b2_y1 __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); - __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); + __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) // ---> b2_x1; __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride); @@ -239,7 +239,7 @@ __mlu_func__ void bboxOverlapsWorkflow( T *inter_s = height; // offset vector ---> vec_b2_y1 - __nramset(vec_b2_y1, batches_stride, T(offset)); + __bang_write_value(vec_b2_y1, batches_stride, T(offset)); T *vec_offset = vec_b2_y1; if (mode == 0) { diff --git a/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu index ac5ea0d653..8dd6a8e582 100644 --- a/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu @@ -139,7 +139,7 @@ __mlu_func__ void carafeForwardBLOCK(T 
*input, T *mask, blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1; // set output_nram to zero - __nramset(output_nram, param.output_nram_size, T(0)); + __bang_write_value(output_nram, param.output_nram_size, T(0)); // loop blocks of kernel window: grid_dim.(Kh, Kw) for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) { @@ -313,8 +313,8 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask, T *sum = sum_array; for (int g = 0; g < blkSize.G; ++g) { - __bang_mul_const(sum, src, mask_array[mask_index], - param.block_Cg_NFU); + __bang_mul_scalar(sum, src, mask_array[mask_index], + param.block_Cg_NFU); // // NOTE: Since block_Cg_NFU >= block_Cg_stride, // overlapped writing may occur on sum_array. @@ -446,8 +446,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output, T *base_grad_input = (T *)grad_input + input_index; __memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T), GDRAM2NRAM); - __bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff, - ((T *)mask_buff)[mask_index], num_align); + __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff, + ((T *)mask_buff)[mask_index], num_align); __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input, (T *)grad_input_buff, num_align); __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff, @@ -485,8 +485,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output, T *base_grad_input = (T *)grad_input + input_index; __memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T), GDRAM2NRAM); - __bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff, - ((T *)mask_buff)[mask_index], rem_for_loop_align); + __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff, + ((T *)mask_buff)[mask_index], rem_for_loop_align); __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input, (T *)grad_input_buff, rem_for_loop); __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff, @@ -541,12 +541,12 @@ void KernelCarafeBackward(cnrtDim3_t k_dim, 
cnrtFunctionType_t k_type, const int wi, const int c, const int k_up, const int group, const int scale) { if (dtype == CNRT_FLOAT16) { - backward::MLUUnion1KernelCarafeBackward - <<>>(input, mask, grad_output, grad_input, - grad_mask, n, hi, wi, c, k_up, group, scale); + backward::MLUUnion1KernelCarafeBackward<<>>( + input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up, + group, scale); } else { - backward::MLUUnion1KernelCarafeBackward - <<>>(input, mask, grad_output, grad_input, - grad_mask, n, hi, wi, c, k_up, group, scale); + backward::MLUUnion1KernelCarafeBackward<<>>( + input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up, + group, scale); } } diff --git a/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp index e59099ae8f..e372515985 100644 --- a/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp +++ b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp @@ -211,51 +211,52 @@ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src, // get sign bit const float move_23bit = 8388608.0; // 0x80000000 = 1,000000000,0000000000000000000000000000 - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x80000000); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000000); __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); // get 1 or 0 from sign bit // judg is Odd - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x00000001); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); __bang_cycle_bor((char *)dst_addition, (char *)dst_addition, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x80000001); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000001); 
__bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); // minus xor, positive num invariant - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0xffffffff); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); __bang_cycle_mul(dst, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float)); // convert int32 to float32 - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x7fffff); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x7fffff); __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x4b000000); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x4b000000); __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); - __bang_sub_const(dst, dst, move_23bit, src_count); + __bang_sub_scalar(dst, dst, move_23bit, src_count); // add one __bang_add(dst, dst, dst_addition, src_count); // set sign for float32 - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0xffffffff); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x00000001); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x80000000); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / 
sizeof(float), + 0x80000000); __bang_cycle_band((char *)dst_addition, (char *)dst_addition, (char *)src_addition, src_count * 4, 128); __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4); @@ -291,18 +292,20 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, // dst_addition = abs(src) __bang_mul(dst_addition, src, (float *)dst, src_count); // if dst_addition < 1.0 , then src_addition + 1, to fix add error. - __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 1.0f); + __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 1.0f); __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count); - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0xbf800000); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xbf800000); // set negative flag -1.0 = 0xbf80000 __bang_cycle_eq( (float *)dst, (float *)dst, (float *)src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); // to mark all src in [x<-1.0] __bang_active_abs(dst_addition, src, src_count); - __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 8388608.0f); + __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 8388608.0f); // mask shift move 23 __bang_cycle_add_tz( dst_addition, dst_addition, src_addition, src_count, @@ -314,12 +317,12 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, // to fix max value // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0, // means max value. 
- __bang_mul_const((float *)dst, (float *)dst, 16777215.0, src_count); + __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count); __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst, src_count * floatDchar); // get low 23bit - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - (unsigned)0x007fffff); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + (unsigned)0x007fffff); // mask low 23bit is 1 __bang_cycle_band((char *)dst_addition, (char *)dst_addition, (char *)src_addition, src_count * floatDchar, @@ -327,16 +330,36 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, // set 9 high bit ===> dst // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000 // 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000 - __nramset(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); + __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); // src or dst_addition __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition, src_count * floatDchar); - __bang_mul_const((float *)dst, (float *)dst, -2.0, src_count); + __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count); __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * floatDchar); #endif // __BANG_ARCH__ >= 300 } +/*! + * @brief Converts float32 to half data type, + * the rounding mode on MLU200 is rd, on MLU300 is rn. + * + * @param[out] dst + * Pointer to NRAM that stores half type data. + * @param[in] src + * Pointer to NRAM that stores float32 type data. + * @param[in] src_count + * The count of elements in src. 
+ */ +__mlu_func__ inline void convertFloat2half(half *dst, float *src, + int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_float2half_rn(dst, src, src_count); +#else + __bang_float2half_rd(dst, src, src_count); +#endif +} + #endif // COMMON_MLU_HELPER_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu index 7cb16bb100..fb6185048a 100644 --- a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu @@ -9,14 +9,9 @@ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *************************************************************************/ -#include "common_mlu_helper.hpp" +#include "nms_utils.hpp" -#define NMS_SIZE (64) #define COORD_DIM (4) -#define MEMORY_CORE (0x80) -#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score -#define REDUCE_NUM \ - (7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input) #define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024) #define SIZE_SRAM_BUF (MAX_SRAM_SIZE) @@ -24,348 +19,129 @@ __nram__ int8_t nram_buffer[SIZE_NRAM_BUF]; __mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF]; -__mlu_func__ void pvLock() { -#if __BANG_ARCH__ == 270 - if (coreId != MEMORY_CORE) { - __bang_lock(0, 0); - } -#endif -} - -__mlu_func__ void pvUnlock() { -#if __BANG_ARCH__ == 270 - if (coreId != MEMORY_CORE) { - __bang_unlock(0, 0); - } -#endif -} - enum Addr { SRAM, GDRAM }; template __mlu_func__ void nms_detection( - uint32_t *output_box_num, const int output_mode, const int input_layout, - OUT_DT *output_data, const Addr dst, IN_DT *input_data_score, - const IN_DT *input_data_box, const Addr src, IN_DT *buffer, - const int buffer_size, IN_DT *sram, const int core_limit, - const int input_box_num, const int input_stride, const int output_stride, - const int keepNum, const float thresh_iou, const float thresh_score, + uint32_t &output_box_num, const int 
output_mode, OUT_DT *output_dram, + IN_DT *input_data_score, const IN_DT *input_data_box, const Addr input_ram, + IN_DT *sram, const int core_limit, const int input_num_boxes, + const int max_output_size, const float thresh_iou, const float thresh_score, const float offset, const int algo) { - // global value, it is stored in sram with a offset from the begin. - const int flag_offset_size = 28; - int32_t *loop_end_flag = (int32_t *)(sram + flag_offset_size); - loop_end_flag[0] = 0; + // global value + int32_t *exit_flag = (int32_t *)(sram + 28); + exit_flag[0] = 0; // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 - const int nms_buffer_count1 = 9; + int nms_buffer_count1 = 9; // temp nram buffer to store selected target. - const int nram_save_limit_count = 256; + int nram_save_limit_count = 256; float div_thresh_iou = 1.0 / thresh_iou; // input data ptr - IN_DT *input_score_ptr; - const IN_DT *input_x1_ptr; - const IN_DT *input_y1_ptr; - const IN_DT *input_x2_ptr; - const IN_DT *input_y2_ptr; - input_score_ptr = input_data_score; - input_x1_ptr = input_data_box; - if (input_layout == 0) { - // [boxes_num, 4] - input_y1_ptr = input_x1_ptr + 1; - input_x2_ptr = input_x1_ptr + 2; - input_y2_ptr = input_x1_ptr + 3; - } else if (input_layout == 1) { - // [4, boxes_num] - input_y1_ptr = input_x1_ptr + input_stride; - input_x2_ptr = input_y1_ptr + input_stride; - input_y2_ptr = input_x2_ptr + input_stride; - } - - // nram data ptr - IN_DT *x1; - IN_DT *y1; - IN_DT *x2; - IN_DT *y2; - IN_DT *score; - IN_DT *inter_x1; - IN_DT *inter_y1; - IN_DT *inter_x2; - IN_DT *inter_y2; - IN_DT *max_box; // the max score, x1, y1, x2, y2 - IN_DT *x1_mask; - IN_DT *y1_mask; - IN_DT *x2_mask; - IN_DT *y2_mask; - OUT_DT *nram_save; + const IN_DT *input_x1_ptr = input_data_box; + const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes; + const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes; + const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes; int limit = 0; 
// find limit when GDRAM or SRAM - int len_core = 0; // the length deal by every core int max_seg_pad = 0; // the max length every repeat int repeat = 0; int remain = 0; int remain_pad = 0; int input_offset = 0; // offset of input_data for current core int nram_save_count = 0; - // mask for collect x1, y1, x2, y2. each mask has 128 elements - const int mask_size = 128; - const int total_mask_size = 512; if (output_mode == 0) { - limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - - nram_save_limit_count * sizeof(OUT_DT) - - total_mask_size * sizeof(IN_DT)) / + limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * sizeof(OUT_DT)) / (nms_buffer_count1 * sizeof(IN_DT)); } else { - limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - - nram_save_limit_count * INFO_NUM * sizeof(OUT_DT) - - total_mask_size * sizeof(IN_DT)) / + // 5 maens: score, x1, y1, x2, y2 + limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * 5 * sizeof(OUT_DT)) / (nms_buffer_count1 * sizeof(IN_DT)); } - if (core_limit == 1) { - len_core = input_box_num; - input_offset = 0; - } else { - int avg_core = input_box_num / core_limit; - int rem = input_box_num % core_limit; - len_core = avg_core + (taskId < rem ? 1 : 0); - input_offset = avg_core * taskId + (taskId <= rem ? 
taskId : rem); - } - max_seg_pad = PAD_DOWN(limit, NMS_SIZE); - repeat = len_core / max_seg_pad; - remain = len_core % max_seg_pad; - remain_pad = PAD_UP(remain, NMS_SIZE); + int max_seg_iou_compute = 0; + int repeat_iou_compute = 0; + int remain_iou_compute = 0; + int remain_pad_iou_compute = 0; - // if datatype is half, we should convert it to float when compute the IoU - int max_seg_iou_compute = - PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE); - int repeat_iou_compute = len_core / max_seg_iou_compute; - int remain_iou_compute = len_core % max_seg_iou_compute; - int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE); - // initial the address point - score = buffer; - x1 = score + max_seg_pad; - y1 = x1 + max_seg_pad; - x2 = y1 + max_seg_pad; - y2 = x2 + max_seg_pad; - inter_x1 = y2 + max_seg_pad; - inter_y1 = inter_x1 + max_seg_pad; - inter_x2 = inter_y1 + max_seg_pad; - inter_y2 = inter_x2 + max_seg_pad; - x1_mask = inter_y2 + max_seg_pad; - y1_mask = x1_mask + mask_size; - x2_mask = y1_mask + mask_size; - y2_mask = x2_mask + mask_size; - max_box = y2_mask + mask_size; // the max score, x1, y1, x2, y2 - // offset two line from max_box - nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE); + getComputeParamsBlockOrU1(sizeof(IN_DT), input_num_boxes, limit, core_limit, + input_offset, max_seg_pad, repeat, remain, + remain_pad, max_seg_iou_compute, repeat_iou_compute, + remain_iou_compute, remain_pad_iou_compute); - // set mask for __bang_collect instruction - if (input_layout == 0) { - __nramset((IN_DT *)x1_mask, total_mask_size, (IN_DT)0); - for (int idx = 0; idx < mask_size; idx++) { - int index = (idx % COORD_DIM) * mask_size + idx; - x1_mask[index] = (IN_DT)1.0; - } - } + // init the data ptr + IN_DT *score = (IN_DT *)nram_buffer; + IN_DT *x1 = score + max_seg_pad; + IN_DT *y1 = x1 + max_seg_pad; + IN_DT *x2 = y1 + max_seg_pad; + IN_DT *y2 = x2 + max_seg_pad; + IN_DT *inter_x1 = y2 + max_seg_pad; + IN_DT *inter_y1 = inter_x1 
+ max_seg_pad; + IN_DT *inter_x2 = inter_y1 + max_seg_pad; + IN_DT *inter_y2 = inter_x2 + max_seg_pad; + IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2 + OUT_DT *nram_save = + (OUT_DT *)((char *)max_box + + NFU_ALIGN_SIZE); // offset two line from max_box - for (int keep = 0; keep < keepNum; keep++) { // loop until the max_score <= 0 +#if __BANG_ARCH__ >= 300 + float max_box_x1 = 0; + float max_box_y1 = 0; + float max_box_x2 = 0; + float max_box_y2 = 0; +#endif + mluMemcpyDirection_t load_dir = SRAM2NRAM; + mluMemcpyDirection_t store_dir = NRAM2SRAM; + load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; + store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM; + + for (int keep = 0; keep < max_output_size; + keep++) { // loop until the max_score <= 0 if (core_limit != 1) { __sync_cluster(); // sync before current loop } - /******find max start******/ + /******FIND MAX START******/ int max_index = 0; // the max score index int global_max_index = 0; // for U1 - float max_area = 0; // the max score area + float max_area = 0; // the max socre area max_box[0] = 0; // init 0 - - for (int i = 0; i <= repeat; i++) { - if (i == repeat && remain == 0) { - break; - } - int seg_len = 0; // the length every nms compute - int cpy_len = 0; // the length every nms memcpy - i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad; - // check seg_len exceeds the limit of fp16 or not. 65536 is the largest - // num that half data type could express. - if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) { - // seg length exceeds the max num for fp16 datatype! - return; - } - i == repeat ? 
cpy_len = remain : cpy_len = max_seg_pad; - /******nms load start******/ - mluMemcpyDirection_t load_dir = SRAM2NRAM; - if (src == SRAM) { - load_dir = SRAM2NRAM; - } else { - load_dir = GDRAM2NRAM; - } - __nramset(score, seg_len, (IN_DT)0); - __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - - /******nms load end******/ - - __bang_max(inter_x1, score, seg_len); - if (inter_x1[0] > max_box[0]) { - max_box[0] = inter_x1[0]; - - if (sizeof(IN_DT) == sizeof(half)) { - max_index = ((uint16_t *)inter_x1)[1] + input_offset + - i * max_seg_pad; // offset start from head of input_data - } else if (sizeof(IN_DT) == sizeof(float)) { - max_index = ((uint32_t *)inter_x1)[1] + input_offset + - i * max_seg_pad; // offset start from head of input_data - } - } - } // for repeat - - int stride = 1; - if (input_layout == 0) { - stride = input_stride; - } else if (input_layout == 1) { - stride = 1; - } + findCoreMaxBox(input_data_score, score, inter_x1, max_box, input_x1_ptr, + input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir, + input_offset, repeat, remain, remain_pad, max_seg_pad, + max_index); if (core_limit == 1) { - max_box[1] = input_x1_ptr[max_index * stride]; - max_box[2] = input_y1_ptr[max_index * stride]; - max_box[3] = input_x2_ptr[max_index * stride]; - max_box[4] = input_y2_ptr[max_index * stride]; - if (algo == 0 || offset == 0.0) { - max_area = ((float)max_box[3] - (float)max_box[1]) * - ((float)max_box[4] - (float)max_box[2]); - } else { - max_area = ((float)max_box[3] - (float)max_box[1] + offset) * - ((float)max_box[4] - (float)max_box[2] + offset); - } - input_score_ptr[max_index] = 0; +#if __BANG_ARCH__ >= 300 + calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1, + max_box_x2, max_box_y2); +#else + calMaxArea(max_box, algo, offset, max_area); +#endif + input_data_score[max_index] = 0; global_max_index = max_index; - ((uint32_t *)(max_box 
+ INFO_NUM))[0] = max_index; } else if (core_limit == 4) { - // find the max with sram - // the max box's x1, y1, x2, y2 on every core - if (coreId != MEMORY_CORE) { - max_box[1] = input_x1_ptr[max_index * stride]; - max_box[2] = input_y1_ptr[max_index * stride]; - max_box[3] = input_x2_ptr[max_index * stride]; - max_box[4] = input_y2_ptr[max_index * stride]; - } - ((uint32_t *)(max_box + INFO_NUM))[0] = max_index; - // copy every core's box info to sram, form: score---x1---y1---x2---y2--- - for (int i = 0; i < INFO_NUM; i++) { - __memcpy(sram + i * core_limit + taskId, max_box + i, 1 * sizeof(IN_DT), - NRAM2SRAM); - } - // copy every core's max_index to sram, use 2 half to store max_index - __memcpy(sram + INFO_NUM * core_limit + taskId * 2, max_box + INFO_NUM, - sizeof(uint32_t), - NRAM2SRAM); // int32_t datatype __sync_cluster(); + findClusterMaxBox(sram, max_box, inter_x1, input_data_score, core_limit); - // copy score from sram to nram and find the max - __nramset(inter_x1, NMS_SIZE, (IN_DT)0); - __memcpy(inter_x1, sram, core_limit * sizeof(IN_DT), SRAM2NRAM); - __bang_max(max_box, inter_x1, NMS_SIZE); - int max_core = 0; - if (sizeof(IN_DT) == sizeof(half)) { - max_core = ((uint16_t *)max_box)[1]; - } else if (sizeof(IN_DT) == sizeof(float)) { - max_core = ((uint32_t *)max_box)[1]; - } - - // copy the max box from SRAM to NRAM - __memcpy(max_box + 1, sram + 1 * core_limit + max_core, 1 * sizeof(IN_DT), - SRAM2NRAM); // x1 - __memcpy(max_box + 2, sram + 2 * core_limit + max_core, 1 * sizeof(IN_DT), - SRAM2NRAM); // y1 - __memcpy(max_box + 3, sram + 3 * core_limit + max_core, 1 * sizeof(IN_DT), - SRAM2NRAM); // x2 - __memcpy(max_box + 4, sram + 4 * core_limit + max_core, 1 * sizeof(IN_DT), - SRAM2NRAM); // y2 - __memcpy(max_box + 5, sram + 5 * core_limit + 2 * max_core, - sizeof(uint32_t), SRAM2NRAM); - if (algo == 0 || offset == 0.0) { - max_area = ((float)max_box[3] - (float)max_box[1]) * - ((float)max_box[4] - (float)max_box[2]); - } else { - max_area = 
((float)max_box[3] - (float)max_box[1] + offset) * - ((float)max_box[4] - (float)max_box[2] + offset); - } - global_max_index = ((uint32_t *)(max_box + INFO_NUM))[0]; - input_score_ptr[global_max_index] = 0; +#if __BANG_ARCH__ >= 300 + calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1, + max_box_x2, max_box_y2); +#else + calMaxArea(max_box, algo, offset, max_area); +#endif + global_max_index = ((uint32_t *)(max_box + 5))[0]; + input_data_score[global_max_index] = 0; } // by now, we get: max_score|max_index|max_box|max_area - /******find max end******/ - - /******nms store start******/ - // store to nram - if (float(max_box[0]) > thresh_score) { - OUT_DT *save_ptr; - int save_offset = 0; - int save_str_num = 0; - save_ptr = nram_save; - save_offset = nram_save_count; - save_str_num = nram_save_limit_count; - if (coreId == 0) { - if (output_mode == 0) { // index1, index2, ... - __memcpy(save_ptr + save_offset, (uint32_t *)(max_box + INFO_NUM), - 1 * sizeof(uint32_t), NRAM2NRAM, 1 * sizeof(uint32_t), - 1 * sizeof(uint32_t), 0); - } else if (output_mode == 1) { // score, x1, y1, x2, y2 - __memcpy(save_ptr + save_offset * INFO_NUM, max_box, - INFO_NUM * sizeof(IN_DT), NRAM2NRAM, - INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0); - } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- - __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), - NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), - 4); - } - } - nram_save_count++; - (*output_box_num)++; - } + /******FIND MAX END******/ - // store to sram/gdram - if (*output_box_num != 0) { - mluMemcpyDirection_t store_dir = NRAM2GDRAM; - if (dst == SRAM) { - store_dir = NRAM2SRAM; - } else { // dst == GDRAM - store_dir = NRAM2GDRAM; - } - if ((nram_save_count == nram_save_limit_count) || - (float(max_box[0]) <= thresh_score) || keep == keepNum - 1) { - if (nram_save_count != 0) { - if (coreId == 0) { - if (output_mode == 0) { // index1, index2, ... 
- pvLock(); - __memcpy(output_data, nram_save, - nram_save_count * sizeof(uint32_t), store_dir); - pvUnlock(); - output_data += nram_save_count; - } else if (output_mode == 1) { // score, x1, y1, x2, y2 - pvLock(); - __memcpy(output_data, nram_save, - nram_save_count * INFO_NUM * sizeof(IN_DT), store_dir); - pvUnlock(); - output_data += nram_save_count * INFO_NUM; - } else if (output_mode == - 2) { // score---, x1---, y1---, x2---, y2--- - pvLock(); - __memcpy(output_data, nram_save, nram_save_count * sizeof(IN_DT), - store_dir, output_stride * sizeof(IN_DT), - nram_save_limit_count * sizeof(IN_DT), 4); - pvUnlock(); - output_data += nram_save_count; - } - nram_save_count = 0; - } - } - } // if move data nram->sram/gdram - } // if dst + storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count, + max_output_size, thresh_score, output_mode, nram_save_count, + output_box_num); // if the max score <= 0, end if (core_limit == 1) { @@ -375,190 +151,40 @@ __mlu_func__ void nms_detection( } else { if (float(max_box[0]) <= thresh_score) { if (coreId == 0) { - loop_end_flag[0] = 1; + exit_flag[0] = 1; } } __sync_cluster(); - if (loop_end_flag[0] == 1) { + if (exit_flag[0] == 1) { break; } } - /******nms store end******/ - - // To solve half data accuracy, we convert half to float to calculate IoU. - for (int i = 0; i <= repeat_iou_compute; i++) { - if (i == repeat_iou_compute && remain_iou_compute == 0) { - break; - } - int seg_len = 0; // the length every nms compute - int cpy_len = 0; // the length every nms memcpy - i == repeat_iou_compute ? seg_len = remain_pad_iou_compute - : seg_len = max_seg_iou_compute; - i == repeat_iou_compute ? 
cpy_len = remain_iou_compute - : cpy_len = max_seg_iou_compute; - - /******nms load start******/ - mluMemcpyDirection_t load_dir = SRAM2NRAM; - if (src == SRAM) { - load_dir = SRAM2NRAM; - } else { - load_dir = GDRAM2NRAM; - } - - __nramset((float *)score, seg_len, 0.0f); - int dt_offset = 0; - if (sizeof(IN_DT) == sizeof(float)) { - __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - dt_offset = 0; - } else if (sizeof(IN_DT) == sizeof(half)) { - __nramset(x1, seg_len, half(0)); - __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - __bang_half2float((float *)score, (half *)x1, seg_len); - dt_offset = max_seg_iou_compute; - } - - if (input_layout == 0) { - // the following number 4 means x1, y1, x2, y2 - __memcpy( - inter_x1, - input_x1_ptr + (input_offset + i * max_seg_iou_compute) * COORD_DIM, - cpy_len * COORD_DIM * sizeof(IN_DT), load_dir, - cpy_len * COORD_DIM * sizeof(IN_DT), - cpy_len * COORD_DIM * sizeof(IN_DT), 0); - // here use collect instruction to transpose the [n, 4] shape into [4, - // n] shape to avoid - // discrete memory accessing. 
- for (int c_i = 0; c_i < COORD_DIM * seg_len / mask_size; c_i++) { - // the following number 32 means 32 elements will be selected out by - // once operation - __bang_collect(x1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, - x1_mask, mask_size); - __bang_collect(y1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, - y1_mask, mask_size); - __bang_collect(x2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, - x2_mask, mask_size); - __bang_collect(y2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, - y2_mask, mask_size); - } - } else if (input_layout == 1) { - __memcpy(x1 + dt_offset, - input_x1_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - __memcpy(y1 + dt_offset, - input_y1_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - __memcpy(x2 + dt_offset, - input_x2_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - __memcpy(y2 + dt_offset, - input_y2_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - } - /******nms load end******/ - - /******nms compute start******/ - if (sizeof(IN_DT) == sizeof(half)) { - __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, - seg_len); - } - // 1、 compute IOU - // get the area_I - __nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1 - __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, - seg_len); // inter_x1 - __nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2 - 
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2, - seg_len); // inter_x2 - __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len); - } - __bang_active_relu((float *)inter_x1, (float *)inter_x1, - seg_len); // inter_w - __nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1 - __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2, - seg_len); // inter_y1 - __nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2 - __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, - seg_len); // inter_y2 - __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); - } - __bang_active_relu((float *)inter_y1, (float *)inter_y1, - seg_len); // inter_h - __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, - seg_len); // area_I - // get the area of input_box: area = (x2 - x1) * (y2 - y1); - __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); - __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); - __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len); - } - __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, - seg_len); // area - // get the area_U: area + max_area - area_I - __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area), - seg_len); - __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, - seg_len); // area_U - // 2、 select the box - // if IOU greater than thres, set the score to zero, abort it: area_U > - // area_I * (1 / thresh)? 
- if (thresh_iou > 0.0) { - __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou, - seg_len); - } else { - __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou, - seg_len); - } - __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, - seg_len); - __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); - /******nms compute end******/ - - // update the score - mluMemcpyDirection_t update_dir = NRAM2SRAM; - if (dst == SRAM) { - update_dir = NRAM2SRAM; - } else { - update_dir = NRAM2GDRAM; - } - if (sizeof(IN_DT) == sizeof(half)) { - __bang_float2half_rd((half *)score, (float *)score, seg_len); - } - pvLock(); - __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, - cpy_len * sizeof(IN_DT), update_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - pvUnlock(); - } // for repeat - } // for keepNum +/******NMS STORE END******/ +#if __BANG_ARCH__ >= 300 + scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr, + input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, + inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box_x1, + max_box_y1, max_box_x2, max_box_y2, nram_save, + repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute, + max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou, + input_offset, offset, max_area, input_num_boxes, algo); +#else + scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr, + input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, + inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box[1], + max_box[2], max_box[3], max_box[4], nram_save, + repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute, + max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou, + input_offset, offset, max_area, input_num_boxes, algo); +#endif + } // for max_output_size } __mlu_global__ void MLUUnion1KernelNMS( const void *input_boxes, const void *input_confidence, - const int input_num_boxes, const int 
input_stride, - const int max_output_size, const float iou_threshold, - const float confidence_threshold, const int mode, const int input_layout, - void *workspace, void *result_num, void *output, + const int input_num_boxes, const int max_output_size, + const float iou_threshold, const float confidence_threshold, + const int output_mode, void *workspace, void *result_num, void *output, const cnrtDataType_t data_type_input, const float offset, const int algo) { if (data_type_input == CNRT_FLOAT16) { __memcpy(workspace, input_confidence, input_num_boxes * sizeof(half), @@ -569,82 +195,48 @@ __mlu_global__ void MLUUnion1KernelNMS( } else { } - int output_stride = max_output_size; - uint32_t result_box_num = 0; - if (mode == 0) { - uint32_t *out_data = (uint32_t *)output; - switch (data_type_input) { - default: { return; } - case CNRT_FLOAT16: { - half *boxes_data = (half *)input_boxes; - half *confi_data = (half *)workspace; - half *buffer = (half *)nram_buffer; - half *sram = (half *)sram_buffer; - - nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, - confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, - sram, taskDim, input_num_boxes, input_stride, - output_stride, max_output_size, iou_threshold, - confidence_threshold, offset, algo); - ((uint32_t *)result_num)[0] = result_box_num; - }; break; - case CNRT_FLOAT32: { - float *boxes_data = (float *)input_boxes; - float *confi_data = (float *)workspace; - float *buffer = (float *)nram_buffer; - float *sram = (float *)sram_buffer; + uint32_t output_box_num = 0; + float *score_data = (float *)workspace; + float *boxes_data = (float *)input_boxes; + float *sram = (float *)sram_buffer; - nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, - confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, - sram, taskDim, input_num_boxes, input_stride, - output_stride, max_output_size, iou_threshold, - confidence_threshold, offset, algo); - ((uint32_t *)result_num)[0] = result_box_num; - }; break; 
+ if (output_mode == 0) { + if (data_type_input == CNRT_FLOAT32) { + nms_detection(output_box_num, output_mode, (uint32_t *)output, score_data, + boxes_data, GDRAM, sram, taskDim, input_num_boxes, + max_output_size, iou_threshold, confidence_threshold, + offset, algo); + } else { + nms_detection(output_box_num, output_mode, (uint32_t *)output, + (half *)score_data, (half *)boxes_data, GDRAM, (half *)sram, + taskDim, input_num_boxes, max_output_size, iou_threshold, + confidence_threshold, offset, algo); } } else { - switch (data_type_input) { - default: { return; } - case CNRT_FLOAT16: { - half *boxes_data = (half *)input_boxes; - half *confi_data = (half *)workspace; - half *out_data = (half *)output; - half *buffer = (half *)nram_buffer; - half *sram = (half *)sram_buffer; - - nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, - confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, - sram, taskDim, input_num_boxes, input_stride, - output_stride, max_output_size, iou_threshold, - confidence_threshold, offset, algo); - ((uint32_t *)result_num)[0] = result_box_num; - }; break; - case CNRT_FLOAT32: { - float *boxes_data = (float *)input_boxes; - float *confi_data = (float *)workspace; - float *out_data = (float *)output; - float *buffer = (float *)nram_buffer; - float *sram = (float *)sram_buffer; - - nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, - confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, - sram, taskDim, input_num_boxes, input_stride, - output_stride, max_output_size, iou_threshold, - confidence_threshold, offset, algo); - ((uint32_t *)result_num)[0] = result_box_num; - }; break; + if (data_type_input == CNRT_FLOAT32) { + nms_detection(output_box_num, output_mode, (float *)output, score_data, + boxes_data, GDRAM, sram, taskDim, input_num_boxes, + max_output_size, iou_threshold, confidence_threshold, + offset, algo); + } else { + nms_detection(output_box_num, output_mode, (half *)output, + (half *)score_data, (half 
*)boxes_data, GDRAM, (half *)sram, + taskDim, input_num_boxes, max_output_size, iou_threshold, + confidence_threshold, offset, algo); } } + ((uint32_t *)result_num)[0] = output_box_num; } template __mlu_func__ void nms_detection_ux( - int32_t *loop_end_flag, uint32_t &output_box_num, OUT_DT *output_dram, + int32_t *exit_flag, uint32_t &output_box_num, OUT_DT *output_dram, IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram, - const int input_layout, const int input_num_boxes, const int input_stride, - const int max_output_size, const float thresh_iou, const float thresh_score, - const float offset, const int output_mode, const int algo) { - loop_end_flag[0] = 0; + const int input_num_boxes, const int max_output_size, + const float thresh_iou, const float thresh_score, const float offset, + const int output_mode, const int algo) { + exit_flag[0] = 0; + IN_DT *sram = (IN_DT *)sram_buffer; // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 @@ -654,16 +246,10 @@ __mlu_func__ void nms_detection_ux( float div_thresh_iou = 1.0 / thresh_iou; // input data ptr - IN_DT *input_score_ptr; - const IN_DT *input_x1_ptr; - const IN_DT *input_y1_ptr; - const IN_DT *input_x2_ptr; - const IN_DT *input_y2_ptr; - input_score_ptr = score_data; - input_x1_ptr = boxes_data; - input_y1_ptr = input_x1_ptr + input_stride; - input_x2_ptr = input_y1_ptr + input_stride; - input_y2_ptr = input_x2_ptr + input_stride; + const IN_DT *input_x1_ptr = boxes_data; + const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes; + const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes; + const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes; int limit = 0; // find limit when GDRAM or SRAM int max_seg_pad = 0; // the max length every repeat @@ -682,41 +268,16 @@ __mlu_func__ void nms_detection_ux( (nms_buffer_count1 * sizeof(IN_DT)); } - // data split - int avg_cluster = input_num_boxes / clusterDim; - int rem_cluster = input_num_boxes % clusterDim; - int len_cluster = 
avg_cluster + (clusterId < rem_cluster ? 1 : 0); - int cluster_offset = avg_cluster * clusterId + - (clusterId <= rem_cluster ? clusterId : rem_cluster); - - int avg_core = len_cluster / coreDim; - int rem_core = len_cluster % coreDim; - int len_core = avg_core + (coreId < rem_core ? 1 : 0); - int core_offset = - avg_core * coreId + (coreId <= rem_core ? coreId : rem_core); - int input_offset = cluster_offset + core_offset; - - max_seg_pad = PAD_DOWN(limit, NMS_SIZE); - - // core 0 of each cluster calculate the max score index - int max_index_avg_core = input_num_boxes / clusterDim; - int max_index_rem_core = input_num_boxes % clusterDim; - int max_index_len_core = - max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0); - int max_index_input_offset = - max_index_avg_core * clusterId + - (clusterId <= max_index_rem_core ? clusterId : max_index_rem_core); - repeat = max_index_len_core / max_seg_pad; - remain = max_index_len_core % max_seg_pad; - remain_pad = PAD_UP(remain, NMS_SIZE); - - // if datatype is fp16, we should cvt to fp32 when compute iou - int max_seg_iou_compute = - PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE); - int repeat_iou_compute = len_core / max_seg_iou_compute; - int remain_iou_compute = len_core % max_seg_iou_compute; - int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE); + int input_offset = 0; + int max_seg_iou_compute = 0; + int repeat_iou_compute = 0; + int remain_iou_compute = 0; + int remain_pad_iou_compute = 0; + getComputeParamsUx(sizeof(IN_DT), input_num_boxes, limit, input_offset, + max_seg_pad, repeat, remain, remain_pad, + max_seg_iou_compute, repeat_iou_compute, + remain_iou_compute, remain_pad_iou_compute); // init the nram ptr IN_DT *score = (IN_DT *)nram_buffer; IN_DT *x1 = score + max_seg_pad; @@ -731,320 +292,94 @@ __mlu_func__ void nms_detection_ux( OUT_DT *nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE); // offset two line from max_box - - mluMemcpyDirection_t 
input_load_dir = SRAM2NRAM; - mluMemcpyDirection_t input_store_dir = NRAM2SRAM; - input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; - input_store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM; +#if __BANG_ARCH__ >= 300 + float max_box_x1 = 0; + float max_box_y1 = 0; + float max_box_x2 = 0; + float max_box_y2 = 0; +#endif + mluMemcpyDirection_t load_dir = SRAM2NRAM; + mluMemcpyDirection_t store_dir = NRAM2SRAM; + load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; + store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM; for (int keep = 0; keep < max_output_size; keep++) { // loop until the max_score <= 0 __sync_all(); - /******FIND MAX START******/ int max_index = 0; int global_max_index = 0; // for Ux float max_area = 0; // the max socre area max_box[0] = 0; // init 0 if (coreId == 0) { - for (int i = 0; i <= repeat; i++) { - if (i == repeat && remain == 0) { - break; - } - - int seg_len = (i == repeat) - ? remain_pad - : max_seg_pad; // the length every nms compute - // check seg_len exceeds the limit of fp16 or not. 65536 is the largest - // num - // that fp16 could express. - if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) { - return; - } - int cpy_len = (i == repeat) - ? 
remain - : max_seg_pad; // the length every nms memcpy - - /******NMS LOAD START******/ - __bang_write_zero(score, seg_len); - __memcpy(score, - input_score_ptr + max_index_input_offset + i * max_seg_pad, - cpy_len * sizeof(IN_DT), input_load_dir, - cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); - - /******NMS LOAD END******/ - - __bang_max(inter_x1, score, seg_len); - if (inter_x1[0] > max_box[0]) { - max_box[0] = inter_x1[0]; - if (sizeof(IN_DT) == sizeof(half)) { - max_index = - ((uint16_t *)inter_x1)[1] + max_index_input_offset + - i * max_seg_pad; // offset start from head of input_data - } else if (sizeof(IN_DT) == sizeof(float)) { - max_index = - ((uint32_t *)inter_x1)[1] + max_index_input_offset + - i * max_seg_pad; // offset start from head of input_data - } - } - } // for repeat - - // the max box's x1, y1, x2, y2 on every cluster - max_box[1] = input_x1_ptr[max_index]; - max_box[2] = input_y1_ptr[max_index]; - max_box[3] = input_x2_ptr[max_index]; - max_box[4] = input_y2_ptr[max_index]; - ((uint32_t *)(max_box + 5))[0] = max_index; + findCoreMaxBox(score_data, score, inter_x1, max_box, input_x1_ptr, + input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir, + input_offset, repeat, remain, remain_pad, max_seg_pad, + max_index); // copy max box info to sram __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); } __sync_all(); - // copy all partial max to the sram of cluster 0 - if (clusterId != 0) { - __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT), - SRAM2SRAM, 0); - } - __sync_all(); - - // reduce between clusters to get the global max box - if (clusterId == 0) { - if (coreId == 0) { - __bang_write_zero(inter_x1, NMS_SIZE); - __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT), - REDUCE_NUM * sizeof(IN_DT), clusterDim - 1); - __bang_max(max_box, inter_x1, NMS_SIZE); - int max_cluster = (sizeof(IN_DT) == sizeof(half)) - ? 
((uint16_t *)max_box)[1] - : ((uint32_t *)max_box)[1]; - __memcpy(max_box, sram + max_cluster * REDUCE_NUM, - REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); - __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); - } - __sync_cluster(); - if (coreId == 0x80 && clusterDim > 1) { - // broadcast global max box to each cluster's sram - for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) { - __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM, - cluster_idx); - } - } - __sync_cluster(); - } - __sync_all(); +#if __BANG_ARCH__ <= 372 + findGlobalMaxBox(max_box, sram, inter_x1); +#endif - // copy the global max box to max_box - __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); - if (algo == 0 || offset == 0.0) { - max_area = ((float)max_box[3] - (float)max_box[1]) * - ((float)max_box[4] - (float)max_box[2]); - } else { - max_area = ((float)max_box[3] - (float)max_box[1] + offset) * - ((float)max_box[4] - (float)max_box[2] + offset); - } +#if __BANG_ARCH__ >= 300 + calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1, + max_box_x2, max_box_y2); +#else + calMaxArea(max_box, algo, offset, max_area); +#endif global_max_index = ((uint32_t *)(max_box + 5))[0]; - if (coreId != 0x80) { - input_score_ptr[global_max_index] = 0; + if (coreId != MEMORY_CORE) { + score_data[global_max_index] = 0; } - // by now, we get: max_score|max_index|max_box|max_area - /******FIND MAX END******/ - /******NMS STORE START******/ - // store to nram - if (float(max_box[0]) > thresh_score) { - OUT_DT *save_ptr; - int save_offset = 0; - int save_str_num = 0; - save_ptr = nram_save; - save_offset = nram_save_count; - save_str_num = nram_save_limit_count; - if (clusterId == 0 && coreId == 0) { - if (output_mode == 0) { // index1, index2, ... 
- save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0]; - } else if (output_mode == 1) { // score, x1, y1, x2, y2 - __memcpy(save_ptr + save_offset * INFO_NUM, max_box, - INFO_NUM * sizeof(IN_DT), NRAM2NRAM, - INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0); - } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- - __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), - NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), - 4); - } - } - nram_save_count++; - output_box_num++; - } - - // store to sram/gdram - if (output_box_num != 0) { - if ((nram_save_count == nram_save_limit_count) || - (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) { - if (nram_save_count != 0) { - if (clusterId == 0 && coreId == 0) { - if (output_mode == 0) { // index1, index2, ... - pvLock(); - __memcpy(output_dram, nram_save, - nram_save_count * sizeof(uint32_t), NRAM2GDRAM); - pvUnlock(); - output_dram += nram_save_count; - } else if (output_mode == 1) { // score, x1, y1, x2, y2 - pvLock(); - __memcpy(output_dram, nram_save, - nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM); - pvUnlock(); - output_dram += nram_save_count * INFO_NUM; - } else if (output_mode == - 2) { // score---, x1---, y1---, x2---, y2--- - pvLock(); - __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT), - NRAM2GDRAM, max_output_size * sizeof(IN_DT), - nram_save_limit_count * sizeof(IN_DT), 4); - pvUnlock(); - output_dram += nram_save_count; - } - nram_save_count = 0; - } - } - } // if move data nram->sram/gdram - } // if dst + storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count, + max_output_size, thresh_score, output_mode, nram_save_count, + output_box_num); if (float(max_box[0]) <= thresh_score) { if (clusterId == 0 && coreId == 0) { - loop_end_flag[0] = 1; // dram + exit_flag[0] = 1; // dram } } __sync_all(); - if (loop_end_flag[0] == 1) { + if (exit_flag[0] == 1) { break; } - /******NMS STORE END******/ - - // 
To solve fp16 accuracy, we convert fp16 to fp32 to calculate IoU. - for (int i = 0; i <= repeat_iou_compute; i++) { - if (i == repeat_iou_compute && remain_iou_compute == 0) { - break; - } - int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute - : max_seg_iou_compute; - int cpy_len = - (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute; - - /******NMS LOAD START******/ - __nramset((float *)score, seg_len, 0.0f); - int dt_offset = 0; - if (sizeof(IN_DT) == sizeof(float)) { - __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, - cpy_len * sizeof(IN_DT), input_load_dir, - cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); - dt_offset = 0; - } else if (sizeof(IN_DT) == sizeof(half)) { - __nramset(x1, seg_len, half(0)); - __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), input_load_dir, - cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); - __bang_half2float((float *)score, (half *)x1, seg_len); - dt_offset = max_seg_iou_compute; - } - - __memcpy(x1 + dt_offset, - input_x1_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), input_load_dir, - max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3); - /******NMS LOAD END******/ - - /******NMS COMPUTE START******/ - if (sizeof(IN_DT) == sizeof(half)) { - __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, - seg_len); - } - // 1、 compute IOU - // get the area_I - __nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1 - __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, - seg_len); // inter_x1 - __nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2 - __bang_minequal((float *)inter_x2, (float *)x2, (float 
*)inter_y2, - seg_len); // inter_x2 - __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len); - } - __bang_active_relu((float *)inter_x1, (float *)inter_x1, - seg_len); // inter_w - __nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1 - __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2, - seg_len); // inter_y1 - __nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2 - __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, - seg_len); // inter_y2 - __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); - } - __bang_active_relu((float *)inter_y1, (float *)inter_y1, - seg_len); // inter_h - __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, - seg_len); // area_I - // get the area of input_box: area = (x2 - x1) * (y2 - y1); - __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); - __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); - __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len); - } - __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, - seg_len); // area - // get the area_U: area + max_area - area_I - __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area), - seg_len); - __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, - seg_len); // area_U - // 2、 select the box - // if IOU greater than thres, set the score to zero, abort it: area_U > - // area_I * (1 / thresh)? 
- if (thresh_iou > 0.0) { - __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou, - seg_len); - } else { - __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou, - seg_len); - } - __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, - seg_len); - __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); - /******NMS COMPUTE END******/ - - if (sizeof(IN_DT) == 2) { - __bang_float2half_rd((half *)score, (float *)score, seg_len); - } - pvLock(); - __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, - cpy_len * sizeof(IN_DT), input_store_dir, - cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); - pvUnlock(); - } // for repeat - } // for max_output_size +/******NMS STORE END******/ +#if __BANG_ARCH__ >= 300 + scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr, + input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1, + inter_y1, inter_x2, inter_y2, max_box, max_box_x1, max_box_y1, + max_box_x2, max_box_y2, nram_save, repeat_iou_compute, + remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute, + max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset, + max_area, input_num_boxes, algo); +#else + scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr, + input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1, + inter_y1, inter_x2, inter_y2, max_box, max_box[1], max_box[2], + max_box[3], max_box[4], nram_save, repeat_iou_compute, + remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute, + max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset, + max_area, input_num_boxes, algo); +#endif + } // for max_output_size } __mlu_global__ void MLUUionXKernelNMS( const void *input_boxes, const void *input_confidence, - const int input_num_boxes, const int input_layout, const int input_stride, - const int max_output_size, const float iou_threshold, - const float confidence_threshold, const float offset, - const cnrtDataType_t 
data_type_input, const int output_mode, const int algo, - void *workspace, void *result_num, void *output) { + const int input_num_boxes, const int max_output_size, + const float iou_threshold, const float confidence_threshold, + const float offset, const cnrtDataType_t data_type_input, + const int output_mode, const int algo, void *workspace, void *result_num, + void *output) { int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2; - int32_t *loop_end_flag = - (int32_t *)((char *)workspace + - INFO_NUM * input_num_boxes * input_dwidth); + int32_t *exit_flag = (int32_t *)((char *)workspace + + INFO_NUM * input_num_boxes * input_dwidth); int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth; int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size; @@ -1062,88 +397,55 @@ __mlu_global__ void MLUUionXKernelNMS( __memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM); } __sync_cluster(); + uint32_t output_box_num = 0; + float *score_data; + float *boxes_data; + score_data = (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; + boxes_data = (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes; + if (output_mode == 0) { - uint32_t *output_dram = (uint32_t *)output; - switch (data_type_input) { - default: { return; } - case CNRT_FLOAT16: { - half *score_data; - half *boxes_data; - score_data = - (input_ram == SRAM) ? (half *)sram_score : (half *)workspace; - boxes_data = - (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; - nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, - boxes_data, input_ram, input_layout, input_num_boxes, - input_stride, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo); - ((uint32_t *)result_num)[0] = output_box_num; - }; break; - case CNRT_FLOAT32: { - float *score_data; - float *boxes_data; - score_data = - (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; - boxes_data = - (input_ram == SRAM) ? 
(float *)sram_boxes : (float *)input_boxes; - nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, - boxes_data, input_ram, input_layout, input_num_boxes, - input_stride, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo); - ((uint32_t *)result_num)[0] = output_box_num; - }; break; + if (data_type_input == CNRT_FLOAT32) { + nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output, + score_data, boxes_data, input_ram, input_num_boxes, + max_output_size, iou_threshold, confidence_threshold, + offset, output_mode, algo); + } else { + nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output, + (half *)score_data, (half *)boxes_data, input_ram, + input_num_boxes, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); } } else { - switch (data_type_input) { - default: { return; } - case CNRT_FLOAT16: { - half *output_dram = (half *)output; - half *score_data; - half *boxes_data; - score_data = - (input_ram == SRAM) ? (half *)sram_score : (half *)workspace; - boxes_data = - (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; - nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, - boxes_data, input_ram, input_layout, input_num_boxes, - input_stride, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo); - ((uint32_t *)result_num)[0] = output_box_num; - }; break; - case CNRT_FLOAT32: { - float *output_dram = (float *)output; - float *score_data; - float *boxes_data; - score_data = - (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; - boxes_data = - (input_ram == SRAM) ? 
(float *)sram_boxes : (float *)input_boxes; - nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, - boxes_data, input_ram, input_layout, input_num_boxes, - input_stride, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo); - ((uint32_t *)result_num)[0] = output_box_num; - }; break; + if (data_type_input == CNRT_FLOAT32) { + nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data, + boxes_data, input_ram, input_num_boxes, max_output_size, + iou_threshold, confidence_threshold, offset, output_mode, + algo); + } else { + nms_detection_ux(exit_flag, output_box_num, (half *)output, + (half *)score_data, (half *)boxes_data, input_ram, + input_num_boxes, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); } } + ((uint32_t *)result_num)[0] = output_box_num; } void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const cnrtDataType_t data_type_input, const void *boxes_ptr, const void *scores_ptr, const int input_num_boxes, - const int input_stride, const int max_output_boxes, - const float iou_threshold, const float offset, - void *workspace_ptr, void *output_size_ptr, void *output_ptr) { + const int max_output_boxes, const float iou_threshold, + const float offset, void *workspace_ptr, void *output_size_ptr, + void *output_ptr) { switch (k_type) { default: { return; } case CNRT_FUNC_TYPE_BLOCK: case CNRT_FUNC_TYPE_UNION1: { MLUUnion1KernelNMS<<>>( - boxes_ptr, scores_ptr, input_num_boxes, input_stride, + (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes, max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, - /*output_mode=*/0, - /*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr, + /*output_mode=*/0, workspace_ptr, output_size_ptr, output_ptr, data_type_input, offset, /*algo=*/1); }; break; case CNRT_FUNC_TYPE_UNION2: @@ -1151,11 +453,10 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, case 
CNRT_FUNC_TYPE_UNION8: case CNRT_FUNC_TYPE_UNION16: { MLUUionXKernelNMS<<<k_dim, k_type, queue>>>( - boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1, - input_stride, max_output_boxes, iou_threshold, - /*confidence_threshold=*/0.0, offset, data_type_input, - /*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr, - output_ptr); + (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes, + max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, offset, + data_type_input, /*output_mode=*/0, /*algo=*/1, workspace_ptr, + output_size_ptr, output_ptr); }; break; } } diff --git a/mmcv/ops/csrc/common/mlu/nms_utils.hpp b/mmcv/ops/csrc/common/mlu/nms_utils.hpp new file mode 100644 index 0000000000..61f5ba95df --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/nms_utils.hpp @@ -0,0 +1,553 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/ +#ifndef NMS_UTILS_HPP_ +#define NMS_UTILS_HPP_ +#include "common_mlu_helper.hpp" + +#define NMS_SIZE (64) +#define NMS_UP(x, y) (x / y + (int)(x % y > 0)) * y +#define NMS_DOWN(x, y) (x / y) * y +#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score +#define MEMORY_CORE (0x80) +#define REDUCE_NUM \ + (7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input) + +__mlu_func__ void pvLock() { +#if __BANG_ARCH__ == 270 + if (coreId != MEMORY_CORE) { + __bang_lock(0, 0); + } +#endif +} + +__mlu_func__ void pvUnlock() { +#if __BANG_ARCH__ == 270 + if (coreId != MEMORY_CORE) { + __bang_unlock(0, 0); + } +#endif +} + +template +static __mlu_func__ void computeReluN(T *nram_dst, T *nram_src, void *nram_tmp, + const int deal_num, + const T threshold = 0) { + if (threshold < 0) { + return; + } + if (threshold) { +#if __BANG_ARCH__ >= 300 + __bang_relun(nram_dst, nram_src, deal_num, threshold); +#else + int align_num = NFU_ALIGN_SIZE / sizeof(T); + T *nram_aux_a = (T *)nram_tmp; + T *nram_aux_b = nram_aux_a + deal_num; + T *nram_zero = nram_aux_b + align_num; + __bang_write_value(nram_aux_b, align_num, threshold); + __bang_write_zero(nram_zero, align_num); + __bang_cycle_lt((T *)nram_aux_a, nram_src, (T *)nram_aux_b, deal_num, + align_num); + __bang_mul(nram_dst, nram_src, (T *)nram_aux_a, deal_num); + __bang_cycle_eq((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_zero, deal_num, + align_num); + __bang_cycle_mul((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_aux_b, + deal_num, align_num); + __bang_add(nram_dst, nram_dst, (T *)nram_aux_a, deal_num); + __bang_cycle_gt((T *)nram_aux_a, nram_dst, (T *)nram_zero, deal_num, + align_num); + __bang_mul(nram_dst, nram_dst, (T *)nram_aux_a, deal_num); +#endif + } else { +#if __BANG_ARCH__ >= 300 + __bang_relu(nram_dst, nram_src, deal_num); +#else + __bang_active_relu(nram_dst, nram_src, deal_num); +#endif + } +} + +__mlu_func__ void 
getComputeParamsBlockOrU1( + const int input_dwidth, const int input_box_num, const int limit, + const int core_limit, int &input_offset, int &max_seg_pad, int &repeat, + int &remain, int &remain_pad, int &max_seg_iou_compute, + int &repeat_iou_compute, int &remain_iou_compute, + int &remain_pad_iou_compute) { + int avg_core = input_box_num / core_limit; + int rem = input_box_num % core_limit; + int len_core = avg_core + (coreId < rem ? 1 : 0); + input_offset = avg_core * coreId + (coreId <= rem ? coreId : rem); + max_seg_pad = NMS_DOWN(limit, NMS_SIZE); + repeat = len_core / max_seg_pad; + remain = len_core % max_seg_pad; + remain_pad = NMS_UP(remain, NMS_SIZE); + + // if datatype is fp16, we should cvt to fp32 when compute iou + max_seg_iou_compute = NMS_DOWN(max_seg_pad / (4 / input_dwidth), NMS_SIZE); + repeat_iou_compute = len_core / max_seg_iou_compute; + remain_iou_compute = len_core % max_seg_iou_compute; + remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE); +} + +__mlu_func__ void getComputeParamsUx( + const int input_dwidth, const int input_num_boxes, const int limit, + int &input_offset, int &max_seg_pad, int &repeat, int &remain, + int &remain_pad, int &max_seg_iou_compute, int &repeat_iou_compute, + int &remain_iou_compute, int &remain_pad_iou_compute) { + // data split + int avg_cluster = input_num_boxes / clusterDim; + int rem_cluster = input_num_boxes % clusterDim; + int len_cluster = avg_cluster + (clusterId < rem_cluster); + int cluster_offset = avg_cluster * clusterId + + (clusterId <= rem_cluster ? clusterId : rem_cluster); + + int avg_core = len_cluster / coreDim; + int rem_core = len_cluster % coreDim; + int len_core = avg_core + (coreId < rem_core); + int core_offset = + avg_core * coreId + (coreId <= rem_core ? 
coreId : rem_core); + input_offset = cluster_offset + core_offset; + + max_seg_pad = NMS_DOWN(limit, NMS_SIZE); + + // core 0 of each cluster calculate the max score index + int max_index_len_core = avg_cluster + (clusterId < rem_cluster); + repeat = max_index_len_core / max_seg_pad; + remain = max_index_len_core % max_seg_pad; + remain_pad = NMS_UP(remain, NMS_SIZE); + // if datatype is fp16, we should cvt to fp32 when compute iou + max_seg_iou_compute = + NMS_DOWN(max_seg_pad / (sizeof(float) / input_dwidth), NMS_SIZE); + repeat_iou_compute = len_core / max_seg_iou_compute; + remain_iou_compute = len_core % max_seg_iou_compute; + remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE); +} + +template +__mlu_func__ void findGlobalMaxBox(IN_DT *max_box, IN_DT *sram, + IN_DT *inter_x1) { + // copy all partial max to the sram of cluster 0 + if (clusterId != 0) { + __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT), + SRAM2SRAM, 0); + } + __sync_all(); + + // reduce between clusters to get the global max box + if (clusterId == 0) { + if (coreId == 0) { + __bang_write_zero(inter_x1, NMS_SIZE); + __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT), + REDUCE_NUM * sizeof(IN_DT), clusterDim - 1); + __bang_max(max_box, inter_x1, NMS_SIZE); + int max_cluster = (sizeof(IN_DT) == sizeof(half)) + ? 
((uint16_t *)max_box)[1] + : ((uint32_t *)max_box)[1]; + __memcpy(max_box, sram + max_cluster * REDUCE_NUM, + REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); + __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); + } + __sync_cluster(); + if (coreId == 0x80 && clusterDim > 1) { + // broadcast global max box to each cluster's sram + for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) { + __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM, + cluster_idx); + } + } + __sync_cluster(); + } + __sync_all(); + + // copy the global max box to max_box + __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); +} + +template +__mlu_func__ void findCoreMaxBox( + IN_DT *input_score_ptr, IN_DT *score, IN_DT *inter_x1, IN_DT *max_box, + const IN_DT *input_x1_ptr, const IN_DT *input_y1_ptr, + const IN_DT *input_x2_ptr, const IN_DT *input_y2_ptr, + const mluMemcpyDirection_t load_dir, const int input_offset, + const int repeat, const int remain, const int remain_pad, + const int max_seg_pad, int &max_index) { + if (coreId != 0x80) { + for (int i = 0; i <= repeat; i++) { + if (i == repeat && remain == 0) { + break; + } + int seg_len = 0; // the length every nms compute + int cpy_len = 0; // the length every nms memcpy + i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad; + i == repeat ? 
cpy_len = remain : cpy_len = max_seg_pad; + /******NMS LOAD START******/ + __bang_write_zero(score, seg_len); + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + + /******NMS LOAD END******/ + + __bang_max(inter_x1, score, seg_len); + if (inter_x1[0] > max_box[0]) { + max_box[0] = inter_x1[0]; + if (sizeof(IN_DT) == sizeof(half)) { + max_index = ((uint16_t *)inter_x1)[1] + input_offset + + i * max_seg_pad; // offset start from head of input_data + } else if (sizeof(IN_DT) == sizeof(float)) { + max_index = ((uint32_t *)inter_x1)[1] + input_offset + + i * max_seg_pad; // offset start from head of input_data + } + } + } // for repeat + // the max box's x1, y1, x2, y2 on every core + max_box[1] = input_x1_ptr[max_index]; + max_box[2] = input_y1_ptr[max_index]; + max_box[3] = input_x2_ptr[max_index]; + max_box[4] = input_y2_ptr[max_index]; + ((uint32_t *)(max_box + 5))[0] = max_index; + } +} + +template +__mlu_func__ void findClusterMaxBox(IN_DT *sram, IN_DT *max_box, + IN_DT *inter_x1, IN_DT *input_data_score, + const int core_limit) { + // find the max with sram + // copy every core's box info to sram, form: score---x1---y1---x2---y2--- + __memcpy(sram + REDUCE_NUM * coreId, max_box, REDUCE_NUM * sizeof(IN_DT), + NRAM2SRAM); // int32_t datatype + __sync_cluster(); + + // copy score from sram to nram and find the max + __bang_write_zero(inter_x1, 64); + __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT), + REDUCE_NUM * sizeof(IN_DT), coreDim - 1); + __bang_max(max_box, inter_x1, 64); + int max_core = sizeof(IN_DT) == sizeof(half) ? 
((uint16_t *)max_box)[1] + : ((uint32_t *)max_box)[1]; + // copy the max box to max_box + __memcpy(max_box, sram + max_core * REDUCE_NUM, REDUCE_NUM * sizeof(IN_DT), + SRAM2NRAM); +} + +/*****************************************************************************/ +/*******************************CALCULATE MAX AREA****************************/ +/*****************************************************************************/ + +template +__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset, + float &max_area) { + if (algo == 0 || offset == 0.0) { + max_area = ((float)max_box[3] - (float)max_box[1]) * + ((float)max_box[4] - (float)max_box[2]); + } else { + max_area = ((float)max_box[3] - (float)max_box[1] + offset) * + ((float)max_box[4] - (float)max_box[2] + offset); + } +} + +template +__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset, + float &max_area, float &max_box_x1, + float &max_box_y1, float &max_box_x2, + float &max_box_y2) { + // the case of random inf will break the requirement of x1<=x2, y1<=y2 + // so exchange it if it happens. 
+ max_box_x1 = float(max_box[1]); + max_box_x2 = float(max_box[3]); + if (max_box[1] > max_box[3]) { + max_box_x1 = float(max_box[3]); + max_box_x2 = float(max_box[1]); + } + max_box_y1 = float(max_box[2]); + max_box_y2 = float(max_box[4]); + if (max_box[2] > max_box[4]) { + max_box_y1 = float(max_box[4]); + max_box_y2 = float(max_box[2]); + } + if (algo == 0 || offset == 0.0) { + max_area = (max_box_x2 - max_box_x1) * (max_box_y2 - max_box_y1); + } else { + max_area = + (max_box_x2 - max_box_x1 + offset) * (max_box_y2 - max_box_y1 + offset); + } +} + +/***********************************************************************/ +/*******************************STORE RESULT****************************/ +/***********************************************************************/ +template +__mlu_func__ void storeResult(IN_DT *max_box, OUT_DT *nram_save, + OUT_DT *&output_dram, const int keep, + const int nram_save_limit_count, + const int max_output_size, + const float thresh_score, const int output_mode, + int &nram_save_count, uint32_t &output_box_num) { + /******NMS STORE START******/ + // store to nram + if (float(max_box[0]) > thresh_score) { + OUT_DT *save_ptr; + int save_offset = 0; + int save_str_num = 0; + save_ptr = nram_save; + save_offset = nram_save_count; + save_str_num = nram_save_limit_count; + if (clusterId == 0 && coreId == 0) { + if (output_mode == 0) { // index1, index2, ... 
+ save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0]; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + __memcpy(save_ptr + save_offset * INFO_NUM, max_box, + INFO_NUM * sizeof(IN_DT), NRAM2NRAM, INFO_NUM * sizeof(IN_DT), + INFO_NUM * sizeof(IN_DT), 0); + } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- + __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), NRAM2NRAM, + save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), 4); + } + } + nram_save_count++; + output_box_num++; + } + + // store to sram/gdram + if (output_box_num != 0) { + if ((nram_save_count == nram_save_limit_count) || + (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) { + if (nram_save_count != 0) { + if (clusterId == 0 && coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + pvLock(); + __memcpy(output_dram, nram_save, nram_save_count * sizeof(uint32_t), + NRAM2GDRAM); + pvUnlock(); + output_dram += nram_save_count; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + pvLock(); + __memcpy(output_dram, nram_save, + nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM); + pvUnlock(); + output_dram += nram_save_count * INFO_NUM; + } else if (output_mode == + 2) { // score---, x1---, y1---, x2---, y2--- + pvLock(); + __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT), + NRAM2GDRAM, max_output_size * sizeof(IN_DT), + nram_save_limit_count * sizeof(IN_DT), 4); + pvUnlock(); + output_dram += nram_save_count; + } + nram_save_count = 0; + } + } + } // if move data nram->sram/gdram + } // if dst +} + +template +__mlu_func__ void scoreUpdate( + IN_DT *input_score_ptr, const mluMemcpyDirection_t load_dir, + const mluMemcpyDirection_t store_dir, const IN_DT *input_x1_ptr, + const IN_DT *input_y1_ptr, const IN_DT *input_x2_ptr, + const IN_DT *input_y2_ptr, IN_DT *x1, IN_DT *y1, IN_DT *x2, IN_DT *y2, + IN_DT *score, IN_DT *inter_x1, IN_DT *inter_y1, IN_DT *inter_x2, + IN_DT *inter_y2, IN_DT *max_box, const float 
max_box_x1, + const float max_box_y1, const float max_box_x2, const float max_box_y2, + OUT_DT *nram_save, int repeat_iou_compute, int remain_iou_compute, + int remain_pad_iou_compute, int max_seg_iou_compute, int max_seg_pad, + const float thresh_iou, const float div_thresh_iou, const int input_offset, + const float offset, const float max_area, const int input_num_boxes, + const int algo) { + for (int i = 0; i <= repeat_iou_compute; i++) { + if (i == repeat_iou_compute && remain_iou_compute == 0) { + break; + } + int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute + : max_seg_iou_compute; + int cpy_len = + (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute; + /******NMS LOAD START******/ + int dt_offset = 0; + if (sizeof(IN_DT) == sizeof(float)) { + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + dt_offset = 0; + } else if (sizeof(IN_DT) == sizeof(half)) { + __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __bang_half2float((float *)score, (half *)x1, seg_len); + dt_offset = max_seg_iou_compute; + } +#if __BANG_ARCH__ >= 300 + __memcpy(inter_x1 + dt_offset, + input_x1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT), + input_num_boxes * sizeof(IN_DT), 3); + + if (sizeof(IN_DT) == sizeof(half)) { + __bang_half2float((float *)inter_x1, + (half *)inter_x1 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)inter_y1, + (half *)inter_y1 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)inter_x2, + (half *)inter_x2 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)inter_y2, + (half *)inter_y2 + max_seg_iou_compute, seg_len); + } + // box transfer + __bang_minequal((float *)x1, (float *)inter_x1, (float *)inter_x2, 
seg_len); + __bang_maxequal((float *)x2, (float *)inter_x1, (float *)inter_x2, seg_len); + __bang_minequal((float *)y1, (float *)inter_y1, (float *)inter_y2, seg_len); + __bang_maxequal((float *)y2, (float *)inter_y1, (float *)inter_y2, seg_len); + // 1、 compute IOU + // get the area_I + __bang_maxeq_scalar((float *)inter_x1, (float *)x1, max_box_x1, + seg_len); // inter_x1 + __bang_mineq_scalar((float *)inter_x2, (float *)x2, max_box_x2, + seg_len); // inter_x2 + __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len); + } + computeReluN((float *)inter_x1, (float *)inter_x1, NULL, + seg_len); // inter_w + __bang_maxeq_scalar((float *)inter_y1, (float *)y1, float(max_box_y1), + seg_len); // inter_y1 + __bang_mineq_scalar((float *)inter_y2, (float *)y2, float(max_box_y2), + seg_len); // inter_y2 + __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len); + } + computeReluN((float *)inter_y1, (float *)inter_y1, NULL, + seg_len); // inter_h + __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, + seg_len); // area_I + // get the area of input_box: area = (x2 - x1) * (y2 - y1); + if (algo == 1 && offset != 0.0) { + __bang_fusion(FUSION_FSA, (float *)inter_y1, (float *)x2, (float *)x1, + offset, seg_len, seg_len); + __bang_fusion(FUSION_FSA, (float *)inter_y2, (float *)y2, (float *)y1, + offset, seg_len, seg_len); + __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, + seg_len); // area + } else { + __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); + __bang_fusion(FUSION_FSM, (float *)inter_x2, (float *)y2, (float *)y1, + (float *)inter_y1, seg_len, seg_len); + } + // get the area_U: area + max_area - area_I + __bang_fusion(FUSION_FAS, (float *)inter_x2, 
(float *)inter_x2, max_area, + (float *)inter_x1, seg_len, seg_len); + // 2、 select the box + // if IOU greater than thres, set the score to zero, abort it: area_U > + // area_I * (1 / thresh)? + if (thresh_iou > 0.0) { + __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou, + seg_len); + } else { + __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou, + seg_len); + } + // process for nan + __bang_lt((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len); + __bang_not((float *)inter_x1, (float *)inter_x1, seg_len); + __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); +/******NMS COMPUTE END******/ +#else + __memcpy(x1 + dt_offset, + input_x1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT), + input_num_boxes * sizeof(IN_DT), 3); + if (sizeof(IN_DT) == sizeof(half)) { + __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, seg_len); + } + // 1、 compute IOU + // get the area_I + __bang_write_value((float *)inter_y1, seg_len, + float(max_box[1])); // max_x1 + __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, + seg_len); // inter_x1 + __bang_write_value((float *)inter_y2, seg_len, + float(max_box[3])); // max_x2 + __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2, + seg_len); // inter_x2 + __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len); + } + computeReluN((float *)inter_x1, (float *)inter_x1, NULL, + seg_len); // inter_w + __bang_write_value((float *)inter_x2, seg_len, + float(max_box[2])); // max_y1 + __bang_maxequal((float 
*)inter_y1, (float *)y1, (float *)inter_x2, + seg_len); // inter_y1 + __bang_write_value((float *)inter_x2, seg_len, + float(max_box[4])); // max_y2 + __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, + seg_len); // inter_y2 + __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len); + } + computeReluN((float *)inter_y1, (float *)inter_y1, NULL, + seg_len); // inter_h + __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, + seg_len); // area_I + // get the area of input_box: area = (x2 - x1) * (y2 - y1); + __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); + __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len); + __bang_add_scalar((float *)inter_y2, (float *)inter_y2, offset, seg_len); + } + __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, + seg_len); // area + // get the area_U: area + max_area - area_I + __bang_add_scalar((float *)inter_x2, (float *)inter_x2, float(max_area), + seg_len); + __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, + seg_len); // area_U + // 2、 select the box + // if IOU greater than thresh, set the score to zero, abort it: area_U > + // area_I * (1 / thresh)? 
+ if (thresh_iou > 0.0) { + __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou, + seg_len); + } else { + __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou, + seg_len); + } + __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len); + __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); +/******NMS COMPUTE END******/ +#endif + // update the score + if (sizeof(IN_DT) == sizeof(half)) { + convertFloat2half((half *)score, (float *)score, seg_len); + } + pvLock(); + __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, + cpy_len * sizeof(IN_DT), store_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + pvUnlock(); + } +} + +#endif // NMS_UTILS_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu index 13b4af19f6..055ee4f4d0 100644 --- a/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu @@ -53,9 +53,8 @@ __mlu_func__ void loadDataFromDramToNram(T *dst, const T *src, int w_seg = position.w_end - position.w_start; int size = h_seg * w_seg * shape_full.c; - __memcpy(dst, - src + position.n_start * n_offset + position.h_start * h_offset + - position.w_start * w_offset, + __memcpy(dst, src + position.n_start * n_offset + + position.h_start * h_offset + position.w_start * w_offset, size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T), n_seg - 1); } @@ -89,7 +88,7 @@ __mlu_func__ void psamaskCollectForward( int elem_count = CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c, NFU_ALIGN_SIZE / sizeof(T)); - __nramset(y_nram, elem_count, (T)0); + __bang_write_value(y_nram, elem_count, (T)0); int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c; int y_h_offset = shape_seg.w * shape_seg.c; @@ -155,7 +154,7 @@ __mlu_func__ void psamaskDistributeForward( CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); int 
elem_count = CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T)); - __nramset(y_nram_temp, elem_count, (T)0); + __bang_write_value(y_nram_temp, elem_count, (T)0); int y_n_offset = align_hw * align_c; int y_h_offset = shape_seg.w * align_c; @@ -242,7 +241,7 @@ __mlu_func__ void psamaskCollectBackward( int elem_count = CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c, NFU_ALIGN_SIZE / sizeof(T)); - __nramset(dx_nram, elem_count, (T)0); + __bang_write_value(dx_nram, elem_count, (T)0); int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c; int dy_h_offset = shape_seg.w * dy_full.c; @@ -331,7 +330,8 @@ __mlu_func__ void psamaskDistributeBackward( // fill zeros to dx T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c; int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c; - __nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0); + __bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), + (T)0); int dy_n_offset_seg = align_hw * align_c; int dy_h_offset_seg = shape_seg.w * align_c; diff --git a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu index f62554d0ef..c99176ab20 100644 --- a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu @@ -130,10 +130,10 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core, __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM); // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4 - __bang_mul_const(tmp_cyc1, tmp_cyc1, w1, align_channel); - __bang_mul_const(tmp_cyc2, tmp_cyc2, w2, align_channel); - __bang_mul_const(tmp_cyc3, tmp_cyc3, w3, align_channel); - __bang_mul_const(tmp_cyc4, tmp_cyc4, w4, align_channel); + __bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel); + __bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel); + __bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel); + 
__bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel); __bang_add(nram_in, tmp_cyc1, nram_in, align_channel); __bang_add(nram_in, tmp_cyc2, nram_in, align_channel); @@ -146,7 +146,7 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core, } // loop_roi_grid_w } // loop_roi_grid_h T count_value = (T)(1.0 / count); - __bang_mul_const(nram_out, nram_out, count_value, align_channel); + __bang_mul_scalar(nram_out, nram_out, count_value, align_channel); __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM); } // loop_cyc_num } @@ -242,8 +242,8 @@ __mlu_global__ void MLUUnion1KernelRoiAlignAvg( case CNRT_FLOAT16: { roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned, channels, pooled_height, pooled_width, input_height, - input_width, sampling_ratio, - (half)spatial_scale, num_rois); + input_width, sampling_ratio, (half)spatial_scale, + num_rois); }; break; case CNRT_FLOAT32: { roialignForwardAvg((float *)input, (float *)rois, (float *)output, @@ -346,31 +346,31 @@ __mlu_func__ void unionRoiAlignBp( &x_high, &y_low, &y_high); if (x_low >= 0 && y_low >= 0) { __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM); - __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w1, - c_align); - __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1, + c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); __bang_atomic_add((T *)buffer + c_align, image_offset + y_low * wo * c + x_low * c, (T *)buffer + c_align, c); - __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w2, - c_align); - __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2, + c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); __bang_atomic_add((T *)buffer + c_align, image_offset 
+ y_low * wo * c + x_high * c, (T *)buffer + c_align, c); - __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w3, - c_align); - __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3, + c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); __bang_atomic_add((T *)buffer + c_align, image_offset + y_high * wo * c + x_low * c, (T *)buffer + c_align, c); - __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w4, - c_align); - __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4, + c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); __bang_atomic_add((T *)buffer + c_align, image_offset + y_high * wo * c + x_high * c, (T *)buffer + c_align, c); @@ -401,34 +401,34 @@ __mlu_func__ void unionRoiAlignBp( } __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T), GDRAM2NRAM); - __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w1, - align_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1, + align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); __bang_atomic_add( (T *)buffer + align_c, image_offset + y_low * wo * c + x_low * c + i * deal_once, (T *)buffer + align_c, deal_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w2, - align_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2, + align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); __bang_atomic_add( (T *)buffer + align_c, image_offset + y_low * wo * c + x_high * c + i * deal_once, (T *)buffer + align_c, deal_c); - 
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w3, - align_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3, + align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); __bang_atomic_add( (T *)buffer + align_c, image_offset + y_high * wo * c + x_low * c + i * deal_once, (T *)buffer + align_c, deal_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w4, - align_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4, + align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); __bang_atomic_add( (T *)buffer + align_c, image_offset + y_high * wo * c + x_high * c + i * deal_once, diff --git a/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu index 7186cdfac3..3a6d2d3ba6 100644 --- a/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu @@ -204,11 +204,11 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch, } if (is_empty) { - __nramset((T *)nram_out, c_slice_align, (T)0); + __bang_write_value((T *)nram_out, c_slice_align, (T)0); __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, c_slice * t_size, NRAM2GDRAM); if (NULL != argmax) { - __nramset((int32_t *)nram_out, c_slice_align, (int32_t)(-1)); + __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1)); __memcpy((int32_t *)argmax_base + dst_offset + c_offset, (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM); } @@ -238,18 +238,18 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch, for (int i = 0; i < c_slice; i++) { nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim); } - __bang_add_const((float *)nram_a, (float *)nram_out, (float)bin_y1, - 
c_slice_align); - __bang_mul_const((float *)nram_ping, (float *)nram_a, (float)width, - c_slice_align); + __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1, + c_slice_align); + __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width, + c_slice_align); /*compute input_w*/ - __bang_mul_const((float *)nram_a, (float *)nram_out, (float)bin_wdim, - c_slice_align); + __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim, + c_slice_align); __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a, c_slice_align); - __bang_add_const((float *)nram_a, (float *)nram_a, (float)bin_x1, - c_slice_align); + __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1, + c_slice_align); __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a, c_slice_align); convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a, @@ -290,9 +290,7 @@ __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type, rois_num, (float)spatial_scale, (float *)output_data, argmax); }; break; - default: { - break; - } + default: { break; } } } } // namespace forward @@ -328,30 +326,30 @@ __mlu_func__ void convertIndex( align_c); // Perform 'temp_result - hstart' operation - __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart, - align_c); + __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart, + align_c); // Perform 'temp_result1 - temp_result2 * width' operation - __bang_mul_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width, - align_c); + __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width, + align_c); convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, (float *)nram_argmax_fp_w, align_c); // Perform 'temp_result - wstart' operation - __bang_sub_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, wstart, - 
align_c); + __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, + wstart, align_c); // Perform 'temp_result = h * w_compute + w' operation - __bang_mul_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, - w_compute, align_c); + __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + w_compute, align_c); __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_w, align_c); if (loop_flag == 1) { - __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, - (loop_id * true_limit), align_c); + __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + (loop_id * true_limit), align_c); } convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1, (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2, @@ -460,21 +458,22 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads, */ // Load the data from GDRAM to NRAM. - __memcpy((T *)nram_grads + align_c * high_precision, - (const T *)grads + (n * pooled_height * pooled_width + - ph * pooled_width + pw) * - channels, - channels * sizeof(T), GDRAM2NRAM); + __memcpy( + (T *)nram_grads + align_c * high_precision, + (const T *)grads + + (n * pooled_height * pooled_width + ph * pooled_width + pw) * + channels, + channels * sizeof(T), GDRAM2NRAM); if (high_precision) { __bang_half2float((float *)nram_grads, (half *)nram_grads + align_c * high_precision, align_c); } - __memcpy((int32_t *)nram_argmax, - (const int32_t *)argmax + (n * pooled_height * pooled_width + - ph * pooled_width + pw) * - channels, + __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax + + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, channels * sizeof(int32_t), GDRAM2NRAM); // Perform pooling operation on NRAM. @@ -523,20 +522,21 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads, */ // Load the data from GDRAM to NRAM. 
- __memcpy((T *)nram_grads + align_c * high_precision, - (const T *)grads + (n * pooled_height * pooled_width + - ph * pooled_width + pw) * - channels, - channels * sizeof(T), GDRAM2NRAM); + __memcpy( + (T *)nram_grads + align_c * high_precision, + (const T *)grads + + (n * pooled_height * pooled_width + ph * pooled_width + pw) * + channels, + channels * sizeof(T), GDRAM2NRAM); if (high_precision) { __bang_half2float((float *)nram_grads, (half *)nram_grads + align_c * high_precision, align_c); } - __memcpy((int32_t *)nram_argmax, - (const int32_t *)argmax + (n * pooled_height * pooled_width + - ph * pooled_width + pw) * - channels, + __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax + + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, channels * sizeof(int32_t), GDRAM2NRAM); int ping_pong = 0; @@ -713,9 +713,7 @@ __mlu_global__ void MLUKernelRoiPoolBackward( height, width, pooled_height, pooled_width, rois_num, (const float)spatial_scale, high_precision); }; break; - default: { - break; - } + default: { break; } } } } // namespace backward diff --git a/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu index 7cb6df0e5d..ed64c2b68c 100644 --- a/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu @@ -26,7 +26,7 @@ __mlu_func__ void mluMultiKernelTinShift( int t_shift = shifts[n_index * group_size + group_id]; int index = cur_channel_index % channel_size * hw_size + n_index * time_size * channel_size * hw_size; - __nramset(data_nram, MAX_NRAM_SIZE, (char)0); + __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0); __asm__ volatile("sync;"); if (abs(t_shift) >= time_size) { __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, @@ -109,7 +109,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence( int next_sequence_index = index / hw_size / channel_size % time_size + segmentime_size; int cur_sequence_index = index 
/ hw_size / channel_size % time_size; - __nramset(data_nram, MAX_NRAM_SIZE, (char)0); + __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0); __asm__ volatile("sync;"); if (max_number_hw_per_core == 0) { mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index, diff --git a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp index 33c4f7de50..51a3003812 100644 --- a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp +++ b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp @@ -16,9 +16,9 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const cnrtDataType_t data_type_input, const void *boxes_ptr, const void *scores_ptr, const int input_num_boxes, - const int input_stride, const int max_output_boxes, - const float iou_threshold, const float offset, - void *workspace_ptr, void *output_size_ptr, void *output_ptr); + const int max_output_boxes, const float iou_threshold, + const float offset, void *workspace_ptr, void *output_size_ptr, + void *output_ptr); int selectUnionType(uint32_t use_job, int box_num_per_core) { // the box_num_per_core should be at least 256, otherwise the real IO @@ -30,6 +30,46 @@ int selectUnionType(uint32_t use_job, int box_num_per_core) { return use_job; } +static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type, + int &core_num_per_class, + const int input_box_num) { + uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + uint32_t cluster_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); + uint32_t job_limit = cluster_number * core_dim; + uint32_t core_number = job_limit; + + int box_num_per_core = (input_box_num + core_number - 1) / core_number; + int use_job = selectUnionType(job_limit, box_num_per_core); + // initiate k_type as Union1 + k_dim->x = core_dim; + k_dim->y = 1; + k_dim->z = 1; + *k_type = CNRT_FUNC_TYPE_UNION1; + switch (job_limit) { + case CN_KERNEL_CLASS_BLOCK: + case CN_KERNEL_CLASS_UNION: + case CN_KERNEL_CLASS_UNION2: + case 
CN_KERNEL_CLASS_UNION4: + case CN_KERNEL_CLASS_UNION8: + case CN_KERNEL_CLASS_UNION16: { + if (use_job < 4) { + k_dim->x = 1; + *k_type = CNRT_FUNC_TYPE_BLOCK; + } else if (use_job == 4) { + k_dim->x = core_dim; + *k_type = CNRT_FUNC_TYPE_UNION1; + } else { + k_dim->x = use_job; + *k_type = (cnrtFunctionType_t)use_job; + } + }; break; + default: + LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number." + << " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task."; + } + return CNNL_STATUS_SUCCESS; +} + Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, int offset) { // dimension parameters check @@ -53,33 +93,14 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, } int input_num_boxes = boxes.size(0); - int input_stride = boxes.size(0); int max_output_boxes = boxes.size(0); cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype()); cnrtDim3_t k_dim; cnrtJobType_t k_type; - uint32_t union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); - uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); - uint32_t job_limit = union_number * core_dim; - uint32_t core_number = union_number * core_dim; - int box_num_per_core = (input_num_boxes + core_number - 1) / core_number; - // initiate k_type as Union1 - k_dim.x = core_dim; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; - int use_job = selectUnionType(job_limit, box_num_per_core); - if (use_job < 4) { - k_dim.x = 1; - k_type = CNRT_FUNC_TYPE_BLOCK; - } else if (use_job == 4) { - k_dim.x = core_dim; - k_type = CNRT_FUNC_TYPE_UNION1; - } else { - k_dim.x = use_job; - k_type = (cnrtFunctionType_t)use_job; - } + + int core_num_per_class; + policyFunc(&k_dim, &k_type, core_num_per_class, input_num_boxes); // transpose boxes (n, 4) to (4, n) for better performance auto boxes_t = boxes.transpose(0, 1); @@ -96,6 +117,7 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, } else { 
space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float); } + auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte)); // get compute queue @@ -112,12 +134,12 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, auto output_size_impl = torch_mlu::getMluTensorImpl(output_size); auto output_size_ptr = output_size_impl->cnnlMalloc(); + uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); CNLOG(INFO) << "Launch Kernel MLUUnionX NMS<<>>"; KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr, - input_num_boxes, input_stride, max_output_boxes, iou_threshold, - offset, workspace_ptr, output_size_ptr, output_ptr); - + input_num_boxes, max_output_boxes, iou_threshold, offset, + workspace_ptr, output_size_ptr, output_ptr); int output_num = *static_cast(output_size.cpu().data_ptr()); return output.slice(0, 0, output_num); }