diff --git a/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu index 58e695a015..0f273d2508 100644 --- a/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu @@ -88,14 +88,14 @@ __mlu_func__ void bboxOverlapsWorkflow( // right - left + offset ---> left __bang_sub(vec_left, vec_right, vec_left, batches_stride); - __bang_add_const(vec_left, vec_left, (T)offset, batches_stride); + __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride); // bottom - top + offset ---> right __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); - __bang_add_const(vec_right, vec_right, (T)offset, batches_stride); + __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride); // zero vector ---> bottom - __nramset(vec_bottom, batches_stride, 0.f); + __bang_write_value(vec_bottom, batches_stride, 0.f); // width --> vec_left __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); @@ -107,11 +107,11 @@ __mlu_func__ void bboxOverlapsWorkflow( // get the b1_area // (b1_x2 - b1_x1 + offset) ---> vec_top __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); - __bang_add_const(vec_top, vec_top, (T)offset, batches_stride); + __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride); // (b1_y2 - b1_y1 + offset) ---> vec_bottom __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); - __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); + __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride); // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) // ---> vec_top; @@ -121,11 +121,11 @@ __mlu_func__ void bboxOverlapsWorkflow( // get the b2_area // (b2_x2 - b2_x1 + offset) ---> b2_x1 __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); - __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); + __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); // (b2_y2 - b2_y1 
+ offset) ---> b2_y1 __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); - __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); + __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) // ---> b2_x1; @@ -137,7 +137,7 @@ __mlu_func__ void bboxOverlapsWorkflow( T *inter_s = height; // offset vector ---> vec_b2_y1 - __nramset(vec_b2_y1, batches_stride, T(offset)); + __bang_write_value(vec_b2_y1, batches_stride, T(offset)); T *vec_offset = vec_b2_y1; if (mode == 0) { @@ -164,10 +164,10 @@ __mlu_func__ void bboxOverlapsWorkflow( int32_t base1 = b1 * COORD_NUM; // set bbox1 and bbox2 to nram - __nramset(vec_b1_x1, batches_stride, bbox1[base1]); - __nramset(vec_b1_y1, batches_stride, bbox1[base1 + 1]); - __nramset(vec_b1_x2, batches_stride, bbox1[base1 + 2]); - __nramset(vec_b1_y2, batches_stride, bbox1[base1 + 3]); + __bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]); + __bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]); + __bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]); + __bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]); for (int32_t j = 0; j < num_loop_cpy; j++) { int32_t index2 = j * batches_stride; @@ -195,13 +195,13 @@ __mlu_func__ void bboxOverlapsWorkflow( // right - left + offset ---> left __bang_sub(vec_left, vec_right, vec_left, batches_stride); - __bang_add_const(vec_left, vec_left, (T)offset, batches_stride); + __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride); // bottom - top + offset ---> right __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); - __bang_add_const(vec_right, vec_right, (T)offset, batches_stride); + __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride); // zero vector ---> bottom - __nramset(vec_bottom, batches_stride, (T)0); + __bang_write_value(vec_bottom, batches_stride, (T)0); // width --> vec_left __bang_maxequal(vec_left, vec_bottom, vec_left, 
batches_stride); @@ -213,10 +213,10 @@ __mlu_func__ void bboxOverlapsWorkflow( // get the b1_area // (b1_x2 - b1_x1 + offset) ---> vec_top __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); - __bang_add_const(vec_top, vec_top, (T)offset, batches_stride); + __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride); // (b1_y2 - b1_y1 + offset) ---> vec_bottom __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); - __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); + __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride); // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) // ---> vec_top; __bang_mul(vec_top, vec_top, vec_bottom, batches_stride); @@ -225,10 +225,10 @@ __mlu_func__ void bboxOverlapsWorkflow( // get the b2_area // (b2_x2 - b2_x1 + offset) ---> b2_x1 __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); - __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); + __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); // (b2_y2 - b2_y1 + offset) ---> b2_y1 __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); - __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); + __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) // ---> b2_x1; __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride); @@ -239,7 +239,7 @@ __mlu_func__ void bboxOverlapsWorkflow( T *inter_s = height; // offset vector ---> vec_b2_y1 - __nramset(vec_b2_y1, batches_stride, T(offset)); + __bang_write_value(vec_b2_y1, batches_stride, T(offset)); T *vec_offset = vec_b2_y1; if (mode == 0) { diff --git a/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu index ac5ea0d653..8dd6a8e582 100644 --- a/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu @@ -139,7 +139,7 @@ __mlu_func__ void carafeForwardBLOCK(T 
*input, T *mask, blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1; // set output_nram to zero - __nramset(output_nram, param.output_nram_size, T(0)); + __bang_write_value(output_nram, param.output_nram_size, T(0)); // loop blocks of kernel window: grid_dim.(Kh, Kw) for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) { @@ -313,8 +313,8 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask, T *sum = sum_array; for (int g = 0; g < blkSize.G; ++g) { - __bang_mul_const(sum, src, mask_array[mask_index], - param.block_Cg_NFU); + __bang_mul_scalar(sum, src, mask_array[mask_index], + param.block_Cg_NFU); // // NOTE: Since block_Cg_NFU >= block_Cg_stride, // overlapped writing may occur on sum_array. @@ -446,8 +446,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output, T *base_grad_input = (T *)grad_input + input_index; __memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T), GDRAM2NRAM); - __bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff, - ((T *)mask_buff)[mask_index], num_align); + __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff, + ((T *)mask_buff)[mask_index], num_align); __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input, (T *)grad_input_buff, num_align); __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff, @@ -485,8 +485,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output, T *base_grad_input = (T *)grad_input + input_index; __memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T), GDRAM2NRAM); - __bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff, - ((T *)mask_buff)[mask_index], rem_for_loop_align); + __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff, + ((T *)mask_buff)[mask_index], rem_for_loop_align); __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input, (T *)grad_input_buff, rem_for_loop); __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff, @@ -541,12 +541,12 @@ void KernelCarafeBackward(cnrtDim3_t k_dim, 
cnrtFunctionType_t k_type, const int wi, const int c, const int k_up, const int group, const int scale) { if (dtype == CNRT_FLOAT16) { - backward::MLUUnion1KernelCarafeBackward - <<>>(input, mask, grad_output, grad_input, - grad_mask, n, hi, wi, c, k_up, group, scale); + backward::MLUUnion1KernelCarafeBackward<<>>( + input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up, + group, scale); } else { - backward::MLUUnion1KernelCarafeBackward - <<>>(input, mask, grad_output, grad_input, - grad_mask, n, hi, wi, c, k_up, group, scale); + backward::MLUUnion1KernelCarafeBackward<<>>( + input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up, + group, scale); } } diff --git a/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp index e59099ae8f..e372515985 100644 --- a/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp +++ b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp @@ -211,51 +211,52 @@ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src, // get sign bit const float move_23bit = 8388608.0; // 0x80000000 = 1,000000000,0000000000000000000000000000 - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x80000000); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000000); __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); // get 1 or 0 from sign bit // judg is Odd - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x00000001); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); __bang_cycle_bor((char *)dst_addition, (char *)dst_addition, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x80000001); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000001); 
__bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); // minus xor, positive num invariant - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0xffffffff); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); __bang_cycle_mul(dst, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float)); // convert int32 to float32 - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x7fffff); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x7fffff); __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x4b000000); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x4b000000); __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); - __bang_sub_const(dst, dst, move_23bit, src_count); + __bang_sub_scalar(dst, dst, move_23bit, src_count); // add one __bang_add(dst, dst, dst_addition, src_count); // set sign for float32 - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0xffffffff); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x00000001); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0x80000000); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / 
sizeof(float), + 0x80000000); __bang_cycle_band((char *)dst_addition, (char *)dst_addition, (char *)src_addition, src_count * 4, 128); __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4); @@ -291,18 +292,20 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, // dst_addition = abs(src) __bang_mul(dst_addition, src, (float *)dst, src_count); // if dst_addition < 1.0 , then src_addition + 1, to fix add error. - __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 1.0f); + __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 1.0f); __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count); - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0xbf800000); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xbf800000); // set negative flag -1.0 = 0xbf80000 __bang_cycle_eq( (float *)dst, (float *)dst, (float *)src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); // to mark all src in [x<-1.0] __bang_active_abs(dst_addition, src, src_count); - __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 8388608.0f); + __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 8388608.0f); // mask shift move 23 __bang_cycle_add_tz( dst_addition, dst_addition, src_addition, src_count, @@ -314,12 +317,12 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, // to fix max value // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0, // means max value. 
- __bang_mul_const((float *)dst, (float *)dst, 16777215.0, src_count); + __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count); __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst, src_count * floatDchar); // get low 23bit - __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - (unsigned)0x007fffff); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + (unsigned)0x007fffff); // mask low 23bit is 1 __bang_cycle_band((char *)dst_addition, (char *)dst_addition, (char *)src_addition, src_count * floatDchar, @@ -327,16 +330,36 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, // set 9 high bit ===> dst // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000 // 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000 - __nramset(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); + __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); // src or dst_addition __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition, src_count * floatDchar); - __bang_mul_const((float *)dst, (float *)dst, -2.0, src_count); + __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count); __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * floatDchar); #endif // __BANG_ARCH__ >= 300 } +/*! + * @brief Converts float32 to half data type, + * the rounding mode on MLU200 is rd, on MLU300 is rn. + * + * @param[out] dst + * Pointer to NRAM that stores half type data. + * @param[in] src + * Pointer to NRAM that stores float32 type data. + * @param[in] src_count + * The count of elements in src. 
+ */ +__mlu_func__ inline void convertFloat2half(half *dst, float *src, + int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_float2half_rn(dst, src, src_count); +#else + __bang_float2half_rd(dst, src, src_count); +#endif +} + #endif // COMMON_MLU_HELPER_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu index 7cb16bb100..fb6185048a 100644 --- a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu @@ -9,14 +9,9 @@ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *************************************************************************/ -#include "common_mlu_helper.hpp" +#include "nms_utils.hpp" -#define NMS_SIZE (64) #define COORD_DIM (4) -#define MEMORY_CORE (0x80) -#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score -#define REDUCE_NUM \ - (7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input) #define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024) #define SIZE_SRAM_BUF (MAX_SRAM_SIZE) @@ -24,348 +19,129 @@ __nram__ int8_t nram_buffer[SIZE_NRAM_BUF]; __mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF]; -__mlu_func__ void pvLock() { -#if __BANG_ARCH__ == 270 - if (coreId != MEMORY_CORE) { - __bang_lock(0, 0); - } -#endif -} - -__mlu_func__ void pvUnlock() { -#if __BANG_ARCH__ == 270 - if (coreId != MEMORY_CORE) { - __bang_unlock(0, 0); - } -#endif -} - enum Addr { SRAM, GDRAM }; template __mlu_func__ void nms_detection( - uint32_t *output_box_num, const int output_mode, const int input_layout, - OUT_DT *output_data, const Addr dst, IN_DT *input_data_score, - const IN_DT *input_data_box, const Addr src, IN_DT *buffer, - const int buffer_size, IN_DT *sram, const int core_limit, - const int input_box_num, const int input_stride, const int output_stride, - const int keepNum, const float thresh_iou, const float thresh_score, + uint32_t &output_box_num, const int 
output_mode, OUT_DT *output_dram, + IN_DT *input_data_score, const IN_DT *input_data_box, const Addr input_ram, + IN_DT *sram, const int core_limit, const int input_num_boxes, + const int max_output_size, const float thresh_iou, const float thresh_score, const float offset, const int algo) { - // global value, it is stored in sram with a offset from the begin. - const int flag_offset_size = 28; - int32_t *loop_end_flag = (int32_t *)(sram + flag_offset_size); - loop_end_flag[0] = 0; + // global value + int32_t *exit_flag = (int32_t *)(sram + 28); + exit_flag[0] = 0; // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 - const int nms_buffer_count1 = 9; + int nms_buffer_count1 = 9; // temp nram buffer to store selected target. - const int nram_save_limit_count = 256; + int nram_save_limit_count = 256; float div_thresh_iou = 1.0 / thresh_iou; // input data ptr - IN_DT *input_score_ptr; - const IN_DT *input_x1_ptr; - const IN_DT *input_y1_ptr; - const IN_DT *input_x2_ptr; - const IN_DT *input_y2_ptr; - input_score_ptr = input_data_score; - input_x1_ptr = input_data_box; - if (input_layout == 0) { - // [boxes_num, 4] - input_y1_ptr = input_x1_ptr + 1; - input_x2_ptr = input_x1_ptr + 2; - input_y2_ptr = input_x1_ptr + 3; - } else if (input_layout == 1) { - // [4, boxes_num] - input_y1_ptr = input_x1_ptr + input_stride; - input_x2_ptr = input_y1_ptr + input_stride; - input_y2_ptr = input_x2_ptr + input_stride; - } - - // nram data ptr - IN_DT *x1; - IN_DT *y1; - IN_DT *x2; - IN_DT *y2; - IN_DT *score; - IN_DT *inter_x1; - IN_DT *inter_y1; - IN_DT *inter_x2; - IN_DT *inter_y2; - IN_DT *max_box; // the max score, x1, y1, x2, y2 - IN_DT *x1_mask; - IN_DT *y1_mask; - IN_DT *x2_mask; - IN_DT *y2_mask; - OUT_DT *nram_save; + const IN_DT *input_x1_ptr = input_data_box; + const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes; + const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes; + const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes; int limit = 0; 
// find limit when GDRAM or SRAM - int len_core = 0; // the length deal by every core int max_seg_pad = 0; // the max length every repeat int repeat = 0; int remain = 0; int remain_pad = 0; int input_offset = 0; // offset of input_data for current core int nram_save_count = 0; - // mask for collect x1, y1, x2, y2. each mask has 128 elements - const int mask_size = 128; - const int total_mask_size = 512; if (output_mode == 0) { - limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - - nram_save_limit_count * sizeof(OUT_DT) - - total_mask_size * sizeof(IN_DT)) / + limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * sizeof(OUT_DT)) / (nms_buffer_count1 * sizeof(IN_DT)); } else { - limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - - nram_save_limit_count * INFO_NUM * sizeof(OUT_DT) - - total_mask_size * sizeof(IN_DT)) / + // 5 maens: score, x1, y1, x2, y2 + limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * 5 * sizeof(OUT_DT)) / (nms_buffer_count1 * sizeof(IN_DT)); } - if (core_limit == 1) { - len_core = input_box_num; - input_offset = 0; - } else { - int avg_core = input_box_num / core_limit; - int rem = input_box_num % core_limit; - len_core = avg_core + (taskId < rem ? 1 : 0); - input_offset = avg_core * taskId + (taskId <= rem ? 
taskId : rem); - } - max_seg_pad = PAD_DOWN(limit, NMS_SIZE); - repeat = len_core / max_seg_pad; - remain = len_core % max_seg_pad; - remain_pad = PAD_UP(remain, NMS_SIZE); + int max_seg_iou_compute = 0; + int repeat_iou_compute = 0; + int remain_iou_compute = 0; + int remain_pad_iou_compute = 0; - // if datatype is half, we should convert it to float when compute the IoU - int max_seg_iou_compute = - PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE); - int repeat_iou_compute = len_core / max_seg_iou_compute; - int remain_iou_compute = len_core % max_seg_iou_compute; - int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE); - // initial the address point - score = buffer; - x1 = score + max_seg_pad; - y1 = x1 + max_seg_pad; - x2 = y1 + max_seg_pad; - y2 = x2 + max_seg_pad; - inter_x1 = y2 + max_seg_pad; - inter_y1 = inter_x1 + max_seg_pad; - inter_x2 = inter_y1 + max_seg_pad; - inter_y2 = inter_x2 + max_seg_pad; - x1_mask = inter_y2 + max_seg_pad; - y1_mask = x1_mask + mask_size; - x2_mask = y1_mask + mask_size; - y2_mask = x2_mask + mask_size; - max_box = y2_mask + mask_size; // the max score, x1, y1, x2, y2 - // offset two line from max_box - nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE); + getComputeParamsBlockOrU1(sizeof(IN_DT), input_num_boxes, limit, core_limit, + input_offset, max_seg_pad, repeat, remain, + remain_pad, max_seg_iou_compute, repeat_iou_compute, + remain_iou_compute, remain_pad_iou_compute); - // set mask for __bang_collect instruction - if (input_layout == 0) { - __nramset((IN_DT *)x1_mask, total_mask_size, (IN_DT)0); - for (int idx = 0; idx < mask_size; idx++) { - int index = (idx % COORD_DIM) * mask_size + idx; - x1_mask[index] = (IN_DT)1.0; - } - } + // init the data ptr + IN_DT *score = (IN_DT *)nram_buffer; + IN_DT *x1 = score + max_seg_pad; + IN_DT *y1 = x1 + max_seg_pad; + IN_DT *x2 = y1 + max_seg_pad; + IN_DT *y2 = x2 + max_seg_pad; + IN_DT *inter_x1 = y2 + max_seg_pad; + IN_DT *inter_y1 = inter_x1 
+ max_seg_pad; + IN_DT *inter_x2 = inter_y1 + max_seg_pad; + IN_DT *inter_y2 = inter_x2 + max_seg_pad; + IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2 + OUT_DT *nram_save = + (OUT_DT *)((char *)max_box + + NFU_ALIGN_SIZE); // offset two line from max_box - for (int keep = 0; keep < keepNum; keep++) { // loop until the max_score <= 0 +#if __BANG_ARCH__ >= 300 + float max_box_x1 = 0; + float max_box_y1 = 0; + float max_box_x2 = 0; + float max_box_y2 = 0; +#endif + mluMemcpyDirection_t load_dir = SRAM2NRAM; + mluMemcpyDirection_t store_dir = NRAM2SRAM; + load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; + store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM; + + for (int keep = 0; keep < max_output_size; + keep++) { // loop until the max_score <= 0 if (core_limit != 1) { __sync_cluster(); // sync before current loop } - /******find max start******/ + /******FIND MAX START******/ int max_index = 0; // the max score index int global_max_index = 0; // for U1 - float max_area = 0; // the max score area + float max_area = 0; // the max socre area max_box[0] = 0; // init 0 - - for (int i = 0; i <= repeat; i++) { - if (i == repeat && remain == 0) { - break; - } - int seg_len = 0; // the length every nms compute - int cpy_len = 0; // the length every nms memcpy - i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad; - // check seg_len exceeds the limit of fp16 or not. 65536 is the largest - // num that half data type could express. - if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) { - // seg length exceeds the max num for fp16 datatype! - return; - } - i == repeat ? 
cpy_len = remain : cpy_len = max_seg_pad; - /******nms load start******/ - mluMemcpyDirection_t load_dir = SRAM2NRAM; - if (src == SRAM) { - load_dir = SRAM2NRAM; - } else { - load_dir = GDRAM2NRAM; - } - __nramset(score, seg_len, (IN_DT)0); - __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - - /******nms load end******/ - - __bang_max(inter_x1, score, seg_len); - if (inter_x1[0] > max_box[0]) { - max_box[0] = inter_x1[0]; - - if (sizeof(IN_DT) == sizeof(half)) { - max_index = ((uint16_t *)inter_x1)[1] + input_offset + - i * max_seg_pad; // offset start from head of input_data - } else if (sizeof(IN_DT) == sizeof(float)) { - max_index = ((uint32_t *)inter_x1)[1] + input_offset + - i * max_seg_pad; // offset start from head of input_data - } - } - } // for repeat - - int stride = 1; - if (input_layout == 0) { - stride = input_stride; - } else if (input_layout == 1) { - stride = 1; - } + findCoreMaxBox(input_data_score, score, inter_x1, max_box, input_x1_ptr, + input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir, + input_offset, repeat, remain, remain_pad, max_seg_pad, + max_index); if (core_limit == 1) { - max_box[1] = input_x1_ptr[max_index * stride]; - max_box[2] = input_y1_ptr[max_index * stride]; - max_box[3] = input_x2_ptr[max_index * stride]; - max_box[4] = input_y2_ptr[max_index * stride]; - if (algo == 0 || offset == 0.0) { - max_area = ((float)max_box[3] - (float)max_box[1]) * - ((float)max_box[4] - (float)max_box[2]); - } else { - max_area = ((float)max_box[3] - (float)max_box[1] + offset) * - ((float)max_box[4] - (float)max_box[2] + offset); - } - input_score_ptr[max_index] = 0; +#if __BANG_ARCH__ >= 300 + calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1, + max_box_x2, max_box_y2); +#else + calMaxArea(max_box, algo, offset, max_area); +#endif + input_data_score[max_index] = 0; global_max_index = max_index; - ((uint32_t *)(max_box 
+ INFO_NUM))[0] = max_index; } else if (core_limit == 4) { - // find the max with sram - // the max box's x1, y1, x2, y2 on every core - if (coreId != MEMORY_CORE) { - max_box[1] = input_x1_ptr[max_index * stride]; - max_box[2] = input_y1_ptr[max_index * stride]; - max_box[3] = input_x2_ptr[max_index * stride]; - max_box[4] = input_y2_ptr[max_index * stride]; - } - ((uint32_t *)(max_box + INFO_NUM))[0] = max_index; - // copy every core's box info to sram, form: score---x1---y1---x2---y2--- - for (int i = 0; i < INFO_NUM; i++) { - __memcpy(sram + i * core_limit + taskId, max_box + i, 1 * sizeof(IN_DT), - NRAM2SRAM); - } - // copy every core's max_index to sram, use 2 half to store max_index - __memcpy(sram + INFO_NUM * core_limit + taskId * 2, max_box + INFO_NUM, - sizeof(uint32_t), - NRAM2SRAM); // int32_t datatype __sync_cluster(); + findClusterMaxBox(sram, max_box, inter_x1, input_data_score, core_limit); - // copy score from sram to nram and find the max - __nramset(inter_x1, NMS_SIZE, (IN_DT)0); - __memcpy(inter_x1, sram, core_limit * sizeof(IN_DT), SRAM2NRAM); - __bang_max(max_box, inter_x1, NMS_SIZE); - int max_core = 0; - if (sizeof(IN_DT) == sizeof(half)) { - max_core = ((uint16_t *)max_box)[1]; - } else if (sizeof(IN_DT) == sizeof(float)) { - max_core = ((uint32_t *)max_box)[1]; - } - - // copy the max box from SRAM to NRAM - __memcpy(max_box + 1, sram + 1 * core_limit + max_core, 1 * sizeof(IN_DT), - SRAM2NRAM); // x1 - __memcpy(max_box + 2, sram + 2 * core_limit + max_core, 1 * sizeof(IN_DT), - SRAM2NRAM); // y1 - __memcpy(max_box + 3, sram + 3 * core_limit + max_core, 1 * sizeof(IN_DT), - SRAM2NRAM); // x2 - __memcpy(max_box + 4, sram + 4 * core_limit + max_core, 1 * sizeof(IN_DT), - SRAM2NRAM); // y2 - __memcpy(max_box + 5, sram + 5 * core_limit + 2 * max_core, - sizeof(uint32_t), SRAM2NRAM); - if (algo == 0 || offset == 0.0) { - max_area = ((float)max_box[3] - (float)max_box[1]) * - ((float)max_box[4] - (float)max_box[2]); - } else { - max_area = 
((float)max_box[3] - (float)max_box[1] + offset) * - ((float)max_box[4] - (float)max_box[2] + offset); - } - global_max_index = ((uint32_t *)(max_box + INFO_NUM))[0]; - input_score_ptr[global_max_index] = 0; +#if __BANG_ARCH__ >= 300 + calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1, + max_box_x2, max_box_y2); +#else + calMaxArea(max_box, algo, offset, max_area); +#endif + global_max_index = ((uint32_t *)(max_box + 5))[0]; + input_data_score[global_max_index] = 0; } // by now, we get: max_score|max_index|max_box|max_area - /******find max end******/ - - /******nms store start******/ - // store to nram - if (float(max_box[0]) > thresh_score) { - OUT_DT *save_ptr; - int save_offset = 0; - int save_str_num = 0; - save_ptr = nram_save; - save_offset = nram_save_count; - save_str_num = nram_save_limit_count; - if (coreId == 0) { - if (output_mode == 0) { // index1, index2, ... - __memcpy(save_ptr + save_offset, (uint32_t *)(max_box + INFO_NUM), - 1 * sizeof(uint32_t), NRAM2NRAM, 1 * sizeof(uint32_t), - 1 * sizeof(uint32_t), 0); - } else if (output_mode == 1) { // score, x1, y1, x2, y2 - __memcpy(save_ptr + save_offset * INFO_NUM, max_box, - INFO_NUM * sizeof(IN_DT), NRAM2NRAM, - INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0); - } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- - __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), - NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), - 4); - } - } - nram_save_count++; - (*output_box_num)++; - } + /******FIND MAX END******/ - // store to sram/gdram - if (*output_box_num != 0) { - mluMemcpyDirection_t store_dir = NRAM2GDRAM; - if (dst == SRAM) { - store_dir = NRAM2SRAM; - } else { // dst == GDRAM - store_dir = NRAM2GDRAM; - } - if ((nram_save_count == nram_save_limit_count) || - (float(max_box[0]) <= thresh_score) || keep == keepNum - 1) { - if (nram_save_count != 0) { - if (coreId == 0) { - if (output_mode == 0) { // index1, index2, ... 
- pvLock(); - __memcpy(output_data, nram_save, - nram_save_count * sizeof(uint32_t), store_dir); - pvUnlock(); - output_data += nram_save_count; - } else if (output_mode == 1) { // score, x1, y1, x2, y2 - pvLock(); - __memcpy(output_data, nram_save, - nram_save_count * INFO_NUM * sizeof(IN_DT), store_dir); - pvUnlock(); - output_data += nram_save_count * INFO_NUM; - } else if (output_mode == - 2) { // score---, x1---, y1---, x2---, y2--- - pvLock(); - __memcpy(output_data, nram_save, nram_save_count * sizeof(IN_DT), - store_dir, output_stride * sizeof(IN_DT), - nram_save_limit_count * sizeof(IN_DT), 4); - pvUnlock(); - output_data += nram_save_count; - } - nram_save_count = 0; - } - } - } // if move data nram->sram/gdram - } // if dst + storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count, + max_output_size, thresh_score, output_mode, nram_save_count, + output_box_num); // if the max score <= 0, end if (core_limit == 1) { @@ -375,190 +151,40 @@ __mlu_func__ void nms_detection( } else { if (float(max_box[0]) <= thresh_score) { if (coreId == 0) { - loop_end_flag[0] = 1; + exit_flag[0] = 1; } } __sync_cluster(); - if (loop_end_flag[0] == 1) { + if (exit_flag[0] == 1) { break; } } - /******nms store end******/ - - // To solve half data accuracy, we convert half to float to calculate IoU. - for (int i = 0; i <= repeat_iou_compute; i++) { - if (i == repeat_iou_compute && remain_iou_compute == 0) { - break; - } - int seg_len = 0; // the length every nms compute - int cpy_len = 0; // the length every nms memcpy - i == repeat_iou_compute ? seg_len = remain_pad_iou_compute - : seg_len = max_seg_iou_compute; - i == repeat_iou_compute ? 
cpy_len = remain_iou_compute - : cpy_len = max_seg_iou_compute; - - /******nms load start******/ - mluMemcpyDirection_t load_dir = SRAM2NRAM; - if (src == SRAM) { - load_dir = SRAM2NRAM; - } else { - load_dir = GDRAM2NRAM; - } - - __nramset((float *)score, seg_len, 0.0f); - int dt_offset = 0; - if (sizeof(IN_DT) == sizeof(float)) { - __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - dt_offset = 0; - } else if (sizeof(IN_DT) == sizeof(half)) { - __nramset(x1, seg_len, half(0)); - __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - __bang_half2float((float *)score, (half *)x1, seg_len); - dt_offset = max_seg_iou_compute; - } - - if (input_layout == 0) { - // the following number 4 means x1, y1, x2, y2 - __memcpy( - inter_x1, - input_x1_ptr + (input_offset + i * max_seg_iou_compute) * COORD_DIM, - cpy_len * COORD_DIM * sizeof(IN_DT), load_dir, - cpy_len * COORD_DIM * sizeof(IN_DT), - cpy_len * COORD_DIM * sizeof(IN_DT), 0); - // here use collect instruction to transpose the [n, 4] shape into [4, - // n] shape to avoid - // discrete memory accessing. 
- for (int c_i = 0; c_i < COORD_DIM * seg_len / mask_size; c_i++) { - // the following number 32 means 32 elements will be selected out by - // once operation - __bang_collect(x1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, - x1_mask, mask_size); - __bang_collect(y1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, - y1_mask, mask_size); - __bang_collect(x2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, - x2_mask, mask_size); - __bang_collect(y2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, - y2_mask, mask_size); - } - } else if (input_layout == 1) { - __memcpy(x1 + dt_offset, - input_x1_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - __memcpy(y1 + dt_offset, - input_y1_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - __memcpy(x2 + dt_offset, - input_x2_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - __memcpy(y2 + dt_offset, - input_y2_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - } - /******nms load end******/ - - /******nms compute start******/ - if (sizeof(IN_DT) == sizeof(half)) { - __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, - seg_len); - } - // 1、 compute IOU - // get the area_I - __nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1 - __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, - seg_len); // inter_x1 - __nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2 - 
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2, - seg_len); // inter_x2 - __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len); - } - __bang_active_relu((float *)inter_x1, (float *)inter_x1, - seg_len); // inter_w - __nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1 - __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2, - seg_len); // inter_y1 - __nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2 - __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, - seg_len); // inter_y2 - __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); - } - __bang_active_relu((float *)inter_y1, (float *)inter_y1, - seg_len); // inter_h - __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, - seg_len); // area_I - // get the area of input_box: area = (x2 - x1) * (y2 - y1); - __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); - __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); - __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len); - } - __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, - seg_len); // area - // get the area_U: area + max_area - area_I - __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area), - seg_len); - __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, - seg_len); // area_U - // 2、 select the box - // if IOU greater than thres, set the score to zero, abort it: area_U > - // area_I * (1 / thresh)? 
- if (thresh_iou > 0.0) { - __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou, - seg_len); - } else { - __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou, - seg_len); - } - __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, - seg_len); - __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); - /******nms compute end******/ - - // update the score - mluMemcpyDirection_t update_dir = NRAM2SRAM; - if (dst == SRAM) { - update_dir = NRAM2SRAM; - } else { - update_dir = NRAM2GDRAM; - } - if (sizeof(IN_DT) == sizeof(half)) { - __bang_float2half_rd((half *)score, (float *)score, seg_len); - } - pvLock(); - __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, - cpy_len * sizeof(IN_DT), update_dir, cpy_len * sizeof(IN_DT), - cpy_len * sizeof(IN_DT), 0); - pvUnlock(); - } // for repeat - } // for keepNum +/******NMS STORE END******/ +#if __BANG_ARCH__ >= 300 + scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr, + input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, + inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box_x1, + max_box_y1, max_box_x2, max_box_y2, nram_save, + repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute, + max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou, + input_offset, offset, max_area, input_num_boxes, algo); +#else + scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr, + input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, + inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box[1], + max_box[2], max_box[3], max_box[4], nram_save, + repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute, + max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou, + input_offset, offset, max_area, input_num_boxes, algo); +#endif + } // for max_output_size } __mlu_global__ void MLUUnion1KernelNMS( const void *input_boxes, const void *input_confidence, - const int input_num_boxes, const int 
input_stride, - const int max_output_size, const float iou_threshold, - const float confidence_threshold, const int mode, const int input_layout, - void *workspace, void *result_num, void *output, + const int input_num_boxes, const int max_output_size, + const float iou_threshold, const float confidence_threshold, + const int output_mode, void *workspace, void *result_num, void *output, const cnrtDataType_t data_type_input, const float offset, const int algo) { if (data_type_input == CNRT_FLOAT16) { __memcpy(workspace, input_confidence, input_num_boxes * sizeof(half), @@ -569,82 +195,48 @@ __mlu_global__ void MLUUnion1KernelNMS( } else { } - int output_stride = max_output_size; - uint32_t result_box_num = 0; - if (mode == 0) { - uint32_t *out_data = (uint32_t *)output; - switch (data_type_input) { - default: { return; } - case CNRT_FLOAT16: { - half *boxes_data = (half *)input_boxes; - half *confi_data = (half *)workspace; - half *buffer = (half *)nram_buffer; - half *sram = (half *)sram_buffer; - - nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, - confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, - sram, taskDim, input_num_boxes, input_stride, - output_stride, max_output_size, iou_threshold, - confidence_threshold, offset, algo); - ((uint32_t *)result_num)[0] = result_box_num; - }; break; - case CNRT_FLOAT32: { - float *boxes_data = (float *)input_boxes; - float *confi_data = (float *)workspace; - float *buffer = (float *)nram_buffer; - float *sram = (float *)sram_buffer; + uint32_t output_box_num = 0; + float *score_data = (float *)workspace; + float *boxes_data = (float *)input_boxes; + float *sram = (float *)sram_buffer; - nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, - confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, - sram, taskDim, input_num_boxes, input_stride, - output_stride, max_output_size, iou_threshold, - confidence_threshold, offset, algo); - ((uint32_t *)result_num)[0] = result_box_num; - }; break; 
+ if (output_mode == 0) { + if (data_type_input == CNRT_FLOAT32) { + nms_detection(output_box_num, output_mode, (uint32_t *)output, score_data, + boxes_data, GDRAM, sram, taskDim, input_num_boxes, + max_output_size, iou_threshold, confidence_threshold, + offset, algo); + } else { + nms_detection(output_box_num, output_mode, (uint32_t *)output, + (half *)score_data, (half *)boxes_data, GDRAM, (half *)sram, + taskDim, input_num_boxes, max_output_size, iou_threshold, + confidence_threshold, offset, algo); } } else { - switch (data_type_input) { - default: { return; } - case CNRT_FLOAT16: { - half *boxes_data = (half *)input_boxes; - half *confi_data = (half *)workspace; - half *out_data = (half *)output; - half *buffer = (half *)nram_buffer; - half *sram = (half *)sram_buffer; - - nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, - confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, - sram, taskDim, input_num_boxes, input_stride, - output_stride, max_output_size, iou_threshold, - confidence_threshold, offset, algo); - ((uint32_t *)result_num)[0] = result_box_num; - }; break; - case CNRT_FLOAT32: { - float *boxes_data = (float *)input_boxes; - float *confi_data = (float *)workspace; - float *out_data = (float *)output; - float *buffer = (float *)nram_buffer; - float *sram = (float *)sram_buffer; - - nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, - confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, - sram, taskDim, input_num_boxes, input_stride, - output_stride, max_output_size, iou_threshold, - confidence_threshold, offset, algo); - ((uint32_t *)result_num)[0] = result_box_num; - }; break; + if (data_type_input == CNRT_FLOAT32) { + nms_detection(output_box_num, output_mode, (float *)output, score_data, + boxes_data, GDRAM, sram, taskDim, input_num_boxes, + max_output_size, iou_threshold, confidence_threshold, + offset, algo); + } else { + nms_detection(output_box_num, output_mode, (half *)output, + (half *)score_data, (half 
*)boxes_data, GDRAM, (half *)sram, + taskDim, input_num_boxes, max_output_size, iou_threshold, + confidence_threshold, offset, algo); } } + ((uint32_t *)result_num)[0] = output_box_num; } template __mlu_func__ void nms_detection_ux( - int32_t *loop_end_flag, uint32_t &output_box_num, OUT_DT *output_dram, + int32_t *exit_flag, uint32_t &output_box_num, OUT_DT *output_dram, IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram, - const int input_layout, const int input_num_boxes, const int input_stride, - const int max_output_size, const float thresh_iou, const float thresh_score, - const float offset, const int output_mode, const int algo) { - loop_end_flag[0] = 0; + const int input_num_boxes, const int max_output_size, + const float thresh_iou, const float thresh_score, const float offset, + const int output_mode, const int algo) { + exit_flag[0] = 0; + IN_DT *sram = (IN_DT *)sram_buffer; // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 @@ -654,16 +246,10 @@ __mlu_func__ void nms_detection_ux( float div_thresh_iou = 1.0 / thresh_iou; // input data ptr - IN_DT *input_score_ptr; - const IN_DT *input_x1_ptr; - const IN_DT *input_y1_ptr; - const IN_DT *input_x2_ptr; - const IN_DT *input_y2_ptr; - input_score_ptr = score_data; - input_x1_ptr = boxes_data; - input_y1_ptr = input_x1_ptr + input_stride; - input_x2_ptr = input_y1_ptr + input_stride; - input_y2_ptr = input_x2_ptr + input_stride; + const IN_DT *input_x1_ptr = boxes_data; + const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes; + const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes; + const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes; int limit = 0; // find limit when GDRAM or SRAM int max_seg_pad = 0; // the max length every repeat @@ -682,41 +268,16 @@ __mlu_func__ void nms_detection_ux( (nms_buffer_count1 * sizeof(IN_DT)); } - // data split - int avg_cluster = input_num_boxes / clusterDim; - int rem_cluster = input_num_boxes % clusterDim; - int len_cluster = 
avg_cluster + (clusterId < rem_cluster ? 1 : 0); - int cluster_offset = avg_cluster * clusterId + - (clusterId <= rem_cluster ? clusterId : rem_cluster); - - int avg_core = len_cluster / coreDim; - int rem_core = len_cluster % coreDim; - int len_core = avg_core + (coreId < rem_core ? 1 : 0); - int core_offset = - avg_core * coreId + (coreId <= rem_core ? coreId : rem_core); - int input_offset = cluster_offset + core_offset; - - max_seg_pad = PAD_DOWN(limit, NMS_SIZE); - - // core 0 of each cluster calculate the max score index - int max_index_avg_core = input_num_boxes / clusterDim; - int max_index_rem_core = input_num_boxes % clusterDim; - int max_index_len_core = - max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0); - int max_index_input_offset = - max_index_avg_core * clusterId + - (clusterId <= max_index_rem_core ? clusterId : max_index_rem_core); - repeat = max_index_len_core / max_seg_pad; - remain = max_index_len_core % max_seg_pad; - remain_pad = PAD_UP(remain, NMS_SIZE); - - // if datatype is fp16, we should cvt to fp32 when compute iou - int max_seg_iou_compute = - PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE); - int repeat_iou_compute = len_core / max_seg_iou_compute; - int remain_iou_compute = len_core % max_seg_iou_compute; - int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE); + int input_offset = 0; + int max_seg_iou_compute = 0; + int repeat_iou_compute = 0; + int remain_iou_compute = 0; + int remain_pad_iou_compute = 0; + getComputeParamsUx(sizeof(IN_DT), input_num_boxes, limit, input_offset, + max_seg_pad, repeat, remain, remain_pad, + max_seg_iou_compute, repeat_iou_compute, + remain_iou_compute, remain_pad_iou_compute); // init the nram ptr IN_DT *score = (IN_DT *)nram_buffer; IN_DT *x1 = score + max_seg_pad; @@ -731,320 +292,94 @@ __mlu_func__ void nms_detection_ux( OUT_DT *nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE); // offset two line from max_box - - mluMemcpyDirection_t 
input_load_dir = SRAM2NRAM; - mluMemcpyDirection_t input_store_dir = NRAM2SRAM; - input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; - input_store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM; +#if __BANG_ARCH__ >= 300 + float max_box_x1 = 0; + float max_box_y1 = 0; + float max_box_x2 = 0; + float max_box_y2 = 0; +#endif + mluMemcpyDirection_t load_dir = SRAM2NRAM; + mluMemcpyDirection_t store_dir = NRAM2SRAM; + load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; + store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM; for (int keep = 0; keep < max_output_size; keep++) { // loop until the max_score <= 0 __sync_all(); - /******FIND MAX START******/ int max_index = 0; int global_max_index = 0; // for Ux float max_area = 0; // the max socre area max_box[0] = 0; // init 0 if (coreId == 0) { - for (int i = 0; i <= repeat; i++) { - if (i == repeat && remain == 0) { - break; - } - - int seg_len = (i == repeat) - ? remain_pad - : max_seg_pad; // the length every nms compute - // check seg_len exceeds the limit of fp16 or not. 65536 is the largest - // num - // that fp16 could express. - if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) { - return; - } - int cpy_len = (i == repeat) - ? 
remain - : max_seg_pad; // the length every nms memcpy - - /******NMS LOAD START******/ - __bang_write_zero(score, seg_len); - __memcpy(score, - input_score_ptr + max_index_input_offset + i * max_seg_pad, - cpy_len * sizeof(IN_DT), input_load_dir, - cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); - - /******NMS LOAD END******/ - - __bang_max(inter_x1, score, seg_len); - if (inter_x1[0] > max_box[0]) { - max_box[0] = inter_x1[0]; - if (sizeof(IN_DT) == sizeof(half)) { - max_index = - ((uint16_t *)inter_x1)[1] + max_index_input_offset + - i * max_seg_pad; // offset start from head of input_data - } else if (sizeof(IN_DT) == sizeof(float)) { - max_index = - ((uint32_t *)inter_x1)[1] + max_index_input_offset + - i * max_seg_pad; // offset start from head of input_data - } - } - } // for repeat - - // the max box's x1, y1, x2, y2 on every cluster - max_box[1] = input_x1_ptr[max_index]; - max_box[2] = input_y1_ptr[max_index]; - max_box[3] = input_x2_ptr[max_index]; - max_box[4] = input_y2_ptr[max_index]; - ((uint32_t *)(max_box + 5))[0] = max_index; + findCoreMaxBox(score_data, score, inter_x1, max_box, input_x1_ptr, + input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir, + input_offset, repeat, remain, remain_pad, max_seg_pad, + max_index); // copy max box info to sram __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); } __sync_all(); - // copy all partial max to the sram of cluster 0 - if (clusterId != 0) { - __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT), - SRAM2SRAM, 0); - } - __sync_all(); - - // reduce between clusters to get the global max box - if (clusterId == 0) { - if (coreId == 0) { - __bang_write_zero(inter_x1, NMS_SIZE); - __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT), - REDUCE_NUM * sizeof(IN_DT), clusterDim - 1); - __bang_max(max_box, inter_x1, NMS_SIZE); - int max_cluster = (sizeof(IN_DT) == sizeof(half)) - ? 
((uint16_t *)max_box)[1] - : ((uint32_t *)max_box)[1]; - __memcpy(max_box, sram + max_cluster * REDUCE_NUM, - REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); - __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); - } - __sync_cluster(); - if (coreId == 0x80 && clusterDim > 1) { - // broadcast global max box to each cluster's sram - for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) { - __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM, - cluster_idx); - } - } - __sync_cluster(); - } - __sync_all(); +#if __BANG_ARCH__ <= 372 + findGlobalMaxBox(max_box, sram, inter_x1); +#endif - // copy the global max box to max_box - __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); - if (algo == 0 || offset == 0.0) { - max_area = ((float)max_box[3] - (float)max_box[1]) * - ((float)max_box[4] - (float)max_box[2]); - } else { - max_area = ((float)max_box[3] - (float)max_box[1] + offset) * - ((float)max_box[4] - (float)max_box[2] + offset); - } +#if __BANG_ARCH__ >= 300 + calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1, + max_box_x2, max_box_y2); +#else + calMaxArea(max_box, algo, offset, max_area); +#endif global_max_index = ((uint32_t *)(max_box + 5))[0]; - if (coreId != 0x80) { - input_score_ptr[global_max_index] = 0; + if (coreId != MEMORY_CORE) { + score_data[global_max_index] = 0; } - // by now, we get: max_score|max_index|max_box|max_area - /******FIND MAX END******/ - /******NMS STORE START******/ - // store to nram - if (float(max_box[0]) > thresh_score) { - OUT_DT *save_ptr; - int save_offset = 0; - int save_str_num = 0; - save_ptr = nram_save; - save_offset = nram_save_count; - save_str_num = nram_save_limit_count; - if (clusterId == 0 && coreId == 0) { - if (output_mode == 0) { // index1, index2, ... 
- save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0]; - } else if (output_mode == 1) { // score, x1, y1, x2, y2 - __memcpy(save_ptr + save_offset * INFO_NUM, max_box, - INFO_NUM * sizeof(IN_DT), NRAM2NRAM, - INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0); - } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- - __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), - NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), - 4); - } - } - nram_save_count++; - output_box_num++; - } - - // store to sram/gdram - if (output_box_num != 0) { - if ((nram_save_count == nram_save_limit_count) || - (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) { - if (nram_save_count != 0) { - if (clusterId == 0 && coreId == 0) { - if (output_mode == 0) { // index1, index2, ... - pvLock(); - __memcpy(output_dram, nram_save, - nram_save_count * sizeof(uint32_t), NRAM2GDRAM); - pvUnlock(); - output_dram += nram_save_count; - } else if (output_mode == 1) { // score, x1, y1, x2, y2 - pvLock(); - __memcpy(output_dram, nram_save, - nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM); - pvUnlock(); - output_dram += nram_save_count * INFO_NUM; - } else if (output_mode == - 2) { // score---, x1---, y1---, x2---, y2--- - pvLock(); - __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT), - NRAM2GDRAM, max_output_size * sizeof(IN_DT), - nram_save_limit_count * sizeof(IN_DT), 4); - pvUnlock(); - output_dram += nram_save_count; - } - nram_save_count = 0; - } - } - } // if move data nram->sram/gdram - } // if dst + storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count, + max_output_size, thresh_score, output_mode, nram_save_count, + output_box_num); if (float(max_box[0]) <= thresh_score) { if (clusterId == 0 && coreId == 0) { - loop_end_flag[0] = 1; // dram + exit_flag[0] = 1; // dram } } __sync_all(); - if (loop_end_flag[0] == 1) { + if (exit_flag[0] == 1) { break; } - /******NMS STORE END******/ - - // 
To solve fp16 accuracy, we convert fp16 to fp32 to calculate IoU. - for (int i = 0; i <= repeat_iou_compute; i++) { - if (i == repeat_iou_compute && remain_iou_compute == 0) { - break; - } - int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute - : max_seg_iou_compute; - int cpy_len = - (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute; - - /******NMS LOAD START******/ - __nramset((float *)score, seg_len, 0.0f); - int dt_offset = 0; - if (sizeof(IN_DT) == sizeof(float)) { - __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, - cpy_len * sizeof(IN_DT), input_load_dir, - cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); - dt_offset = 0; - } else if (sizeof(IN_DT) == sizeof(half)) { - __nramset(x1, seg_len, half(0)); - __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), input_load_dir, - cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); - __bang_half2float((float *)score, (half *)x1, seg_len); - dt_offset = max_seg_iou_compute; - } - - __memcpy(x1 + dt_offset, - input_x1_ptr + input_offset + i * max_seg_iou_compute, - cpy_len * sizeof(IN_DT), input_load_dir, - max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3); - /******NMS LOAD END******/ - - /******NMS COMPUTE START******/ - if (sizeof(IN_DT) == sizeof(half)) { - __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, - seg_len); - __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, - seg_len); - } - // 1、 compute IOU - // get the area_I - __nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1 - __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, - seg_len); // inter_x1 - __nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2 - __bang_minequal((float *)inter_x2, (float *)x2, (float 
*)inter_y2, - seg_len); // inter_x2 - __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len); - } - __bang_active_relu((float *)inter_x1, (float *)inter_x1, - seg_len); // inter_w - __nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1 - __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2, - seg_len); // inter_y1 - __nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2 - __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, - seg_len); // inter_y2 - __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, - seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); - } - __bang_active_relu((float *)inter_y1, (float *)inter_y1, - seg_len); // inter_h - __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, - seg_len); // area_I - // get the area of input_box: area = (x2 - x1) * (y2 - y1); - __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); - __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); - if (algo == 1 && offset != 0.0) { - __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); - __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len); - } - __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, - seg_len); // area - // get the area_U: area + max_area - area_I - __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area), - seg_len); - __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, - seg_len); // area_U - // 2、 select the box - // if IOU greater than thres, set the score to zero, abort it: area_U > - // area_I * (1 / thresh)? 
- if (thresh_iou > 0.0) { - __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou, - seg_len); - } else { - __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou, - seg_len); - } - __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, - seg_len); - __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); - /******NMS COMPUTE END******/ - - if (sizeof(IN_DT) == 2) { - __bang_float2half_rd((half *)score, (float *)score, seg_len); - } - pvLock(); - __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, - cpy_len * sizeof(IN_DT), input_store_dir, - cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); - pvUnlock(); - } // for repeat - } // for max_output_size +/******NMS STORE END******/ +#if __BANG_ARCH__ >= 300 + scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr, + input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1, + inter_y1, inter_x2, inter_y2, max_box, max_box_x1, max_box_y1, + max_box_x2, max_box_y2, nram_save, repeat_iou_compute, + remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute, + max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset, + max_area, input_num_boxes, algo); +#else + scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr, + input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1, + inter_y1, inter_x2, inter_y2, max_box, max_box[1], max_box[2], + max_box[3], max_box[4], nram_save, repeat_iou_compute, + remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute, + max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset, + max_area, input_num_boxes, algo); +#endif + } // for max_output_size } __mlu_global__ void MLUUionXKernelNMS( const void *input_boxes, const void *input_confidence, - const int input_num_boxes, const int input_layout, const int input_stride, - const int max_output_size, const float iou_threshold, - const float confidence_threshold, const float offset, - const cnrtDataType_t 
data_type_input, const int output_mode, const int algo, - void *workspace, void *result_num, void *output) { + const int input_num_boxes, const int max_output_size, + const float iou_threshold, const float confidence_threshold, + const float offset, const cnrtDataType_t data_type_input, + const int output_mode, const int algo, void *workspace, void *result_num, + void *output) { int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2; - int32_t *loop_end_flag = - (int32_t *)((char *)workspace + - INFO_NUM * input_num_boxes * input_dwidth); + int32_t *exit_flag = (int32_t *)((char *)workspace + + INFO_NUM * input_num_boxes * input_dwidth); int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth; int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size; @@ -1062,88 +397,55 @@ __mlu_global__ void MLUUionXKernelNMS( __memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM); } __sync_cluster(); + uint32_t output_box_num = 0; + float *score_data; + float *boxes_data; + score_data = (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; + boxes_data = (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes; + if (output_mode == 0) { - uint32_t *output_dram = (uint32_t *)output; - switch (data_type_input) { - default: { return; } - case CNRT_FLOAT16: { - half *score_data; - half *boxes_data; - score_data = - (input_ram == SRAM) ? (half *)sram_score : (half *)workspace; - boxes_data = - (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; - nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, - boxes_data, input_ram, input_layout, input_num_boxes, - input_stride, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo); - ((uint32_t *)result_num)[0] = output_box_num; - }; break; - case CNRT_FLOAT32: { - float *score_data; - float *boxes_data; - score_data = - (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; - boxes_data = - (input_ram == SRAM) ? 
(float *)sram_boxes : (float *)input_boxes; - nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, - boxes_data, input_ram, input_layout, input_num_boxes, - input_stride, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo); - ((uint32_t *)result_num)[0] = output_box_num; - }; break; + if (data_type_input == CNRT_FLOAT32) { + nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output, + score_data, boxes_data, input_ram, input_num_boxes, + max_output_size, iou_threshold, confidence_threshold, + offset, output_mode, algo); + } else { + nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output, + (half *)score_data, (half *)boxes_data, input_ram, + input_num_boxes, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); } } else { - switch (data_type_input) { - default: { return; } - case CNRT_FLOAT16: { - half *output_dram = (half *)output; - half *score_data; - half *boxes_data; - score_data = - (input_ram == SRAM) ? (half *)sram_score : (half *)workspace; - boxes_data = - (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; - nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, - boxes_data, input_ram, input_layout, input_num_boxes, - input_stride, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo); - ((uint32_t *)result_num)[0] = output_box_num; - }; break; - case CNRT_FLOAT32: { - float *output_dram = (float *)output; - float *score_data; - float *boxes_data; - score_data = - (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; - boxes_data = - (input_ram == SRAM) ? 
(float *)sram_boxes : (float *)input_boxes; - nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, - boxes_data, input_ram, input_layout, input_num_boxes, - input_stride, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo); - ((uint32_t *)result_num)[0] = output_box_num; - }; break; + if (data_type_input == CNRT_FLOAT32) { + nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data, + boxes_data, input_ram, input_num_boxes, max_output_size, + iou_threshold, confidence_threshold, offset, output_mode, + algo); + } else { + nms_detection_ux(exit_flag, output_box_num, (half *)output, + (half *)score_data, (half *)boxes_data, input_ram, + input_num_boxes, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); } } + ((uint32_t *)result_num)[0] = output_box_num; } void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const cnrtDataType_t data_type_input, const void *boxes_ptr, const void *scores_ptr, const int input_num_boxes, - const int input_stride, const int max_output_boxes, - const float iou_threshold, const float offset, - void *workspace_ptr, void *output_size_ptr, void *output_ptr) { + const int max_output_boxes, const float iou_threshold, + const float offset, void *workspace_ptr, void *output_size_ptr, + void *output_ptr) { switch (k_type) { default: { return; } case CNRT_FUNC_TYPE_BLOCK: case CNRT_FUNC_TYPE_UNION1: { MLUUnion1KernelNMS<<>>( - boxes_ptr, scores_ptr, input_num_boxes, input_stride, + (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes, max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, - /*output_mode=*/0, - /*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr, + /*output_mode=*/0, workspace_ptr, output_size_ptr, output_ptr, data_type_input, offset, /*algo=*/1); }; break; case CNRT_FUNC_TYPE_UNION2: @@ -1151,11 +453,10 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, case 
CNRT_FUNC_TYPE_UNION8: case CNRT_FUNC_TYPE_UNION16: { MLUUionXKernelNMS<<<k_dim, k_type, queue>>>( - boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1, - input_stride, max_output_boxes, iou_threshold, - /*confidence_threshold=*/0.0, offset, data_type_input, - /*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr, - output_ptr); + (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes, + max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, offset, + data_type_input, /*output_mode=*/0, /*algo=*/1, workspace_ptr, + output_size_ptr, output_ptr); }; break; } } diff --git a/mmcv/ops/csrc/common/mlu/nms_utils.hpp b/mmcv/ops/csrc/common/mlu/nms_utils.hpp new file mode 100644 index 0000000000..61f5ba95df --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/nms_utils.hpp @@ -0,0 +1,553 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/ +#ifndef NMS_UTILS_HPP_ +#define NMS_UTILS_HPP_ +#include "common_mlu_helper.hpp" + +#define NMS_SIZE (64) +#define NMS_UP(x, y) (x / y + (int)(x % y > 0)) * y +#define NMS_DOWN(x, y) (x / y) * y +#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score +#define MEMORY_CORE (0x80) +#define REDUCE_NUM \ + (7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input) + +__mlu_func__ void pvLock() { +#if __BANG_ARCH__ == 270 + if (coreId != MEMORY_CORE) { + __bang_lock(0, 0); + } +#endif +} + +__mlu_func__ void pvUnlock() { +#if __BANG_ARCH__ == 270 + if (coreId != MEMORY_CORE) { + __bang_unlock(0, 0); + } +#endif +} + +template +static __mlu_func__ void computeReluN(T *nram_dst, T *nram_src, void *nram_tmp, + const int deal_num, + const T threshold = 0) { + if (threshold < 0) { + return; + } + if (threshold) { +#if __BANG_ARCH__ >= 300 + __bang_relun(nram_dst, nram_src, deal_num, threshold); +#else + int align_num = NFU_ALIGN_SIZE / sizeof(T); + T *nram_aux_a = (T *)nram_tmp; + T *nram_aux_b = nram_aux_a + deal_num; + T *nram_zero = nram_aux_b + align_num; + __bang_write_value(nram_aux_b, align_num, threshold); + __bang_write_zero(nram_zero, align_num); + __bang_cycle_lt((T *)nram_aux_a, nram_src, (T *)nram_aux_b, deal_num, + align_num); + __bang_mul(nram_dst, nram_src, (T *)nram_aux_a, deal_num); + __bang_cycle_eq((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_zero, deal_num, + align_num); + __bang_cycle_mul((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_aux_b, + deal_num, align_num); + __bang_add(nram_dst, nram_dst, (T *)nram_aux_a, deal_num); + __bang_cycle_gt((T *)nram_aux_a, nram_dst, (T *)nram_zero, deal_num, + align_num); + __bang_mul(nram_dst, nram_dst, (T *)nram_aux_a, deal_num); +#endif + } else { +#if __BANG_ARCH__ >= 300 + __bang_relu(nram_dst, nram_src, deal_num); +#else + __bang_active_relu(nram_dst, nram_src, deal_num); +#endif + } +} + +__mlu_func__ void 
getComputeParamsBlockOrU1( + const int input_dwidth, const int input_box_num, const int limit, + const int core_limit, int &input_offset, int &max_seg_pad, int &repeat, + int &remain, int &remain_pad, int &max_seg_iou_compute, + int &repeat_iou_compute, int &remain_iou_compute, + int &remain_pad_iou_compute) { + int avg_core = input_box_num / core_limit; + int rem = input_box_num % core_limit; + int len_core = avg_core + (coreId < rem ? 1 : 0); + input_offset = avg_core * coreId + (coreId <= rem ? coreId : rem); + max_seg_pad = NMS_DOWN(limit, NMS_SIZE); + repeat = len_core / max_seg_pad; + remain = len_core % max_seg_pad; + remain_pad = NMS_UP(remain, NMS_SIZE); + + // if datatype is fp16, we should cvt to fp32 when compute iou + max_seg_iou_compute = NMS_DOWN(max_seg_pad / (4 / input_dwidth), NMS_SIZE); + repeat_iou_compute = len_core / max_seg_iou_compute; + remain_iou_compute = len_core % max_seg_iou_compute; + remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE); +} + +__mlu_func__ void getComputeParamsUx( + const int input_dwidth, const int input_num_boxes, const int limit, + int &input_offset, int &max_seg_pad, int &repeat, int &remain, + int &remain_pad, int &max_seg_iou_compute, int &repeat_iou_compute, + int &remain_iou_compute, int &remain_pad_iou_compute) { + // data split + int avg_cluster = input_num_boxes / clusterDim; + int rem_cluster = input_num_boxes % clusterDim; + int len_cluster = avg_cluster + (clusterId < rem_cluster); + int cluster_offset = avg_cluster * clusterId + + (clusterId <= rem_cluster ? clusterId : rem_cluster); + + int avg_core = len_cluster / coreDim; + int rem_core = len_cluster % coreDim; + int len_core = avg_core + (coreId < rem_core); + int core_offset = + avg_core * coreId + (coreId <= rem_core ? 
coreId : rem_core); + input_offset = cluster_offset + core_offset; + + max_seg_pad = NMS_DOWN(limit, NMS_SIZE); + + // core 0 of each cluster calculate the max score index + int max_index_len_core = avg_cluster + (clusterId < rem_cluster); + repeat = max_index_len_core / max_seg_pad; + remain = max_index_len_core % max_seg_pad; + remain_pad = NMS_UP(remain, NMS_SIZE); + // if datatype is fp16, we should cvt to fp32 when compute iou + max_seg_iou_compute = + NMS_DOWN(max_seg_pad / (sizeof(float) / input_dwidth), NMS_SIZE); + repeat_iou_compute = len_core / max_seg_iou_compute; + remain_iou_compute = len_core % max_seg_iou_compute; + remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE); +} + +template +__mlu_func__ void findGlobalMaxBox(IN_DT *max_box, IN_DT *sram, + IN_DT *inter_x1) { + // copy all partial max to the sram of cluster 0 + if (clusterId != 0) { + __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT), + SRAM2SRAM, 0); + } + __sync_all(); + + // reduce between clusters to get the global max box + if (clusterId == 0) { + if (coreId == 0) { + __bang_write_zero(inter_x1, NMS_SIZE); + __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT), + REDUCE_NUM * sizeof(IN_DT), clusterDim - 1); + __bang_max(max_box, inter_x1, NMS_SIZE); + int max_cluster = (sizeof(IN_DT) == sizeof(half)) + ? 
((uint16_t *)max_box)[1] + : ((uint32_t *)max_box)[1]; + __memcpy(max_box, sram + max_cluster * REDUCE_NUM, + REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); + __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); + } + __sync_cluster(); + if (coreId == 0x80 && clusterDim > 1) { + // broadcast global max box to each cluster's sram + for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) { + __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM, + cluster_idx); + } + } + __sync_cluster(); + } + __sync_all(); + + // copy the global max box to max_box + __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); +} + +template +__mlu_func__ void findCoreMaxBox( + IN_DT *input_score_ptr, IN_DT *score, IN_DT *inter_x1, IN_DT *max_box, + const IN_DT *input_x1_ptr, const IN_DT *input_y1_ptr, + const IN_DT *input_x2_ptr, const IN_DT *input_y2_ptr, + const mluMemcpyDirection_t load_dir, const int input_offset, + const int repeat, const int remain, const int remain_pad, + const int max_seg_pad, int &max_index) { + if (coreId != 0x80) { + for (int i = 0; i <= repeat; i++) { + if (i == repeat && remain == 0) { + break; + } + int seg_len = 0; // the length every nms compute + int cpy_len = 0; // the length every nms memcpy + i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad; + i == repeat ? 
cpy_len = remain : cpy_len = max_seg_pad; + /******NMS LOAD START******/ + __bang_write_zero(score, seg_len); + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + + /******NMS LOAD END******/ + + __bang_max(inter_x1, score, seg_len); + if (inter_x1[0] > max_box[0]) { + max_box[0] = inter_x1[0]; + if (sizeof(IN_DT) == sizeof(half)) { + max_index = ((uint16_t *)inter_x1)[1] + input_offset + + i * max_seg_pad; // offset start from head of input_data + } else if (sizeof(IN_DT) == sizeof(float)) { + max_index = ((uint32_t *)inter_x1)[1] + input_offset + + i * max_seg_pad; // offset start from head of input_data + } + } + } // for repeat + // the max box's x1, y1, x2, y2 on every core + max_box[1] = input_x1_ptr[max_index]; + max_box[2] = input_y1_ptr[max_index]; + max_box[3] = input_x2_ptr[max_index]; + max_box[4] = input_y2_ptr[max_index]; + ((uint32_t *)(max_box + 5))[0] = max_index; + } +} + +template +__mlu_func__ void findClusterMaxBox(IN_DT *sram, IN_DT *max_box, + IN_DT *inter_x1, IN_DT *input_data_score, + const int core_limit) { + // find the max with sram + // copy every core's box info to sram, form: score---x1---y1---x2---y2--- + __memcpy(sram + REDUCE_NUM * coreId, max_box, REDUCE_NUM * sizeof(IN_DT), + NRAM2SRAM); // int32_t datatype + __sync_cluster(); + + // copy score from sram to nram and find the max + __bang_write_zero(inter_x1, 64); + __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT), + REDUCE_NUM * sizeof(IN_DT), coreDim - 1); + __bang_max(max_box, inter_x1, 64); + int max_core = sizeof(IN_DT) == sizeof(half) ? 
((uint16_t *)max_box)[1] + : ((uint32_t *)max_box)[1]; + // copy the max box to max_box + __memcpy(max_box, sram + max_core * REDUCE_NUM, REDUCE_NUM * sizeof(IN_DT), + SRAM2NRAM); +} + +/*****************************************************************************/ +/*******************************CALCULATE MAX AREA****************************/ +/*****************************************************************************/ + +template +__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset, + float &max_area) { + if (algo == 0 || offset == 0.0) { + max_area = ((float)max_box[3] - (float)max_box[1]) * + ((float)max_box[4] - (float)max_box[2]); + } else { + max_area = ((float)max_box[3] - (float)max_box[1] + offset) * + ((float)max_box[4] - (float)max_box[2] + offset); + } +} + +template +__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset, + float &max_area, float &max_box_x1, + float &max_box_y1, float &max_box_x2, + float &max_box_y2) { + // the case of random inf will break the requirement of x1<=x2, y1<=y2 + // so exchange it if it happens. 
+ max_box_x1 = float(max_box[1]); + max_box_x2 = float(max_box[3]); + if (max_box[1] > max_box[3]) { + max_box_x1 = float(max_box[3]); + max_box_x2 = float(max_box[1]); + } + max_box_y1 = float(max_box[2]); + max_box_y2 = float(max_box[4]); + if (max_box[2] > max_box[4]) { + max_box_y1 = float(max_box[4]); + max_box_y2 = float(max_box[2]); + } + if (algo == 0 || offset == 0.0) { + max_area = (max_box_x2 - max_box_x1) * (max_box_y2 - max_box_y1); + } else { + max_area = + (max_box_x2 - max_box_x1 + offset) * (max_box_y2 - max_box_y1 + offset); + } +} + +/***********************************************************************/ +/*******************************STORE RESULT****************************/ +/***********************************************************************/ +template +__mlu_func__ void storeResult(IN_DT *max_box, OUT_DT *nram_save, + OUT_DT *&output_dram, const int keep, + const int nram_save_limit_count, + const int max_output_size, + const float thresh_score, const int output_mode, + int &nram_save_count, uint32_t &output_box_num) { + /******NMS STORE START******/ + // store to nram + if (float(max_box[0]) > thresh_score) { + OUT_DT *save_ptr; + int save_offset = 0; + int save_str_num = 0; + save_ptr = nram_save; + save_offset = nram_save_count; + save_str_num = nram_save_limit_count; + if (clusterId == 0 && coreId == 0) { + if (output_mode == 0) { // index1, index2, ... 
+ save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0]; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + __memcpy(save_ptr + save_offset * INFO_NUM, max_box, + INFO_NUM * sizeof(IN_DT), NRAM2NRAM, INFO_NUM * sizeof(IN_DT), + INFO_NUM * sizeof(IN_DT), 0); + } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- + __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), NRAM2NRAM, + save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), 4); + } + } + nram_save_count++; + output_box_num++; + } + + // store to sram/gdram + if (output_box_num != 0) { + if ((nram_save_count == nram_save_limit_count) || + (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) { + if (nram_save_count != 0) { + if (clusterId == 0 && coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + pvLock(); + __memcpy(output_dram, nram_save, nram_save_count * sizeof(uint32_t), + NRAM2GDRAM); + pvUnlock(); + output_dram += nram_save_count; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + pvLock(); + __memcpy(output_dram, nram_save, + nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM); + pvUnlock(); + output_dram += nram_save_count * INFO_NUM; + } else if (output_mode == + 2) { // score---, x1---, y1---, x2---, y2--- + pvLock(); + __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT), + NRAM2GDRAM, max_output_size * sizeof(IN_DT), + nram_save_limit_count * sizeof(IN_DT), 4); + pvUnlock(); + output_dram += nram_save_count; + } + nram_save_count = 0; + } + } + } // if move data nram->sram/gdram + } // if dst +} + +template +__mlu_func__ void scoreUpdate( + IN_DT *input_score_ptr, const mluMemcpyDirection_t load_dir, + const mluMemcpyDirection_t store_dir, const IN_DT *input_x1_ptr, + const IN_DT *input_y1_ptr, const IN_DT *input_x2_ptr, + const IN_DT *input_y2_ptr, IN_DT *x1, IN_DT *y1, IN_DT *x2, IN_DT *y2, + IN_DT *score, IN_DT *inter_x1, IN_DT *inter_y1, IN_DT *inter_x2, + IN_DT *inter_y2, IN_DT *max_box, const float 
max_box_x1, + const float max_box_y1, const float max_box_x2, const float max_box_y2, + OUT_DT *nram_save, int repeat_iou_compute, int remain_iou_compute, + int remain_pad_iou_compute, int max_seg_iou_compute, int max_seg_pad, + const float thresh_iou, const float div_thresh_iou, const int input_offset, + const float offset, const float max_area, const int input_num_boxes, + const int algo) { + for (int i = 0; i <= repeat_iou_compute; i++) { + if (i == repeat_iou_compute && remain_iou_compute == 0) { + break; + } + int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute + : max_seg_iou_compute; + int cpy_len = + (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute; + /******NMS LOAD START******/ + int dt_offset = 0; + if (sizeof(IN_DT) == sizeof(float)) { + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + dt_offset = 0; + } else if (sizeof(IN_DT) == sizeof(half)) { + __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __bang_half2float((float *)score, (half *)x1, seg_len); + dt_offset = max_seg_iou_compute; + } +#if __BANG_ARCH__ >= 300 + __memcpy(inter_x1 + dt_offset, + input_x1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT), + input_num_boxes * sizeof(IN_DT), 3); + + if (sizeof(IN_DT) == sizeof(half)) { + __bang_half2float((float *)inter_x1, + (half *)inter_x1 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)inter_y1, + (half *)inter_y1 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)inter_x2, + (half *)inter_x2 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)inter_y2, + (half *)inter_y2 + max_seg_iou_compute, seg_len); + } + // box transfer + __bang_minequal((float *)x1, (float *)inter_x1, (float *)inter_x2, 
seg_len); + __bang_maxequal((float *)x2, (float *)inter_x1, (float *)inter_x2, seg_len); + __bang_minequal((float *)y1, (float *)inter_y1, (float *)inter_y2, seg_len); + __bang_maxequal((float *)y2, (float *)inter_y1, (float *)inter_y2, seg_len); + // 1、 compute IOU + // get the area_I + __bang_maxeq_scalar((float *)inter_x1, (float *)x1, max_box_x1, + seg_len); // inter_x1 + __bang_mineq_scalar((float *)inter_x2, (float *)x2, max_box_x2, + seg_len); // inter_x2 + __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len); + } + computeReluN((float *)inter_x1, (float *)inter_x1, NULL, + seg_len); // inter_w + __bang_maxeq_scalar((float *)inter_y1, (float *)y1, float(max_box_y1), + seg_len); // inter_y1 + __bang_mineq_scalar((float *)inter_y2, (float *)y2, float(max_box_y2), + seg_len); // inter_y2 + __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len); + } + computeReluN((float *)inter_y1, (float *)inter_y1, NULL, + seg_len); // inter_h + __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, + seg_len); // area_I + // get the area of input_box: area = (x2 - x1) * (y2 - y1); + if (algo == 1 && offset != 0.0) { + __bang_fusion(FUSION_FSA, (float *)inter_y1, (float *)x2, (float *)x1, + offset, seg_len, seg_len); + __bang_fusion(FUSION_FSA, (float *)inter_y2, (float *)y2, (float *)y1, + offset, seg_len, seg_len); + __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, + seg_len); // area + } else { + __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); + __bang_fusion(FUSION_FSM, (float *)inter_x2, (float *)y2, (float *)y1, + (float *)inter_y1, seg_len, seg_len); + } + // get the area_U: area + max_area - area_I + __bang_fusion(FUSION_FAS, (float *)inter_x2, 
(float *)inter_x2, max_area, + (float *)inter_x1, seg_len, seg_len); + // 2、 select the box + // if IOU greater than thres, set the score to zero, abort it: area_U > + // area_I * (1 / thresh)? + if (thresh_iou > 0.0) { + __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou, + seg_len); + } else { + __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou, + seg_len); + } + // process for nan + __bang_lt((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len); + __bang_not((float *)inter_x1, (float *)inter_x1, seg_len); + __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); +/******NMS COMPUTE END******/ +#else + __memcpy(x1 + dt_offset, + input_x1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT), + input_num_boxes * sizeof(IN_DT), 3); + if (sizeof(IN_DT) == sizeof(half)) { + __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, seg_len); + __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, seg_len); + } + // 1、 compute IOU + // get the area_I + __bang_write_value((float *)inter_y1, seg_len, + float(max_box[1])); // max_x1 + __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, + seg_len); // inter_x1 + __bang_write_value((float *)inter_y2, seg_len, + float(max_box[3])); // max_x2 + __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2, + seg_len); // inter_x2 + __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len); + } + computeReluN((float *)inter_x1, (float *)inter_x1, NULL, + seg_len); // inter_w + __bang_write_value((float *)inter_x2, seg_len, + float(max_box[2])); // max_y1 + __bang_maxequal((float 
*)inter_y1, (float *)y1, (float *)inter_x2, + seg_len); // inter_y1 + __bang_write_value((float *)inter_x2, seg_len, + float(max_box[4])); // max_y2 + __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, + seg_len); // inter_y2 + __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len); + } + computeReluN((float *)inter_y1, (float *)inter_y1, NULL, + seg_len); // inter_h + __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, + seg_len); // area_I + // get the area of input_box: area = (x2 - x1) * (y2 - y1); + __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); + __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len); + __bang_add_scalar((float *)inter_y2, (float *)inter_y2, offset, seg_len); + } + __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, + seg_len); // area + // get the area_U: area + max_area - area_I + __bang_add_scalar((float *)inter_x2, (float *)inter_x2, float(max_area), + seg_len); + __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, + seg_len); // area_U + // 2、 select the box + // if IOU greater than thresh, set the score to zero, abort it: area_U > + // area_I * (1 / thresh)? 
+ if (thresh_iou > 0.0) { + __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou, + seg_len); + } else { + __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou, + seg_len); + } + __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len); + __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); +/******NMS COMPUTE END******/ +#endif + // update the score + if (sizeof(IN_DT) == sizeof(half)) { + convertFloat2half((half *)score, (float *)score, seg_len); + } + pvLock(); + __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, + cpy_len * sizeof(IN_DT), store_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + pvUnlock(); + } +} + +#endif // NMS_UTILS_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu index 13b4af19f6..055ee4f4d0 100644 --- a/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu @@ -53,9 +53,8 @@ __mlu_func__ void loadDataFromDramToNram(T *dst, const T *src, int w_seg = position.w_end - position.w_start; int size = h_seg * w_seg * shape_full.c; - __memcpy(dst, - src + position.n_start * n_offset + position.h_start * h_offset + - position.w_start * w_offset, + __memcpy(dst, src + position.n_start * n_offset + + position.h_start * h_offset + position.w_start * w_offset, size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T), n_seg - 1); } @@ -89,7 +88,7 @@ __mlu_func__ void psamaskCollectForward( int elem_count = CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c, NFU_ALIGN_SIZE / sizeof(T)); - __nramset(y_nram, elem_count, (T)0); + __bang_write_value(y_nram, elem_count, (T)0); int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c; int y_h_offset = shape_seg.w * shape_seg.c; @@ -155,7 +154,7 @@ __mlu_func__ void psamaskDistributeForward( CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); int 
elem_count = CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T)); - __nramset(y_nram_temp, elem_count, (T)0); + __bang_write_value(y_nram_temp, elem_count, (T)0); int y_n_offset = align_hw * align_c; int y_h_offset = shape_seg.w * align_c; @@ -242,7 +241,7 @@ __mlu_func__ void psamaskCollectBackward( int elem_count = CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c, NFU_ALIGN_SIZE / sizeof(T)); - __nramset(dx_nram, elem_count, (T)0); + __bang_write_value(dx_nram, elem_count, (T)0); int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c; int dy_h_offset = shape_seg.w * dy_full.c; @@ -331,7 +330,8 @@ __mlu_func__ void psamaskDistributeBackward( // fill zeros to dx T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c; int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c; - __nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0); + __bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), + (T)0); int dy_n_offset_seg = align_hw * align_c; int dy_h_offset_seg = shape_seg.w * align_c; diff --git a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu index f62554d0ef..c99176ab20 100644 --- a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu @@ -130,10 +130,10 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core, __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM); // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4 - __bang_mul_const(tmp_cyc1, tmp_cyc1, w1, align_channel); - __bang_mul_const(tmp_cyc2, tmp_cyc2, w2, align_channel); - __bang_mul_const(tmp_cyc3, tmp_cyc3, w3, align_channel); - __bang_mul_const(tmp_cyc4, tmp_cyc4, w4, align_channel); + __bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel); + __bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel); + __bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel); + 
__bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel); __bang_add(nram_in, tmp_cyc1, nram_in, align_channel); __bang_add(nram_in, tmp_cyc2, nram_in, align_channel); @@ -146,7 +146,7 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core, } // loop_roi_grid_w } // loop_roi_grid_h T count_value = (T)(1.0 / count); - __bang_mul_const(nram_out, nram_out, count_value, align_channel); + __bang_mul_scalar(nram_out, nram_out, count_value, align_channel); __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM); } // loop_cyc_num } @@ -242,8 +242,8 @@ __mlu_global__ void MLUUnion1KernelRoiAlignAvg( case CNRT_FLOAT16: { roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned, channels, pooled_height, pooled_width, input_height, - input_width, sampling_ratio, - (half)spatial_scale, num_rois); + input_width, sampling_ratio, (half)spatial_scale, + num_rois); }; break; case CNRT_FLOAT32: { roialignForwardAvg((float *)input, (float *)rois, (float *)output, @@ -346,31 +346,31 @@ __mlu_func__ void unionRoiAlignBp( &x_high, &y_low, &y_high); if (x_low >= 0 && y_low >= 0) { __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM); - __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w1, - c_align); - __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1, + c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); __bang_atomic_add((T *)buffer + c_align, image_offset + y_low * wo * c + x_low * c, (T *)buffer + c_align, c); - __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w2, - c_align); - __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2, + c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); __bang_atomic_add((T *)buffer + c_align, image_offset 
+ y_low * wo * c + x_high * c, (T *)buffer + c_align, c); - __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w3, - c_align); - __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3, + c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); __bang_atomic_add((T *)buffer + c_align, image_offset + y_high * wo * c + x_low * c, (T *)buffer + c_align, c); - __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w4, - c_align); - __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, - 1 / count, c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4, + c_align); + __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); __bang_atomic_add((T *)buffer + c_align, image_offset + y_high * wo * c + x_high * c, (T *)buffer + c_align, c); @@ -401,34 +401,34 @@ __mlu_func__ void unionRoiAlignBp( } __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T), GDRAM2NRAM); - __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w1, - align_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1, + align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); __bang_atomic_add( (T *)buffer + align_c, image_offset + y_low * wo * c + x_low * c + i * deal_once, (T *)buffer + align_c, deal_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w2, - align_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2, + align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); __bang_atomic_add( (T *)buffer + align_c, image_offset + y_low * wo * c + x_high * c + i * deal_once, (T *)buffer + align_c, deal_c); - 
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w3, - align_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3, + align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); __bang_atomic_add( (T *)buffer + align_c, image_offset + y_high * wo * c + x_low * c + i * deal_once, (T *)buffer + align_c, deal_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w4, - align_c); - __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, - 1 / count, align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4, + align_c); + __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); __bang_atomic_add( (T *)buffer + align_c, image_offset + y_high * wo * c + x_high * c + i * deal_once, diff --git a/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu index 7186cdfac3..3a6d2d3ba6 100644 --- a/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu @@ -204,11 +204,11 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch, } if (is_empty) { - __nramset((T *)nram_out, c_slice_align, (T)0); + __bang_write_value((T *)nram_out, c_slice_align, (T)0); __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, c_slice * t_size, NRAM2GDRAM); if (NULL != argmax) { - __nramset((int32_t *)nram_out, c_slice_align, (int32_t)(-1)); + __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1)); __memcpy((int32_t *)argmax_base + dst_offset + c_offset, (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM); } @@ -238,18 +238,18 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch, for (int i = 0; i < c_slice; i++) { nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim); } - __bang_add_const((float *)nram_a, (float *)nram_out, (float)bin_y1, - 
c_slice_align); - __bang_mul_const((float *)nram_ping, (float *)nram_a, (float)width, - c_slice_align); + __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1, + c_slice_align); + __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width, + c_slice_align); /*compute input_w*/ - __bang_mul_const((float *)nram_a, (float *)nram_out, (float)bin_wdim, - c_slice_align); + __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim, + c_slice_align); __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a, c_slice_align); - __bang_add_const((float *)nram_a, (float *)nram_a, (float)bin_x1, - c_slice_align); + __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1, + c_slice_align); __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a, c_slice_align); convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a, @@ -290,9 +290,7 @@ __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type, rois_num, (float)spatial_scale, (float *)output_data, argmax); }; break; - default: { - break; - } + default: { break; } } } } // namespace forward @@ -328,30 +326,30 @@ __mlu_func__ void convertIndex( align_c); // Perform 'temp_result - hstart' operation - __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart, - align_c); + __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart, + align_c); // Perform 'temp_result1 - temp_result2 * width' operation - __bang_mul_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width, - align_c); + __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width, + align_c); convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, (float *)nram_argmax_fp_w, align_c); // Perform 'temp_result - wstart' operation - __bang_sub_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, wstart, - 
align_c); + __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, + wstart, align_c); // Perform 'temp_result = h * w_compute + w' operation - __bang_mul_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, - w_compute, align_c); + __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + w_compute, align_c); __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_w, align_c); if (loop_flag == 1) { - __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, - (loop_id * true_limit), align_c); + __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + (loop_id * true_limit), align_c); } convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1, (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2, @@ -460,21 +458,22 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads, */ // Load the data from GDRAM to NRAM. - __memcpy((T *)nram_grads + align_c * high_precision, - (const T *)grads + (n * pooled_height * pooled_width + - ph * pooled_width + pw) * - channels, - channels * sizeof(T), GDRAM2NRAM); + __memcpy( + (T *)nram_grads + align_c * high_precision, + (const T *)grads + + (n * pooled_height * pooled_width + ph * pooled_width + pw) * + channels, + channels * sizeof(T), GDRAM2NRAM); if (high_precision) { __bang_half2float((float *)nram_grads, (half *)nram_grads + align_c * high_precision, align_c); } - __memcpy((int32_t *)nram_argmax, - (const int32_t *)argmax + (n * pooled_height * pooled_width + - ph * pooled_width + pw) * - channels, + __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax + + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, channels * sizeof(int32_t), GDRAM2NRAM); // Perform pooling operation on NRAM. @@ -523,20 +522,21 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads, */ // Load the data from GDRAM to NRAM. 
- __memcpy((T *)nram_grads + align_c * high_precision, - (const T *)grads + (n * pooled_height * pooled_width + - ph * pooled_width + pw) * - channels, - channels * sizeof(T), GDRAM2NRAM); + __memcpy( + (T *)nram_grads + align_c * high_precision, + (const T *)grads + + (n * pooled_height * pooled_width + ph * pooled_width + pw) * + channels, + channels * sizeof(T), GDRAM2NRAM); if (high_precision) { __bang_half2float((float *)nram_grads, (half *)nram_grads + align_c * high_precision, align_c); } - __memcpy((int32_t *)nram_argmax, - (const int32_t *)argmax + (n * pooled_height * pooled_width + - ph * pooled_width + pw) * - channels, + __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax + + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, channels * sizeof(int32_t), GDRAM2NRAM); int ping_pong = 0; @@ -713,9 +713,7 @@ __mlu_global__ void MLUKernelRoiPoolBackward( height, width, pooled_height, pooled_width, rois_num, (const float)spatial_scale, high_precision); }; break; - default: { - break; - } + default: { break; } } } } // namespace backward diff --git a/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu index 7cb6df0e5d..ed64c2b68c 100644 --- a/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu @@ -26,7 +26,7 @@ __mlu_func__ void mluMultiKernelTinShift( int t_shift = shifts[n_index * group_size + group_id]; int index = cur_channel_index % channel_size * hw_size + n_index * time_size * channel_size * hw_size; - __nramset(data_nram, MAX_NRAM_SIZE, (char)0); + __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0); __asm__ volatile("sync;"); if (abs(t_shift) >= time_size) { __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, @@ -109,7 +109,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence( int next_sequence_index = index / hw_size / channel_size % time_size + segmentime_size; int cur_sequence_index = index 
/ hw_size / channel_size % time_size; - __nramset(data_nram, MAX_NRAM_SIZE, (char)0); + __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0); __asm__ volatile("sync;"); if (max_number_hw_per_core == 0) { mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index, diff --git a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp index 33c4f7de50..51a3003812 100644 --- a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp +++ b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp @@ -16,9 +16,9 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const cnrtDataType_t data_type_input, const void *boxes_ptr, const void *scores_ptr, const int input_num_boxes, - const int input_stride, const int max_output_boxes, - const float iou_threshold, const float offset, - void *workspace_ptr, void *output_size_ptr, void *output_ptr); + const int max_output_boxes, const float iou_threshold, + const float offset, void *workspace_ptr, void *output_size_ptr, + void *output_ptr); int selectUnionType(uint32_t use_job, int box_num_per_core) { // the box_num_per_core should be at least 256, otherwise the real IO @@ -30,6 +30,46 @@ int selectUnionType(uint32_t use_job, int box_num_per_core) { return use_job; } +static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type, + int &core_num_per_class, + const int input_box_num) { + uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + uint32_t cluster_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); + uint32_t job_limit = cluster_number * core_dim; + uint32_t core_number = job_limit; + + int box_num_per_core = (input_box_num + core_number - 1) / core_number; + int use_job = selectUnionType(job_limit, box_num_per_core); + // initiate k_type as Union1 + k_dim->x = core_dim; + k_dim->y = 1; + k_dim->z = 1; + *k_type = CNRT_FUNC_TYPE_UNION1; + switch (job_limit) { + case CN_KERNEL_CLASS_BLOCK: + case CN_KERNEL_CLASS_UNION: + case CN_KERNEL_CLASS_UNION2: + case 
CN_KERNEL_CLASS_UNION4: + case CN_KERNEL_CLASS_UNION8: + case CN_KERNEL_CLASS_UNION16: { + if (use_job < 4) { + k_dim->x = 1; + *k_type = CNRT_FUNC_TYPE_BLOCK; + } else if (use_job == 4) { + k_dim->x = core_dim; + *k_type = CNRT_FUNC_TYPE_UNION1; + } else { + k_dim->x = use_job; + *k_type = (cnrtFunctionType_t)use_job; + } + }; break; + default: + LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number." + << " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task."; + } + return CNNL_STATUS_SUCCESS; +} + Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, int offset) { // dimension parameters check @@ -53,33 +93,14 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, } int input_num_boxes = boxes.size(0); - int input_stride = boxes.size(0); int max_output_boxes = boxes.size(0); cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype()); cnrtDim3_t k_dim; cnrtJobType_t k_type; - uint32_t union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); - uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); - uint32_t job_limit = union_number * core_dim; - uint32_t core_number = union_number * core_dim; - int box_num_per_core = (input_num_boxes + core_number - 1) / core_number; - // initiate k_type as Union1 - k_dim.x = core_dim; - k_dim.y = 1; - k_dim.z = 1; - k_type = CNRT_FUNC_TYPE_UNION1; - int use_job = selectUnionType(job_limit, box_num_per_core); - if (use_job < 4) { - k_dim.x = 1; - k_type = CNRT_FUNC_TYPE_BLOCK; - } else if (use_job == 4) { - k_dim.x = core_dim; - k_type = CNRT_FUNC_TYPE_UNION1; - } else { - k_dim.x = use_job; - k_type = (cnrtFunctionType_t)use_job; - } + + int core_num_per_class; + policyFunc(&k_dim, &k_type, core_num_per_class, input_num_boxes); // transpose boxes (n, 4) to (4, n) for better performance auto boxes_t = boxes.transpose(0, 1); @@ -96,6 +117,7 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, } else { 
space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float); } + auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte)); // get compute queue @@ -112,12 +134,12 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, auto output_size_impl = torch_mlu::getMluTensorImpl(output_size); auto output_size_ptr = output_size_impl->cnnlMalloc(); + uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); CNLOG(INFO) << "Launch Kernel MLUUnionX NMS<<>>"; KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr, - input_num_boxes, input_stride, max_output_boxes, iou_threshold, - offset, workspace_ptr, output_size_ptr, output_ptr); - + input_num_boxes, max_output_boxes, iou_threshold, offset, + workspace_ptr, output_size_ptr, output_ptr); int output_num = *static_cast(output_size.cpu().data_ptr()); return output.slice(0, 0, output_num); }